Select Git revision
ExecMultiview.py
execution.py 20.60 KiB
import argparse
import numpy as np
import pickle
import time
import os
import errno
import logging
import sklearn
def parseTheArgs(arguments):
"""Used to parse the args entered by the user"""
parser = argparse.ArgumentParser(
description='This file is used to benchmark the scores fo multiple classification algorithm on multiview data.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
groupStandard = parser.add_argument_group('Standard arguments')
groupStandard.add_argument('-log', action='store_true', help='Use option to activate logging to console')
groupStandard.add_argument('--name', metavar='STRING', action='store', help='Name of Database (default: %(default)s)',
default='Plausible')
groupStandard.add_argument('--type', metavar='STRING', action='store',
help='Type of database : .hdf5 or .csv (default: %(default)s)',
default='.hdf5')
groupStandard.add_argument('--views', metavar='STRING', action='store', nargs="+",
help='Name of the views selected for learning (default: %(default)s)',
default=[''])
groupStandard.add_argument('--pathF', metavar='STRING', action='store', help='Path to the hdf5 dataset or database '
'folder (default: %(default)s)',
default='../Data/')
groupStandard.add_argument('--nice', metavar='INT', action='store', type=int,
help='Niceness for the processes', default=0)
groupStandard.add_argument('--randomState', metavar='STRING', action='store',
help="The random state seed to use or the path to a pickle file where it is stored",
default=None)
groupStandard.add_argument('--nbCores', metavar='INT', action='store', help='Number of cores to use for parallel '
'computing, -1 for all',
type=int, default=2)
groupStandard.add_argument('--machine', metavar='STRING', action='store',
help='Type of machine on which the script runs', default="PC")
groupStandard.add_argument('-full', action='store_true', help='Use option to use full dataset and no labels or view filtering')
groupClass = parser.add_argument_group('Classification arguments')
groupClass.add_argument('--CL_multiclassMethod', metavar='STRING', action='store',
help='Determine which multiclass method to use if the dataset is multiclass',
default="oneVersusOne")
groupClass.add_argument('--CL_split', metavar='FLOAT', action='store',
help='Determine the split ratio between learning and validation sets', type=float,
default=0.2)
groupClass.add_argument('--CL_nbFolds', metavar='INT', action='store', help='Number of folds in cross validation',
type=int, default=2)
groupClass.add_argument('--CL_nbClass', metavar='INT', action='store', help='Number of classes, -1 for all', type=int,
default=2)
groupClass.add_argument('--CL_classes', metavar='STRING', action='store', nargs="+",
help='Classes used in the dataset (names of the folders) if not filled, random classes will be '
'selected', default=["yes", "no"])
groupClass.add_argument('--CL_type', metavar='STRING', action='store', nargs="+",
help='Determine whether to use Multiview and/or Monoview, or Benchmark classification',
default=['Benchmark'])
groupClass.add_argument('--CL_algos_monoview', metavar='STRING', action='store', nargs="+",
help='Determine which monoview classifier to use if empty, considering all',
default=[''])
groupClass.add_argument('--CL_algos_multiview', metavar='STRING', action='store', nargs="+",
help='Determine which multiview classifier to use if empty, considering all',
default=[''])
groupClass.add_argument('--CL_statsiter', metavar='INT', action='store',
help="Number of iteration for each algorithm to mean preds on different random states. "
"If using multiple cores, it's highly recommended to use statsiter mod nbCores == 0",
type=int,
default=2)
groupClass.add_argument('--CL_metrics', metavar='STRING', action='store', nargs="+",
help='Determine which metrics to use, separate metric and configuration with ":".'
' If multiple, separate with space. If no metric is specified, '
'considering all'
, default=[''])
groupClass.add_argument('--CL_metric_princ', metavar='STRING', action='store',
help='Determine which metric to use for randomSearch and optimization', default="f1_score")
groupClass.add_argument('--CL_GS_iter', metavar='INT', action='store',
help='Determine how many hyper parameters optimization tests to do', type=int, default=2)
groupClass.add_argument('--CL_HPS_type', metavar='STRING', action='store',
help='Determine which hyperparamter search function use', default="randomizedSearch")
groupRF = parser.add_argument_group('Random Forest arguments')
groupRF.add_argument('--CL_RandomForest_trees', metavar='INT', type=int, action='store', help='Number max trees',
default=25)
groupRF.add_argument('--CL_RandomForest_max_depth', metavar='INT', type=int, action='store',
help='Max depth for the trees',
default=5)
groupRF.add_argument('--CL_RandomForest_criterion', metavar='STRING', action='store', help='Criterion for the trees',
default="entropy")
groupSVMLinear = parser.add_argument_group('Linear SVM arguments')
groupSVMLinear.add_argument('--CL_SVMLinear_C', metavar='INT', type=int, action='store', help='Penalty parameter used',
default=1)
groupSVMRBF = parser.add_argument_group('SVW-RBF arguments')
groupSVMRBF.add_argument('--CL_SVMRBF_C', metavar='INT', type=int, action='store', help='Penalty parameter used',
default=1)
groupSVMPoly = parser.add_argument_group('Poly SVM arguments')
groupSVMPoly.add_argument('--CL_SVMPoly_C', metavar='INT', type=int, action='store', help='Penalty parameter used',
default=1)
groupSVMPoly.add_argument('--CL_SVMPoly_deg', metavar='INT', type=int, action='store', help='Degree parameter used',
default=2)
groupAdaboost = parser.add_argument_group('Adaboost arguments')
groupAdaboost.add_argument('--CL_Adaboost_n_est', metavar='INT', type=int, action='store', help='Number of estimators',
default=2)
groupAdaboost.add_argument('--CL_Adaboost_b_est', metavar='STRING', action='store', help='Estimators',
default='DecisionTreeClassifier')
groupDT = parser.add_argument_group('Decision Trees arguments')
groupDT.add_argument('--CL_DecisionTree_depth', metavar='INT', type=int, action='store',
help='Determine max depth for Decision Trees', default=3)
groupDT.add_argument('--CL_DecisionTree_criterion', metavar='STRING', action='store',
help='Determine max depth for Decision Trees', default="entropy")
groupDT.add_argument('--CL_DecisionTree_splitter', metavar='STRING', action='store',
help='Determine criterion for Decision Trees', default="random")
groupSGD = parser.add_argument_group('SGD arguments')
groupSGD.add_argument('--CL_SGD_alpha', metavar='FLOAT', type=float, action='store',
help='Determine alpha for SGDClassifier', default=0.1)
groupSGD.add_argument('--CL_SGD_loss', metavar='STRING', action='store',
help='Determine loss for SGDClassifier', default='log')
groupSGD.add_argument('--CL_SGD_penalty', metavar='STRING', action='store',
help='Determine penalty for SGDClassifier', default='l2')
groupKNN = parser.add_argument_group('KNN arguments')
groupKNN.add_argument('--CL_KNN_neigh', metavar='INT', type=int, action='store',
help='Determine number of neighbors for KNN', default=1)
groupKNN.add_argument('--CL_KNN_weights', metavar='STRING', action='store',
help='Determine number of neighbors for KNN', default="distance")
groupKNN.add_argument('--CL_KNN_algo', metavar='STRING', action='store',
help='Determine number of neighbors for KNN', default="auto")
groupKNN.add_argument('--CL_KNN_p', metavar='INT', type=int, action='store',
help='Determine number of neighbors for KNN', default=1)
groupSCM = parser.add_argument_group('SCM arguments')
groupSCM.add_argument('--CL_SCM_max_rules', metavar='INT', type=int, action='store',
help='Max number of rules for SCM', default=1)
groupSCM.add_argument('--CL_SCM_p', metavar='FLOAT', type=float, action='store',
help='Max number of rules for SCM', default=1.0)
groupSCM.add_argument('--CL_SCM_model_type', metavar='STRING', action='store',
help='Max number of rules for SCM', default="conjunction")
groupMumbo = parser.add_argument_group('Mumbo arguments')
groupMumbo.add_argument('--MU_types', metavar='STRING', action='store', nargs="+",
help='Determine which monoview classifier to use with Mumbo',
default=[''])
groupMumbo.add_argument('--MU_config', metavar='STRING', action='store', nargs='+',
help='Configuration for the monoview classifier in Mumbo separate each classifier with sapce and each argument with:',
default=[''])
groupMumbo.add_argument('--MU_iter', metavar='INT', action='store', nargs=3,
help='Max number of iteration, min number of iteration, convergence threshold', type=float,
default=[10, 1, 0.01])
groupMumbo.add_argument('--MU_combination', action='store_true',
help='Try all the monoview classifiers combinations for each view',
default=False)
groupFusion = parser.add_argument_group('Fusion arguments')
groupFusion.add_argument('--FU_types', metavar='STRING', action='store', nargs="+",
help='Determine which type of fusion to use',
default=[''])
groupEarlyFusion = parser.add_argument_group('Early Fusion arguments')
groupEarlyFusion.add_argument('--FU_early_methods', metavar='STRING', action='store', nargs="+",
help='Determine which early fusion method of fusion to use',
default=[''])
groupEarlyFusion.add_argument('--FU_E_method_configs', metavar='STRING', action='store', nargs='+',
help='Configuration for the early fusion methods separate '
'method by space and values by :',
default=[''])
groupEarlyFusion.add_argument('--FU_E_cl_config', metavar='STRING', action='store', nargs='+',
help='Configuration for the monoview classifiers used separate classifier by space '
'and configs must be of form argument1_name:value,argument2_name:value',
default=[''])
groupEarlyFusion.add_argument('--FU_E_cl_names', metavar='STRING', action='store', nargs='+',
help='Name of the classifiers used for each early fusion method', default=[''])
groupLateFusion = parser.add_argument_group('Late Fusion arguments')
groupLateFusion.add_argument('--FU_late_methods', metavar='STRING', action='store', nargs="+",
help='Determine which late fusion method of fusion to use',
default=[''])
groupLateFusion.add_argument('--FU_L_method_config', metavar='STRING', action='store', nargs='+',
help='Configuration for the fusion method', default=[''])
groupLateFusion.add_argument('--FU_L_cl_config', metavar='STRING', action='store', nargs='+',
help='Configuration for the monoview classifiers used', default=[''])
groupLateFusion.add_argument('--FU_L_cl_names', metavar='STRING', action='store', nargs="+",
help='Names of the classifier used for late fusion', default=[''])
groupLateFusion.add_argument('--FU_L_select_monoview', metavar='STRING', action='store',
help='Determine which method to use to select the monoview classifiers',
default="intersect")
groupFatLateFusion = parser.add_argument_group('Fat Late Fusion arguments')
groupFatLateFusion.add_argument('--FLF_weights', metavar='FLOAT', action='store', nargs="+",
help='Determine which late fusion method of fusion to use', type=float,
default=[])
args = parser.parse_args(arguments)
return args
def initRandomState(randomStateArg, directory):
"""Used to init a random state and multiple if needed (multicore)"""
if randomStateArg is None:
randomState = np.random.RandomState(randomStateArg)
else:
try:
seed = int(randomStateArg)
randomState = np.random.RandomState(seed)
except ValueError:
fileName = randomStateArg
with open(fileName, 'rb') as handle:
randomState = pickle.load(handle)
with open(directory + "randomState.pickle", "wb") as handle:
pickle.dump(randomState, handle)
return randomState
def initLogFile(args):
"""Used to init the directory where the preds will be stored and the log file"""
resultDirectory = "../Results/" + args.name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"
logFileName = time.strftime("%Y%m%d-%H%M%S") + "-" + ''.join(args.CL_type) + "-" + "_".join(
args.views) + "-" + args.name + "-LOG"
if not os.path.exists(os.path.dirname(resultDirectory + logFileName)):
try:
os.makedirs(os.path.dirname(resultDirectory + logFileName))
except OSError as exc:
if exc.errno != errno.EEXIST:
raise
logFile = resultDirectory + logFileName
if os.path.isfile(logFile + ".log"):
for i in range(1, 20):
testFileName = logFileName + "-" + str(i) + ".log"
if not (os.path.isfile(resultDirectory + testFileName)):
logFile = resultDirectory + testFileName
break
else:
logFile += ".log"
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', filename=logFile, level=logging.DEBUG,
filemode='w')
if args.log:
logging.getLogger().addHandler(logging.StreamHandler())
return resultDirectory
def genSplits(labels, splitRatio, statsIterRandomStates):
"""Used to gen the train/test splits using one or multiple random states"""
indices = np.arange(len(labels))
splits = []
for randomState in statsIterRandomStates:
foldsObj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1,
random_state=randomState,
test_size=splitRatio)
folds = foldsObj.split(indices, labels)
for fold in folds:
train_fold, test_fold = fold
trainIndices = indices[train_fold]
testIndices = indices[test_fold]
splits.append([trainIndices, testIndices])
return splits
def genKFolds(statsIter, nbFolds, statsIterRandomStates):
"""Used to generate folds indices for cross validation and multiple if needed"""
if statsIter > 1:
foldsList = []
for randomState in statsIterRandomStates:
foldsList.append(sklearn.model_selection.StratifiedKFold(n_splits=nbFolds, random_state=randomState))
return foldsList
else:
return [sklearn.model_selection.StratifiedKFold(n_splits=nbFolds, random_state=statsIterRandomStates)]
def initViews(DATASET, args):
"""Used to return the views names that will be used by the algos, their indices and all the views names"""
NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
if args.views != [""]:
allowedViews = args.views
allViews = [str(DATASET.get("View" + str(viewIndex)).attrs["name"])
if type(DATASET.get("View" + str(viewIndex)).attrs["name"])!=bytes
else DATASET.get("View" + str(viewIndex)).attrs["name"].decode("utf-8")
for viewIndex in range(NB_VIEW)]
views = []
viewsIndices = []
for viewIndex in range(NB_VIEW):
viewName = DATASET.get("View" + str(viewIndex)).attrs["name"]
if type(viewName) == bytes:
viewName = viewName.decode("utf-8")
if viewName in allowedViews:
views.append(viewName)
viewsIndices.append(viewsIndices)
return views, viewsIndices, allViews
else:
views = [str(DATASET.get("View" + str(viewIndex)).attrs["name"]) for viewIndex in range(NB_VIEW)]
viewsIndices = np.arange(NB_VIEW)
allViews = views
return views, viewsIndices, allViews
def genDirecortiesNames(directory, statsIter):
"""Used to generate the different directories of each iteration if needed"""
if statsIter > 1:
directories = []
for i in range(statsIter):
directories.append(directory + "iter_" + str(i + 1) + "/")
else:
directories = [directory]
return directories
def genArgumentDictionaries(labelsDictionary, directories, multiclassLabels, labelsCombinations, indicesMulticlass, hyperParamSearch, args,
kFolds, statsIterRandomStates, metrics, argumentDictionaries, benchmark, nbViews, views):
benchmarkArgumentDictionaries = []
for combinationIndex, labelsCombination in enumerate(labelsCombinations):
for iterIndex, iterRandomState in enumerate(statsIterRandomStates):
benchmarkArgumentDictionary = {"LABELS_DICTIONARY": {0:labelsDictionary[labelsCombination[0]],
1:labelsDictionary[labelsCombination[1]]},
"directory": directories[iterIndex]+
labelsDictionary[labelsCombination[0]]+
"-vs-"+
labelsDictionary[labelsCombination[1]]+"/",
"classificationIndices": [indicesMulticlass[combinationIndex][0][iterIndex],
indicesMulticlass[combinationIndex][1][iterIndex],
indicesMulticlass[combinationIndex][2][iterIndex]],
"args": args,
"labels": multiclassLabels[combinationIndex],
"kFolds": kFolds[iterIndex],
"randomState": iterRandomState,
"hyperParamSearch": hyperParamSearch,
"metrics": metrics,
"argumentDictionaries": argumentDictionaries,
"benchmark": benchmark,
"views": views,
"viewsIndices": range(nbViews),
"flag": [iterIndex, labelsCombination]}
benchmarkArgumentDictionaries.append(benchmarkArgumentDictionary)
return benchmarkArgumentDictionaries