diff --git a/docs/source/execution.rst b/docs/source/execution.rst
index 4c20d80fc40f258014afdad9d3f2e20dc1935eea..91c16ad356895de12f3131cb4b747c21d655806c 100644
--- a/docs/source/execution.rst
+++ b/docs/source/execution.rst
@@ -1,5 +1,5 @@
 Welcome tothe exection documentation!
-=============================================
+=====================================
 
 .. automodule:: multiview_platform.Exec
     :members:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 55ceb2cbc6dc9b2f79ee162365101c27f1d833c9..aad5d65fc35aac3a8ca3d0cdda2495b5a51d9c8f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -15,7 +15,7 @@ The main advantage of the platform is that it allows to add and remove a classif
    :caption: Contents:
 
    readme
-.. api
+   api
 
 .. examples
 
diff --git a/docs/source/monomulti/utils/execution.rst b/docs/source/monomulti/utils/execution.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4956fcbe067c9cc4139c206dc72cd7c475b51d0b
--- /dev/null
+++ b/docs/source/monomulti/utils/execution.rst
@@ -0,0 +1,6 @@
+Utils execution module
+======================
+
+.. automodule:: multiview_platform.MonoMultiViewClassifiers.utils.execution
+    :members:
+    :inherited-members:
\ No newline at end of file
diff --git a/docs/source/monomulti/utils/multiclass.rst b/docs/source/monomulti/utils/multiclass.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f7e11aee6c4d459f8c504247bda6ce15432bb53e
--- /dev/null
+++ b/docs/source/monomulti/utils/multiclass.rst
@@ -0,0 +1,6 @@
+Utils Multiclass module
+=======================
+
+.. automodule:: multiview_platform.MonoMultiViewClassifiers.utils.Multiclass
+    :members:
+    :inherited-members:
\ No newline at end of file
diff --git a/docs/source/monomultidoc.rst b/docs/source/monomultidoc.rst
index 0bff743d48380198ca8f287c71189084df27e0a8..822774f04fb64c149824472fd17f7fa7af2404c2 100644
--- a/docs/source/monomultidoc.rst
+++ b/docs/source/monomultidoc.rst
@@ -11,3 +11,5 @@ Mono and mutliview classification
     monomulti/exec_classif
     monomulti/result_analysis
     monomulti/multiview_classifiers/diversity_fusion
+    monomulti/utils/execution
+    monomulti/utils/multiclass
diff --git a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py
index ca34a476f67201173e31b0f76eba6739d8a950f9..41af603b8c10235300f8236a58f8528c4166f3ae 100644
--- a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py
+++ b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py
@@ -27,47 +27,82 @@ __author__ = "Baptiste Bauvin"
 __status__ = "Prototype"  # Production, Development, Prototype
 
 
-def initBenchmark(args):
-    """Used to create a list of all the algorithm packages names used for the benchmark
-    Needs args.CL_type, args.CL_algos_multiview, args.MU_types, args.FU_types, args.FU_late_methods,
-    args.FU_early_methods, args.CL_algos_monoview"""
+def initBenchmark(CL_type, multiviewAlgos, monoviewAlgos, args):
+    r"""Used to create a list of all the algorithm package names used for the benchmark.
+
+    First, this function checks whether the benchmark needs mono- and/or multiview algorithms and adds the requested
+    algorithms to the corresponding dictionary. If the user asks for none, all available algorithms are added.
+
+    If the keyword `"Benchmark"` is used, all mono- and multiview algorithms will be added.
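+
+    A sketch of the expected output structure (classifier names are illustrative)::
+
+        {"Monoview": ["<monoview classifier name>", ...],
+         "Multiview": {"<multiview package name>": <its benchmark arguments>, ...}}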
+
+    Parameters
+    ----------
+    CL_type : list of strings
+        List of the types of benchmark that are needed.
+    multiviewAlgos : list of strings
+        List of the multiview algorithms needed for the benchmark.
+    monoviewAlgos : list of strings
+        List of the monoview algorithms needed for the benchmark.
+    args : parsed ArgumentParser args
+        All the input args (used to tune the algorithms).
+
+    Returns
+    -------
+    benchmark : dictionary of dictionaries
+        Dictionary summarizing which mono- and multiview algorithms will be used in the benchmark.
+    """
    benchmark = {"Monoview": {}, "Multiview": {}}
    allMultiviewPackages = [name for _, name, isPackage
                            in pkgutil.iter_modules(['./MonoMultiViewClassifiers/MultiviewClassifiers/']) if isPackage]
-    if args.CL_type == ["Benchmark"]:
+    if "Monoview" in CL_type:
+        if monoviewAlgos == ['']:
+            benchmark["Monoview"] = [name for _, name, isPackage in pkgutil.iter_modules(["./MonoMultiViewClassifiers/MonoviewClassifiers"])
+                                     if not isPackage]
 
-        allMonoviewAlgos = [name for _, name, isPackage in
-                            pkgutil.iter_modules(['./MonoMultiViewClassifiers/MonoviewClassifiers'])
-                            if (not isPackage) and name not in ["framework"]]
-        benchmark["Monoview"] = allMonoviewAlgos
-        benchmark["Multiview"] = dict((multiviewPackageName, "_") for multiviewPackageName in allMultiviewPackages)
-        for multiviewPackageName in allMultiviewPackages:
-            multiviewPackage = getattr(MultiviewClassifiers, multiviewPackageName)
-            multiviewModule = getattr(multiviewPackage, multiviewPackageName+"Module")
-            benchmark = multiviewModule.getBenchmark(benchmark, args=args)
+        else:
+            benchmark["Monoview"] = monoviewAlgos
 
-    if "Multiview" in args.CL_type:
+    if "Multiview" in CL_type:
         benchmark["Multiview"] = {}
-        if args.CL_algos_multiview == [""]:
+        if multiviewAlgos == [""]:
             algosMutliview = allMultiviewPackages
         else:
-            algosMutliview = args.CL_algos_multiview
+            algosMutliview = multiviewAlgos
         for multiviewPackageName in allMultiviewPackages:
             if multiviewPackageName in algosMutliview:
                 multiviewPackage = getattr(MultiviewClassifiers, multiviewPackageName)
                 multiviewModule = getattr(multiviewPackage, multiviewPackageName+"Module")
                 benchmark = multiviewModule.getBenchmark(benchmark, args=args)
 
-    if "Monoview" in args.CL_type:
-        if args.CL_algos_monoview == ['']:
-            benchmark["Monoview"] = [name for _, name, isPackage in pkgutil.iter_modules(["./MonoMultiViewClassifiers/MonoviewClassifiers"])
-                                     if not isPackage]
-        else:
-            benchmark["Monoview"] = args.CL_algos_monoview
+    if CL_type == ["Benchmark"]:
+        allMonoviewAlgos = [name for _, name, isPackage in
+                            pkgutil.iter_modules(['./MonoMultiViewClassifiers/MonoviewClassifiers'])
+                            if (not isPackage) and name not in ["framework"]]
+        benchmark["Monoview"] = allMonoviewAlgos
+        benchmark["Multiview"] = dict((multiviewPackageName, "_") for multiviewPackageName in allMultiviewPackages)
+        for multiviewPackageName in allMultiviewPackages:
+            multiviewPackage = getattr(MultiviewClassifiers, multiviewPackageName)
+            multiviewModule = getattr(multiviewPackage, multiviewPackageName+"Module")
+            benchmark = multiviewModule.getBenchmark(benchmark, args=args)
+
     return benchmark
 
 
 def genViewsDictionnary(DATASET, views):
+    r"""Used to generate a dictionary mapping a view name (key) to its index in the dataset (value).
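+
+    For instance, with two selected views the returned dictionary may look like ``{"RNASeq": 0, "Clinical": 1}``
+    (view names are illustrative).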
+
+    Parameters
+    ----------
+    DATASET : `h5py` dataset file
+        The full dataset on which the benchmark will be done.
+    views : list of strings
+        Names of the selected views on which the benchmark will be done.
+
+    Returns
+    -------
+    viewsDictionary : dictionary
+        Dictionary mapping the view names to their index in the full dataset.
+    """
    datasetsNames = DATASET.keys()
    viewsDictionary = {}
    for datasetName in datasetsNames:
@@ -81,27 +116,64 @@ def genViewsDictionnary(DATASET, views):
     return viewsDictionary
 
 
-def initMonoviewExps(benchmark, argumentDictionaries, viewsDictionary, NB_CLASS, kwargsInit):
-    """Used to add each monoview exeperience args to the list of monoview experiences args"""
+def initMonoviewExps(benchmark, viewsDictionary, nbClass, kwargsInit):
+    r"""Used to add the args of each monoview experiment to the list of monoview experiment arguments.
+
+    For each view of the dataset and each monoview classifier asked for in the benchmark, this function builds an
+    argument dictionary gathering the classifier's keyword arguments, the view name and index, and the number of
+    classes.
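+
+    Each monoview entry has the form ``{"args": {"<classifier>KWARGS": ..., "feat": <view name>,
+    "CL_type": <classifier name>, "nbClass": nbClass}, "viewIndex": <view index>}`` (placeholders are illustrative).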
+
+    Parameters
+    ----------
+    benchmark : dictionary
+        All the types of monoview and multiview experiments that have to be benchmarked.
+    viewsDictionary : dictionary
+        Maps the view names to their index in the HDF5 dataset.
+    nbClass : integer
+        Number of different labels in the classification.
+    kwargsInit : dictionary
+        The specific keyword arguments of each monoview classifier (one `<classifier>KWARGSInit` entry per
+        classifier).
+
+    Returns
+    -------
+    argumentDictionaries : dictionary of lists
+        Dictionary mapping "Monoview" and "Multiview" to the lists of argument dictionaries of the corresponding
+        experiments.
+    """
+    argumentDictionaries = {"Monoview": [], "Multiview": []}
     if benchmark["Monoview"]:
         argumentDictionaries["Monoview"] = []
         for viewName, viewIndex in viewsDictionary.items():
             for classifier in benchmark["Monoview"]:
                 arguments = {
                     "args": {classifier + "KWARGS": kwargsInit[classifier + "KWARGSInit"], "feat": viewName,
-                             "CL_type": classifier, "nbClass": NB_CLASS}, "viewIndex": viewIndex}
+                             "CL_type": classifier, "nbClass": nbClass}, "viewIndex": viewIndex}
                 argumentDictionaries["Monoview"].append(arguments)
     return argumentDictionaries
 
 
 def initMonoviewKWARGS(args, classifiersNames):
-    """Used to init kwargs thanks to a function in each monoview classifier package"""
+    r"""Used to init the kwargs thanks to a function in each monoview classifier package.
+
+    Parameters
+    ----------
+    args : parsed args objects
+        All the args passed by the user.
+    classifiersNames : list of strings
+        List of the benchmark's monoview classifiers names.
+
+    Returns
+    -------
+    monoviewKWARGS : dictionary of dictionaries
+        Dictionary summarizing all the specific arguments for the benchmark, one dictionary for each classifier.
+
+        For example, for Adaboost, the KWARGS will be ``{"n_estimators": <value>, "base_estimator": <value>}``.
+    """
+
     logging.debug("Start:\t Initializing Monoview classifiers arguments")
     monoviewKWARGS = {}
     for classifiersName in classifiersNames:
         classifierModule = getattr(MonoviewClassifiers, classifiersName)
         monoviewKWARGS[classifiersName + "KWARGSInit"] = classifierModule.getKWARGS(args)
-    # [(key, value) for key, value in vars(args).items() if key.startswith("CL_" + classifiersName)])
     logging.debug("Done:\t Initializing Monoview classifiers arguments")
     return monoviewKWARGS
 
@@ -296,10 +368,47 @@ def execBenchmark(nbCores, statsIter, nbMulticlass, benchmarkArgumentsDictionari
                   directory, multiClassLabels, metrics, labelsDictionary, nbLabels, DATASET,
                   execOneBenchmark=execOneBenchmark, execOneBenchmark_multicore=execOneBenchmark_multicore,
                   execOneBenchmarkMonoCore=execOneBenchmarkMonoCore, getResults=getResults, delete=DB.deleteHDF5):
-    """Used to execute the needed benchmark(s) on multicore or mono-core functions
-    The execOneBenchmark and execOneBenchmark_multicore keywords args are only used in the tests"""
+    r"""Used to execute the needed benchmark(s) on multicore or mono-core functions.
+
+    Parameters
+    ----------
+    nbCores : int
+        Number of threads that the benchmarks can use.
+    statsIter : int
+        Number of statistical iterations that have to be done.
+    nbMulticlass : int
+        Number of label combinations used for the one-versus-one multiclass adaptation.
+    benchmarkArgumentsDictionaries : list of dictionaries
+        All the needed arguments for the benchmarks.
+    classificationIndices : list of lists of numpy.ndarray
+        For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and
+        the ones of the testing set.
+    directories : list of strings
+        List of the paths to the result directories for each statistical iteration.
+    directory : string
+        Path to the main results directory.
+    multiClassLabels : list of numpy.ndarray
+        For each label combination, the 0/1 labels used for the corresponding one-versus-one benchmark.
+    metrics : list of lists
+        Metrics that will be used to evaluate the algorithms' performance.
+    labelsDictionary : dictionary
+        Dictionary mapping labels indices to labels names.
+    nbLabels : int
+        Total number of different labels in the dataset.
+    DATASET : HDF5 dataset file
+        The full dataset that will be used by the benchmark.
+    execOneBenchmark, execOneBenchmark_multicore, execOneBenchmarkMonoCore, getResults, delete : callables
+        Keyword arguments that are only used in the tests (to monkey-patch the benchmark functions).
+
+    Returns
+    -------
+    results : list of lists
+        The results of the benchmark.
+    """
     # TODO : find a way to flag
-    logging.debug("Start:\t Executing all the needed biclass benchmarks")
     results = []
     if nbCores > 1:
@@ -342,8 +451,11 @@ def execClassif(arguments):
     statsIter = args.CL_statsiter
     hyperParamSearch = args.CL_HPS_type
     multiclassMethod = args.CL_multiclassMethod
+    CL_type = args.CL_type
+    monoviewAlgos = args.CL_algos_monoview
+    multiviewAlgos = args.CL_algos_multiview
 
-    directory = execution.initLogFile(args)
+    directory = execution.initLogFile(args.name, args.views, args.CL_type, args.log)
     randomState = execution.initRandomState(args.randomState, directory)
     statsIterRandomStates = execution.initStatsIterRandomStates(statsIter,randomState)
 
@@ -352,21 +464,18 @@ def execClassif(arguments):
     DATASET, LABELS_DICTIONARY = getDatabase(args.views, args.pathF, args.name, args.CL_nbClass,
                                              args.CL_classes, randomState, args.full)
 
-    classificationIndices = execution.genSplits(DATASET.get("Labels").value, args.CL_split, statsIterRandomStates)
+    splits = execution.genSplits(DATASET.get("Labels").value, args.CL_split, statsIterRandomStates)
 
-    multiclassLabels, labelsCombinations, indicesMulticlass = Multiclass.genMulticlassLabels(DATASET.get("Labels").value, multiclassMethod, classificationIndices)
+    multiclassLabels, labelsCombinations, indicesMulticlass = Multiclass.genMulticlassLabels(DATASET.get("Labels").value, multiclassMethod, splits)
 
     kFolds = execution.genKFolds(statsIter, args.CL_nbFolds, statsIterRandomStates)
 
-    datasetFiles = Dataset.initMultipleDatasets(args, nbCores)
+    datasetFiles = Dataset.initMultipleDatasets(args.pathF, args.name, nbCores)
 
     # if not views:
     #     raise ValueError("Empty views list, modify selected views to match dataset " + args.views)
-
-    # nbViews = DATASET.get("Metadata").attrs["nbView"]
-
-    views, viewsIndices, allViews = execution.initViews(DATASET, args)
+    views, viewsIndices, allViews = execution.initViews(DATASET, args.views)
     viewsDictionary = genViewsDictionnary(DATASET, views)
     nbViews = len(views)
     NB_CLASS = DATASET.get("Metadata").attrs["nbClass"]
@@ -381,27 +490,23 @@ def execClassif(arguments):
         if len(metric) == 1:
             metrics[metricIndex] = [metric[0], None]
 
-    # logging.debug("Start:\t Finding all available mono- & multiview algorithms")
-
-    benchmark = initBenchmark(args)
+    benchmark = initBenchmark(CL_type, multiviewAlgos, monoviewAlgos, args)
 
     initKWARGS = initKWARGSFunc(args, benchmark)
 
     dataBaseTime = time.time() - start
 
-    argumentDictionaries = {"Monoview": [], "Multiview": []}
-    argumentDictionaries = initMonoviewExps(benchmark, argumentDictionaries, viewsDictionary, NB_CLASS,
-                                            initKWARGS)
+    argumentDictionaries = initMonoviewExps(benchmark, viewsDictionary, NB_CLASS, initKWARGS)
     directories = execution.genDirecortiesNames(directory, statsIter)
     benchmarkArgumentDictionaries = execution.genArgumentDictionaries(LABELS_DICTIONARY, directories, multiclassLabels,
                                                                       labelsCombinations, indicesMulticlass,
                                                                       hyperParamSearch, args, kFolds,
                                                                       statsIterRandomStates, metrics,
-                                                                      argumentDictionaries, benchmark, nbViews, views, viewsIndices)
-
+                                                                      argumentDictionaries, benchmark, nbViews,
+                                                                      views, viewsIndices)
     nbMulticlass = len(labelsCombinations)
 
-    execBenchmark(nbCores, statsIter, nbMulticlass, benchmarkArgumentDictionaries, classificationIndices, directories,
+    execBenchmark(nbCores, statsIter, nbMulticlass, benchmarkArgumentDictionaries, splits, directories,
                   directory, multiclassLabels, metrics, LABELS_DICTIONARY, NB_CLASS, DATASET)
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/Dataset.py b/multiview_platform/MonoMultiViewClassifiers/utils/Dataset.py
index c2ab4805eed7d0187625ae58d8d782bc1d5c0592..686cdc199aa3db43193d109ef51d3eae7d4a280a 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/Dataset.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/Dataset.py
@@ -72,22 +72,39 @@ def extractSubset(matrix, usedIndices):
         return matrix[usedIndices]
 
 
-def initMultipleDatasets(args, nbCores):
-    """Used to create copies of the dataset if multicore computation is used"""
+def initMultipleDatasets(pathF, name, nbCores):
+    r"""Used to create copies of the dataset if multicore computation is used.
+
+    This is a temporary solution to fix the shared-memory issue with HDF5 datasets.
+
+    Parameters
+    ----------
+    pathF : string
+        Path to the original dataset directory.
+    name : string
+        Name of the dataset.
+    nbCores : int
+        The number of threads that the benchmark can use.
+
+    Returns
+    -------
+    datasetFiles : None or object
+        The output of `DB.copyHDF5` when copies of the dataset are created.
+    """
     if nbCores > 1:
-        if DB.datasetsAlreadyExist(args.pathF, args.name, nbCores):
+        if DB.datasetsAlreadyExist(pathF, name, nbCores):
             logging.debug("Info:\t Enough copies of the dataset are already available")
             pass
         else:
             logging.debug("Start:\t Creating " + str(nbCores) + " temporary datasets for multiprocessing")
             logging.warning(" WARNING : /!\ This may use a lot of HDD storage space : " +
-                            str(os.path.getsize(args.pathF + args.name + ".hdf5") * nbCores / float(
+                            str(os.path.getsize(pathF + name + ".hdf5") * nbCores / float(
                                 1024) / 1000 / 1000) + " Gbytes /!\ ")
             confirmation = confirm()
             if not confirmation:
                 sys.exit(0)
             else:
-                datasetFiles = DB.copyHDF5(args.pathF, args.name, nbCores)
+                datasetFiles = DB.copyHDF5(pathF, name, nbCores)
                 logging.debug("Start:\t Creating datasets for multiprocessing")
                 return datasetFiles
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/Multiclass.py b/multiview_platform/MonoMultiViewClassifiers/utils/Multiclass.py
index 92f5e353026f2e7998eef3ec52afb48d31c14d6f..bcf2e28fd76d5252bc200b53377fea34439df48d 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/Multiclass.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/Multiclass.py
@@ -2,14 +2,50 @@ import numpy as np
 import itertools
 
 
-def genMulticlassLabels(labels, multiclassMethod, classificationIndices):
+def genMulticlassLabels(labels, multiclassMethod, splits):
+    r"""Used to generate the train/test splits and to set up the framework of the adaptation of a multiclass dataset
+    to biclass algorithms.
+
+    First, the function checks whether the dataset is really multiclass.
+
+    Then, it generates all the possible couples of different labels in order to perform one versus one classification.
+
+    For each combination, it selects the examples in the training sets (for each statistical iteration) that have
+    their label in the combination and does the same for the testing set. It also saves the multiclass testing set in
+    order to use multiclass metrics on the decisions.
+
+    Lastly, it creates a new array of biclass labels (0/1) for the biclass classifications used in oneVersusOne.
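+
+    For instance, with three labels the generated combinations are ``(0, 1)``, ``(0, 2)`` and ``(1, 2)``, and one
+    0/1 label vector is built for each of them.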
+
+    Parameters
+    ----------
+    labels : numpy.ndarray
+        The labels of the dataset.
+    multiclassMethod : string
+        The name of the multiclass method used (oneVersusOne, oneVersusAll, ...).
+    splits : list of lists of numpy.ndarray
+        For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and
+        the ones of the testing set.
+
+    Returns
+    -------
+    multiclassLabels : list of numpy.ndarray
+        For each label combination, a 0/1 label array used for the corresponding biclass (one versus one)
+        classification.
+    labelsIndices : list of tuples
+        Each original couple of different labels.
+    indicesMulticlass : list of lists of numpy.ndarray
+        For each combination, for each statistical iteration, the indices for the biclass training set, the ones for
+        the biclass testing set and the ones for the multiclass testing set.
+    """
     if multiclassMethod == "oneVersusOne":
         nbLabels = len(set(list(labels)))
         if nbLabels == 2:
-            classificationIndices = [[trainIndices for trainIndices, _ in classificationIndices],
-                                     [testIndices for _, testIndices in classificationIndices],
-                                     [[] for _ in classificationIndices]]
-            return [labels], [(0,1)], [classificationIndices]
+            splits = [[trainIndices for trainIndices, _ in splits],
+                      [testIndices for _, testIndices in splits],
+                      [[] for _ in splits]]
+            return [labels], [(0,1)], [splits]
         else:
             combinations = itertools.combinations(np.arange(nbLabels), 2)
             multiclassLabels = []
@@ -21,10 +57,10 @@ def genMulticlassLabels(labels, multiclassMethod, classificationIndices):
                                           for exampleIndex, exampleLabel in enumerate(labels)
                                           if exampleLabel in combination]
             trainIndices = [np.array([oldIndex for oldIndex in oldIndices if oldIndex in iterIndices[0]])
-                            for iterIndices in classificationIndices]
+                            for iterIndices in splits]
             testIndices = [np.array([oldIndex for oldIndex in oldIndices if oldIndex in iterindices[1]])
-                           for iterindices in classificationIndices]
-            testIndicesMulticlass = [np.array(iterindices[1]) for iterindices in classificationIndices]
+                           for iterindices in splits]
+            testIndicesMulticlass = [np.array(iterindices[1]) for iterindices in splits]
             indicesMulticlass.append([trainIndices, testIndices, testIndicesMulticlass])
             newLabels = np.zeros(len(labels), dtype=int)-100
             for labelIndex, label in enumerate(labels):
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
index c8f8d6773bdd8405188505d4ebc0d46653ad63b8..6d6bfdd80b419a45748e226c93b7c9ee6a2023d5 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
@@ -222,13 +222,26 @@ def parseTheArgs(arguments):
 
 
 def initRandomState(randomStateArg, directory):
-    """
+    r"""
     Used to init a random state.
-    If no randomState is specified, it will take a 'random' seed.
-    If the arg is a string containing only numbers, it will be converted in an int to gen a seed.
-    If the arg is a string with letters, it must be a path to a pickled random state file that will be loaded.
+    If no random state is specified, it will generate a 'random' seed.
+    If the `randomStateArg` is a string containing only numbers, it will be converted into an int to generate a seed.
+    If the `randomStateArg` is a string with letters, it must be a path to a pickled random state file that will be loaded.
     The function will also pickle the new random state in a file tobe able to retrieve it later.
     Tested
+
+
+    Parameters
+    ----------
+    randomStateArg : None or string
+        See the function description.
+    directory : string
+        Path to the results directory.
+
+    Returns
+    -------
+    randomState : numpy.random.RandomState object
+        This random state will be used all along the benchmark.
     """
     if randomStateArg is None:
         randomState = np.random.RandomState(randomStateArg)
@@ -246,7 +259,22 @@
 
 
 def initStatsIterRandomStates(statsIter, randomState):
-    """Used to init multiple random states if needed because of multiple statsIter"""
+    r"""
+    Used to initialize multiple random states if needed because of multiple statistical iterations of the same
+    benchmark.
+
+    Parameters
+    ----------
+    statsIter : int
+        Number of statistical iterations of the same benchmark done (with a different random state).
+    randomState : numpy.random.RandomState object
+        The random state of the whole experimentation, that will be used to generate the ones for each
+        statistical iteration.
+
+    Returns
+    -------
+    statsIterRandomStates : list of numpy.random.RandomState objects
+        Multiple random states, one for each statistical iteration of the same benchmark.
+    """
     if statsIter > 1:
         statsIterRandomStates = [np.random.RandomState(randomState.randint(5000)) for _ in range(statsIter)]
     else:
@@ -255,7 +283,20 @@
 
 
 def getDatabaseFunction(name, type):
-    """Used to get the right databes extraction function according to the type of and it's name"""
+    r"""Used to get the right database extraction function according to the type of the database and its name.
+
+    Parameters
+    ----------
+    name : string
+        Name of the database.
+    type : string
+        Type of the dataset file (".hdf5" or ".csv").
+
+    Returns
+    -------
+    getDatabase : function
+        The function that will be used to extract the database.
+    """
     if name not in ["Fake", "Plausible"]:
         getDatabase = getattr(DB, "getClassicDB" + type[1:])
     else:
@@ -263,11 +304,32 @@
 
 
-def initLogFile(args):
-    """Used to init the directory where the preds will be stored and the log file"""
-    resultDirectory = "../Results/" + args.name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"
-    logFileName = time.strftime("%Y_%m_%d-%H_%M") + "-" + ''.join(args.CL_type) + "-" + "_".join(
-        args.views) + "-" + args.name + "-LOG"
+def initLogFile(name, views, CL_type, log):
+    r"""Used to init the directory where the preds will be stored and the log file.
+
+    First this function will check if the result directory already exists (only one per minute is allowed).
+
+    If the result directory name is available, it is created, and the log file is initiated.
+
+    Parameters
+    ----------
+    name : string
+        Name of the database.
+    views : list of strings
+        List of the view names that will be used in the benchmark.
+    CL_type : list of strings
+        Type of benchmark that will be made.
+    log : bool
+        Whether to also show the log in the console or only write it to the log file.
+
+    Returns
+    -------
+    resultDirectory : string
+        Path to the main results directory for the benchmark.
+    """
+    resultDirectory = "../Results/" + name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"
+    logFileName = time.strftime("%Y_%m_%d-%H_%M") + "-" + ''.join(CL_type) + "-" + "_".join(
+        views) + "-" + name + "-LOG"
     if os.path.exists(os.path.dirname(resultDirectory)):
         raise NameError("The result dir already exists, wait 1 min and retry")
     os.makedirs(os.path.dirname(resultDirectory + logFileName))
@@ -275,15 +337,30 @@
     logFile += ".log"
     logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', filename=logFile, level=logging.DEBUG,
                         filemode='w')
-    if args.log:
+    if log:
         logging.getLogger().addHandler(logging.StreamHandler())
     return resultDirectory
 
 
 def genSplits(labels, splitRatio, statsIterRandomStates):
-    """Used to gen the train/test splits using one or multiple random states
-    classificationIndices is a list of train/test splits"""
+    r"""Used to generate the train/test splits using one or multiple random states.
+
+    Parameters
+    ----------
+    labels : numpy.ndarray
+        The labels of the dataset.
+    splitRatio : float
+        The ratio used to split the examples between the training and the testing sets.
+    statsIterRandomStates : list of numpy.random.RandomState
+        The random states for each statistical iteration.
+
+    Returns
+    -------
+    splits : list of lists of numpy.ndarray
+        For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and
+        the ones of the testing set.
+    """
     indices = np.arange(len(labels))
     splits = []
     for randomState in statsIterRandomStates:
@@ -301,21 +378,53 @@
 
 
 def genKFolds(statsIter, nbFolds, statsIterRandomStates):
-    """Used to generate folds indices for cross validation and multiple if needed"""
+    r"""Used to generate the fold indices of the cross-validation for each statistical iteration.
+
+    Parameters
+    ----------
+    statsIter : integer
+        Number of statistical iterations of the benchmark.
+    nbFolds : integer
+        The number of cross-validation folds for the benchmark.
+    statsIterRandomStates : list of numpy.random.RandomState
+        The random states for each statistical iteration.
+
+    Returns
+    -------
+    foldsList : list of sklearn.model_selection.StratifiedKFold
+        For each statistical iteration, a stratified k-fold splitter (keeping the ratio between classes in each fold).
+    """
     if statsIter > 1:
         foldsList = []
         for randomState in statsIterRandomStates:
             foldsList.append(sklearn.model_selection.StratifiedKFold(n_splits=nbFolds, random_state=randomState))
-        return foldsList
     else:
-        return [sklearn.model_selection.StratifiedKFold(n_splits=nbFolds, random_state=statsIterRandomStates)]
-
-
-def initViews(DATASET, args):
-    """Used to return the views names that will be used by the algos, their indices and all the views names"""
+        foldsList = [sklearn.model_selection.StratifiedKFold(n_splits=nbFolds, random_state=statsIterRandomStates)]
+    return foldsList
+
+
+def initViews(DATASET, argViews):
+    r"""Used to return the views names that will be used by the benchmark, their indices and all the views names.
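+
+    For instance, asking for ``["Methyl", "MiRNA"]`` on a dataset whose views are ``["Methyl", "MiRNA", "RNASeq"]``
+    would return ``(["Methyl", "MiRNA"], [0, 1], ["Methyl", "MiRNA", "RNASeq"])`` (view names illustrative).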
+
+    Parameters
+    ----------
+    DATASET : HDF5 dataset file
+        The full dataset that will be used by the benchmark.
+    argViews : list of strings
+        The views asked for by the user (command-line argument).
+
+    Returns
+    -------
+    views : list of strings
+        Names of the views that will be used by the benchmark.
+    viewsIndices : list of ints
+        The list of the indices of the views that will be used in the benchmark (according to the dataset).
+    allViews : list of strings
+        Names of all the available views in the dataset.
+    """
     NB_VIEW = DATASET.get("Metadata").attrs["nbView"]
-    if args.views != [""]:
-        allowedViews = args.views
+    if argViews != [""]:
+        allowedViews = argViews
         allViews = [str(DATASET.get("View" + str(viewIndex)).attrs["name"])
                     if type(DATASET.get("View" + str(viewIndex)).attrs["name"])!=bytes
                     else DATASET.get("View" + str(viewIndex)).attrs["name"].decode("utf-8")
@@ -329,19 +438,31 @@ def initViews(DATASET, args):
             if viewName in allowedViews:
                 views.append(viewName)
                 viewsIndices.append(viewIndex)
-        return views, viewsIndices, allViews
     else:
         views = [str(DATASET.get("View" + str(viewIndex)).attrs["name"])
                  if type(DATASET.get("View" + str(viewIndex)).attrs["name"])!=bytes
                  else DATASET.get("View" + str(viewIndex)).attrs["name"].decode("utf-8")
                  for viewIndex in range(NB_VIEW)]
-        viewsIndices = np.arange(NB_VIEW)
+        viewsIndices = range(NB_VIEW)
         allViews = views
-        return views, viewsIndices, allViews
+    return views, viewsIndices, allViews
 
 
 def genDirecortiesNames(directory, statsIter):
-    """Used to generate the different directories of each iteration if needed"""
+    r"""Used to generate the different directories of each iteration if needed.
+
+    Parameters
+    ----------
+    directory : string
+        Path to the results directory.
+    statsIter : int
+        The number of statistical iterations.
+
+    Returns
+    -------
+    directories : list of strings
+        Paths to each statistical iteration's result directory.
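+
+        For instance, with ``directory="Results/"`` and ``statsIter=3``, three paths such as ``Results/iter_1/``,
+        ``Results/iter_2/`` and ``Results/iter_3/`` are returned (exact naming is set in the function body).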
+    """
     if statsIter > 1:
         directories = []
         for i in range(statsIter):
@@ -351,8 +472,55 @@
 
 
-def genArgumentDictionaries(labelsDictionary, directories, multiclassLabels, labelsCombinations, indicesMulticlass, hyperParamSearch, args,
-                            kFolds, statsIterRandomStates, metrics, argumentDictionaries, benchmark, nbViews, views, viewsIndices):
+def genArgumentDictionaries(labelsDictionary, directories, multiclassLabels, labelsCombinations, indicesMulticlass,
+                            hyperParamSearch, args, kFolds, statsIterRandomStates, metrics, argumentDictionaries,
+                            benchmark, nbViews, views, viewsIndices):
+    r"""Used to generate a dictionary for each benchmark.
+
+    For each label combination (if multiclass) and each statistical iteration, a dictionary gathering all the
+    necessary information to perform the benchmark is generated.
+
+    Parameters
+    ----------
+    labelsDictionary : dictionary
+        Dictionary mapping labels indices to labels names.
+    directories : list of strings
+        List of the paths to the result directories for each statistical iteration.
+    multiclassLabels : list of numpy.ndarray
+        For each label combination, a 0/1 label array used for the corresponding biclass (one versus one)
+        classification.
+    labelsCombinations : list of tuples
+        Each original couple of different labels.
+    indicesMulticlass : list of lists of numpy.ndarray
+        For each combination, for each statistical iteration, the indices for the biclass training set, the ones for
+        the biclass testing set and the ones for the multiclass testing set.
+    hyperParamSearch : string
+        Type of hyper-parameter optimization method.
+    args : parsed args objects
+        All the args passed by the user.
+    kFolds : list of sklearn.model_selection.StratifiedKFold
+        For each statistical iteration, a stratified k-fold splitter (keeping the ratio between classes in each fold).
+    statsIterRandomStates : list of numpy.random.RandomState objects
+        Multiple random states, one for each statistical iteration of the same benchmark.
+    metrics : list of lists
+        Metrics that will be used to evaluate the algorithms' performance.
+    argumentDictionaries : dictionary of lists
+        Dictionary mapping "Monoview" and "Multiview" to the lists of argument dictionaries of the corresponding
+        experiments, as returned by `initMonoviewExps`.
+    benchmark : dictionary
+        Dictionary summarizing which mono- and multiview algorithms will be used in the benchmark.
+    nbViews : int
+        The number of views used by the benchmark.
+    views : list of strings
+        List of the names of the used views.
+    viewsIndices : list of ints
+        List of indices (according to the dataset) of the used views.
+
+    Returns
+    -------
+    benchmarkArgumentDictionaries : list of dicts
+        All the needed arguments for the benchmarks.
+
+    """
     benchmarkArgumentDictionaries = []
     for combinationIndex, labelsCombination in enumerate(labelsCombinations):
         for iterIndex, iterRandomState in enumerate(statsIterRandomStates):
diff --git a/multiview_platform/Tests/test_ExecClassif.py b/multiview_platform/Tests/test_ExecClassif.py
index d5ec2a28aafb6771e8892246884bdb7bb76b7d6f..7701c24c0549717ca85f63b99a1fb66c52c87aab 100644
--- a/multiview_platform/Tests/test_ExecClassif.py
+++ b/multiview_platform/Tests/test_ExecClassif.py
@@ -29,12 +29,12 @@ class Test_initMonoviewArguments(unittest.TestCase):
 
     def test_initMonoviewArguments_no_monoview(self):
         benchmark = {"Monoview":{}, "Multiview":{}}
-        arguments = ExecClassif.initMonoviewExps(benchmark, {}, {}, 0, {})
-        self.assertEqual(arguments, {})
+        arguments = ExecClassif.initMonoviewExps(benchmark, {}, 0, {})
+        self.assertEqual(arguments, {'Monoview':[], 'Multiview':[]})
 
     def test_initMonoviewArguments_empty(self):
         benchmark = {"Monoview":{}, "Multiview":{}}
-        arguments = ExecClassif.initMonoviewExps(benchmark, {}, {}, 0, {})
+        arguments = ExecClassif.initMonoviewExps(benchmark, {}, 0, {})
 
 
 def fakeBenchmarkExec(coreIndex=-1, a=7, args=1):