From afecc2a11f00d20ae82d99c64a8fa9c7b36e5567 Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr>
Date: Thu, 5 Sep 2019 16:06:01 -0400
Subject: [PATCH] Added possibility of multiple datasets in series

---
 .../MonoMultiViewClassifiers/ExecClassif.py   | 345 ++++--------------
 .../Monoview/Additions/BoostUtils.py          |   3 +
 .../Monoview/Additions/MinCQUtils.py          |   6 +-
 .../MonoviewClassifiers/AdaboostPregen.py     |   2 +
 .../MonoviewClassifiers/MinCQ.py              |   8 +-
 .../utils/execution.py                        |  16 +-
 6 files changed, 100 insertions(+), 280 deletions(-)

diff --git a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py
index bcc03aed..d77fec48 100644
--- a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py
+++ b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py
@@ -592,279 +592,78 @@ def execClassif(arguments):
     CL_type = args.CL_type
     monoviewAlgos = args.CL_algos_monoview
     multiviewAlgos = args.CL_algos_multiview
+    dataset_list = execution.find_dataset_names(args.pathF, args.type, args.name)
 
-    directory = execution.initLogFile(args.name, args.views, args.CL_type,
-                                      args.log, args.debug, args.label,
-                                      args.res_dir)
-    randomState = execution.initRandomState(args.randomState, directory)
-    statsIterRandomStates = execution.initStatsIterRandomStates(statsIter,
-                                                                randomState)
+    for name in dataset_list:
 
-    getDatabase = execution.getDatabaseFunction(args.name, args.type)
+        directory = execution.initLogFile(name, args.views, args.CL_type,
+                                          args.log, args.debug, args.label,
+                                          args.res_dir)
+        randomState = execution.initRandomState(args.randomState, directory)
+        statsIterRandomStates = execution.initStatsIterRandomStates(statsIter,
+                                                                    randomState)
 
-    DATASET, LABELS_DICTIONARY, datasetname = getDatabase(args.views,
-                                                          args.pathF, args.name,
-                                                          args.CL_nbClass,
-                                                          args.CL_classes,
-                                                          randomState,
-                                                          args.full,
-                                                          args.add_noise,
-                                                          args.noise_std)
-    args.name = datasetname
-
-    splits = execution.genSplits(DATASET.get("Labels").value, args.CL_split,
-                                 statsIterRandomStates)
-
-    multiclassLabels, labelsCombinations, indicesMulticlass = Multiclass.genMulticlassLabels(
-        DATASET.get("Labels").value, multiclassMethod, splits)
-
-    kFolds = execution.genKFolds(statsIter, args.CL_nbFolds,
-                                 statsIterRandomStates)
-
-    datasetFiles = Dataset.initMultipleDatasets(args.pathF, args.name, nbCores)
-
-    # if not views:
-    #     raise ValueError("Empty views list, modify selected views to match dataset " + args.views)
-
-    views, viewsIndices, allViews = execution.initViews(DATASET, args.views)
-    viewsDictionary = genViewsDictionnary(DATASET, views)
-    nbViews = len(views)
-    NB_CLASS = DATASET.get("Metadata").attrs["nbClass"]
-
-    metrics = [metric.split(":") for metric in args.CL_metrics]
-    if metrics == [[""]]:
-        metricsNames = [name for _, name, isPackage
-                        in pkgutil.iter_modules(
-                ['./MonoMultiViewClassifiers/Metrics']) if
-                        not isPackage and name not in ["framework", "log_loss",
-                                                       "matthews_corrcoef",
-                                                       "roc_auc_score"]]
-        metrics = [[metricName] for metricName in metricsNames]
-    metrics = arangeMetrics(metrics, args.CL_metric_princ)
-    for metricIndex, metric in enumerate(metrics):
-        if len(metric) == 1:
-            metrics[metricIndex] = [metric[0], None]
-
-    benchmark = initBenchmark(CL_type, monoviewAlgos, multiviewAlgos, args)
-    initKWARGS = initKWARGSFunc(args, benchmark)
-    dataBaseTime = time.time() - start
-    argumentDictionaries = initMonoviewExps(benchmark, viewsDictionary,
-                                            NB_CLASS, initKWARGS)
-    directories = execution.genDirecortiesNames(directory, statsIter)
-    benchmarkArgumentDictionaries = execution.genArgumentDictionaries(
-        LABELS_DICTIONARY, directories, multiclassLabels,
-        labelsCombinations, indicesMulticlass,
-        hyperParamSearch, args, kFolds,
-        statsIterRandomStates, metrics,
-        argumentDictionaries, benchmark, nbViews,
-        views, viewsIndices)
-    nbMulticlass = len(labelsCombinations)
-
-    execBenchmark(nbCores, statsIter, nbMulticlass,
-                  benchmarkArgumentDictionaries, splits, directories,
-                  directory, multiclassLabels, metrics, LABELS_DICTIONARY,
-                  NB_CLASS, DATASET)
-
-
-#
-# def classifyOneIter_multicore(LABELS_DICTIONARY, argumentDictionaries, nbCores, directory, args, classificationIndices,
-#                               kFolds,
-#                               randomState, hyperParamSearch, metrics, coreIndex, viewsIndices, dataBaseTime, start,
-#                               benchmark,
-#                               views):
-#     """Used to execute mono and multiview classification and result analysis for one random state
-#        using multicore classification"""
-#     resultsMonoview = []
-#     labelsNames = LABELS_DICTIONARY.values()
-#     np.savetxt(directory + "train_indices.csv", classificationIndices[0], delimiter=",")
-#
-#     resultsMonoview += [ExecMonoview_multicore(directory, args.name, labelsNames, classificationIndices, kFolds,
-#                                                coreIndex, args.type, args.pathF, randomState,
-#                                                hyperParamSearch=hyperParamSearch,
-#                                                metrics=metrics, nIter=args.CL_HPS_iter,
-#                                                **arguments)
-#                         for arguments in argumentDictionaries["Monoview"]]
-#     monoviewTime = time.time() - dataBaseTime - start
-#
-#     argumentDictionaries = initMultiviewArguments(args, benchmark, views, viewsIndices, argumentDictionaries,
-#                                                   randomState, directory, resultsMonoview, classificationIndices)
-#
-#     resultsMultiview = []
-#     resultsMultiview += [
-#         ExecMultiview_multicore(directory, coreIndex, args.name, classificationIndices, kFolds, args.type,
-#                                 args.pathF, LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch,
-#                                 metrics=metrics, nIter=args.CL_HPS_iter, **arguments)
-#         for arguments in argumentDictionaries["Multiview"]]
-#     multiviewTime = time.time() - monoviewTime - dataBaseTime - start
-#
-#     labels = np.array(
-#         [resultMonoview[1][3] for resultMonoview in resultsMonoview] + [resultMultiview[3] for resultMultiview in
-#                                                                         resultsMultiview]).transpose()
-#     DATASET = h5py.File(args.pathF + args.name + str(0) + ".hdf5", "r")
-#     trueLabels = DATASET.get("Labels").value
-#     times = [dataBaseTime, monoviewTime, multiviewTime]
-#     results = (resultsMonoview, resultsMultiview)
-#     labelAnalysis = analyzeLabels(labels, trueLabels, results, directory)
-#     logging.debug("Start:\t Analyze Iteration Results")
-#     resultAnalysis(benchmark, results, args.name, times, metrics, directory)
-#     logging.debug("Done:\t Analyze Iteration Results")
-#     globalAnalysisTime = time.time() - monoviewTime - dataBaseTime - start - multiviewTime
-#     totalTime = time.time() - start
-#     logging.info("Extraction time : " + str(int(dataBaseTime)) +
-#                  "s, Monoview time : " + str(int(monoviewTime)) +
-#                  "s, Multiview Time : " + str(int(multiviewTime)) +
-#                  "s, Iteration Analysis Time : " + str(int(globalAnalysisTime)) +
-#                  "s, Iteration Duration : " + str(int(totalTime)) + "s")
-#     return results, labelAnalysis
-#
-#
-# def classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directory, args, classificationIndices, kFolds,
-#                     randomState, hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start,
-#                     benchmark, views):
-#     """Used to execute mono and multiview classification and result analysis for one random state
-#        classification"""
-#     #TODO : Clarify this one
-#
-#
-#     argumentDictionaries = initMultiviewArguments(args, benchmark, views, viewsIndices, argumentDictionaries,
-#                                                   randomState, directory, resultsMonoview, classificationIndices)
-#
-#     resultsMultiview = []
-#     if nbCores > 1:
-#         nbExperiments = len(argumentDictionaries["Multiview"])
-#         for stepIndex in range(int(math.ceil(float(nbExperiments) / nbCores))):
-#             resultsMultiview += Parallel(n_jobs=nbCores)(
-#                 delayed(ExecMultiview_multicore)(directory, coreIndex, args.name, classificationIndices, kFolds,
-#                                                  args.type,
-#                                                  args.pathF,
-#                                                  LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch,
-#                                                  metrics=metrics, nIter=args.CL_HPS_iter,
-#                                                  **argumentDictionaries["Multiview"][stepIndex * nbCores + coreIndex])
-#                 for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))
-#     else:
-#         resultsMultiview = [
-#             ExecMultiview(directory, DATASET, args.name, classificationIndices, kFolds, 1, args.type, args.pathF,
-#                           LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch,
-#                           metrics=metrics, nIter=args.CL_HPS_iter, **arguments) for arguments in
-#             argumentDictionaries["Multiview"]]
-#     multiviewTime = time.time() - monoviewTime - dataBaseTime - start
-#     if nbCores > 1:
-#         logging.debug("Start:\t Deleting " + str(nbCores) + " temporary datasets for multiprocessing")
-#         datasetFiles = DB.deleteHDF5(args.pathF, args.name, nbCores)
-#         logging.debug("Start:\t Deleting datasets for multiprocessing")
-#     labels = np.array(
-#         [resultMonoview[1][3] for resultMonoview in resultsMonoview] + [resultMultiview[3] for resultMultiview in
-#                                                                         resultsMultiview]).transpose()
-#     trueLabels = DATASET.get("Labels").value
-#     times = [dataBaseTime, monoviewTime, multiviewTime]
-#     results = (resultsMonoview, resultsMultiview)
-#     labelAnalysis = analyzeLabels(labels, trueLabels, results, directory)
-#     logging.debug("Start:\t Analyze Iteration Results")
-#     resultAnalysis(benchmark, results, args.name, times, metrics, directory)
-#     logging.debug("Done:\t Analyze Iteration Results")
-#     globalAnalysisTime = time.time() - monoviewTime - dataBaseTime - start - multiviewTime
-#     totalTime = time.time() - start
-#     logging.info("Extraction time : " + str(int(dataBaseTime)) +
-#                  "s, Monoview time : " + str(int(monoviewTime)) +
-#                  "s, Multiview Time : " + str(int(multiviewTime)) +
-#                  "s, Iteration Analysis Time : " + str(int(globalAnalysisTime)) +
-#                  "s, Iteration Duration : " + str(int(totalTime)) + "s")
-#     return results, labelAnalysis
-#
-#
-#
-#
-#
-#
-#
-#     if statsIter > 1:
-#         logging.debug("Start:\t Benchmark classification")
-#         for statIterIndex in range(statsIter):
-#             if not os.path.exists(os.path.dirname(directories[statIterIndex] + "train_labels.csv")):
-#                 try:
-#                     os.makedirs(os.path.dirname(directories[statIterIndex] + "train_labels.csv"))
-#                 except OSError as exc:
-#                     if exc.errno != errno.EEXIST:
-#                         raise
-#             trainIndices, testIndices = classificationIndices[statIterIndex]
-#             trainLabels = DATASET.get("Labels").value[trainIndices]
-#             np.savetxt(directories[statIterIndex] + "train_labels.csv", trainLabels, delimiter=",")
-#         if nbCores > 1:
-#             iterResults = []
-#             nbExperiments = statsIter*len(multiclassLabels)
-#             for stepIndex in range(int(math.ceil(float(nbExperiments) / nbCores))):
-#                 iterResults += (Parallel(n_jobs=nbCores)(
-#                     delayed(classifyOneIter_multicore)(LABELS_DICTIONARY, argumentDictionaries, 1,
-#                                                        directories[coreIndex + stepIndex * nbCores], args,
-#                                                        classificationIndices[coreIndex + stepIndex * nbCores],
-#                                                        kFolds[coreIndex + stepIndex * nbCores],
-#                                                        statsIterRandomStates[coreIndex + stepIndex * nbCores],
-#                                                        hyperParamSearch, metrics, coreIndex, viewsIndices, dataBaseTime,
-#                                                        start, benchmark,
-#                                                        views)
-#                     for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores))))
-#                 logging.debug("Start:\t Deleting " + str(nbCores) + " temporary datasets for multiprocessing")
-#                 datasetFiles = DB.deleteHDF5(args.pathF, args.name, nbCores)
-#                 logging.debug("Start:\t Deleting datasets for multiprocessing")
-#         else:
-#             iterResults = []
-#             for iterIndex in range(statsIter):
-#                 if not os.path.exists(os.path.dirname(directories[iterIndex] + "train_labels.csv")):
-#                     try:
-#                         os.makedirs(os.path.dirname(directories[iterIndex] + "train_labels.csv"))
-#                     except OSError as exc:
-#                         if exc.errno != errno.EEXIST:
-#                             raise
-#                 trainIndices, testIndices = classificationIndices[iterIndex]
-#                 trainLabels = DATASET.get("Labels").value[trainIndices]
-#                 np.savetxt(directories[iterIndex] + "train_labels.csv", trainLabels, delimiter=",")
-#                 iterResults.append(
-#                     classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories[iterIndex], args,
-#                                     classificationIndices[iterIndex], kFolds[iterIndex], statsIterRandomStates[iterIndex],
-#                                     hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start, benchmark,
-#                                     views))
-#         logging.debug("Done:\t Benchmark classification")
-#         logging.debug("Start:\t Global Results Analysis")
-#         classifiersIterResults = []
-#         iterLabelAnalysis = []
-#         for result in iterResults:
-#             classifiersIterResults.append(result[0])
-#             iterLabelAnalysis.append(result[1])
-#
-#         mono,multi = classifiersIterResults[0]
-#         classifiersNames = genNamesFromRes(mono, multi)
-#         analyzeIterLabels(iterLabelAnalysis, directory, classifiersNames)
-#         analyzeIterResults(classifiersIterResults, args.name, metrics, directory)
-#         logging.debug("Done:\t Global Results Analysis")
-#         totalDur = time.time() - start
-#         m, s = divmod(totalDur, 60)
-#         h, m = divmod(m, 60)
-#         d, h = divmod(h, 24)
-#         # print "%d_%02d_%02d" % (h, m, s)
-#         logging.info("Info:\t Total duration : " + str(d) + " days, " + str(h) + " hours, " + str(m) + " mins, " + str(
-#             int(s)) + "secs.")
-#
-#     else:
-#         logging.debug("Start:\t Benchmark classification")
-#         if not os.path.exists(os.path.dirname(directories + "train_labels.csv")):
-#             try:
-#                 os.makedirs(os.path.dirname(directories + "train_labels.csv"))
-#             except OSError as exc:
-#                 if exc.errno != errno.EEXIST:
-#                     raise
-#         trainIndices, testIndices = classificationIndices
-#         trainLabels = DATASET.get("Labels").value[trainIndices]
-#         np.savetxt(directories + "train_labels.csv", trainLabels, delimiter=",")
-#         res, labelAnalysis = classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories, args, classificationIndices,
-#                                              kFolds,
-#                                              statsIterRandomStates, hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start,
-#                                              benchmark, views)
-#         logging.debug("Done:\t Benchmark classification")
-#         totalDur = time.time()-start
-#         m, s = divmod(totalDur, 60)
-#         h, m = divmod(m, 60)
-#         d, h = divmod(h, 24)
-#         # print "%d_%02d_%02d" % (h, m, s)
-#         logging.info("Info:\t Total duration : "+str(d)+ " days, "+str(h)+" hours, "+str(m)+" mins, "+str(int(s))+"secs.")
-#
-#     if statsIter > 1:
-#         pass
+        getDatabase = execution.getDatabaseFunction(name, args.type)
+
+        DATASET, LABELS_DICTIONARY, datasetname = getDatabase(args.views,
+                                                              args.pathF, name,
+                                                              args.CL_nbClass,
+                                                              args.CL_classes,
+                                                              randomState,
+                                                              args.full,
+                                                              args.add_noise,
+                                                              args.noise_std)
+        args.name = datasetname
+
+        splits = execution.genSplits(DATASET.get("Labels").value, args.CL_split,
+                                     statsIterRandomStates)
+
+        multiclassLabels, labelsCombinations, indicesMulticlass = Multiclass.genMulticlassLabels(
+            DATASET.get("Labels").value, multiclassMethod, splits)
+
+        kFolds = execution.genKFolds(statsIter, args.CL_nbFolds,
+                                     statsIterRandomStates)
+
+        datasetFiles = Dataset.initMultipleDatasets(args.pathF, args.name, nbCores)
+
+        # if not views:
+        #     raise ValueError("Empty views list, modify selected views to match dataset " + args.views)
+
+        views, viewsIndices, allViews = execution.initViews(DATASET, args.views)
+        viewsDictionary = genViewsDictionnary(DATASET, views)
+        nbViews = len(views)
+        NB_CLASS = DATASET.get("Metadata").attrs["nbClass"]
+
+        metrics = [metric.split(":") for metric in args.CL_metrics]
+        if metrics == [[""]]:
+            metricsNames = [name for _, name, isPackage
+                            in pkgutil.iter_modules(
+                    ['./MonoMultiViewClassifiers/Metrics']) if
+                            not isPackage and name not in ["framework", "log_loss",
+                                                           "matthews_corrcoef",
+                                                           "roc_auc_score"]]
+            metrics = [[metricName] for metricName in metricsNames]
+        metrics = arangeMetrics(metrics, args.CL_metric_princ)
+        for metricIndex, metric in enumerate(metrics):
+            if len(metric) == 1:
+                metrics[metricIndex] = [metric[0], None]
+
+        benchmark = initBenchmark(CL_type, monoviewAlgos, multiviewAlgos, args)
+        initKWARGS = initKWARGSFunc(args, benchmark)
+        dataBaseTime = time.time() - start
+        argumentDictionaries = initMonoviewExps(benchmark, viewsDictionary,
+                                                NB_CLASS, initKWARGS)
+        directories = execution.genDirecortiesNames(directory, statsIter)
+        benchmarkArgumentDictionaries = execution.genArgumentDictionaries(
+            LABELS_DICTIONARY, directories, multiclassLabels,
+            labelsCombinations, indicesMulticlass,
+            hyperParamSearch, args, kFolds,
+            statsIterRandomStates, metrics,
+            argumentDictionaries, benchmark, nbViews,
+            views, viewsIndices)
+        nbMulticlass = len(labelsCombinations)
+
+        execBenchmark(nbCores, statsIter, nbMulticlass,
+                      benchmarkArgumentDictionaries, splits, directories,
+                      directory, multiclassLabels, metrics, LABELS_DICTIONARY,
+                      NB_CLASS, DATASET)
diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py
index 10f034b7..29c99f15 100644
--- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py
@@ -937,6 +937,9 @@ def getInterpretBase(classifier, directory, classifier_name, weights,
     np.savetxt(directory + "times.csv",
                np.array([classifier.train_time, classifier.predict_time]),
                delimiter=',')
+    np.savetxt(directory + "times_iter.csv",
+               np.array([classifier.train_time, len(weights_sort)]),
+               delimiter=',')
     np.savetxt(directory + "sparsity.csv", np.array([len(weights_sort)]),
                delimiter=',')
     get_accuracy_graph(classifier.train_metrics, classifier_name,
diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py
index c1dfedad..043e095c 100644
--- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py
@@ -7,7 +7,7 @@ Related papers:
 
 """
 from __future__ import print_function, division, absolute_import
-
+import time
 from operator import xor
 
 import numpy as np
@@ -97,6 +97,8 @@ class MinCqClassifier(VotingClassifier):
                             enumerate(self.estimators_generator.estimators_)]
         super().fit(X, y)
 
+        beg = time.time()
+
         # Preparation and resolution of the quadratic program
         # logger.info("Preparing and solving QP...")
         self.weights = self._solve(X, y)
@@ -112,6 +114,8 @@ class MinCqClassifier(VotingClassifier):
             np.sum(np.average(
                 self._binary_classification_matrix(X), axis=1,
                 weights=self.weights) ** 2))
+        end = time.time()
+        self.train_time = end - beg
         return self
 
     def _binary_classification_matrix(self, X):
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py
index 9df79130..07ae58db 100644
--- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py
@@ -106,6 +106,8 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier,
         np.savetxt(directory + "train_metrics.csv", self.metrics, delimiter=',')
         np.savetxt(directory + "times.csv",
                    np.array([self.train_time, self.pred_time]), delimiter=',')
+        np.savetxt(directory + "times_iter.csv",
+                   np.array([self.train_time, len(self.estimator_weights_)]), delimiter=',')
         return interpretString
 
     # def pregen_voters(self, X, y=None):
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py
index dcf6c5b6..620b3ff7 100644
--- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py
@@ -12,7 +12,7 @@ Related papers:
 http://graal.ift.ulaval.ca/majorityvote/
 """
 __author__ = 'Jean-Francis Roy'
-
+import time
 import logging
 from sklearn.base import BaseEstimator, ClassifierMixin
 from sklearn.metrics.pairwise import rbf_kernel, linear_kernel, \
@@ -142,7 +142,7 @@ class MinCqLearner(BaseEstimator, ClassifierMixin):
         if self.log:
             logging.info("Preparing QP...")
         self._prepare_qp(X, y_reworked)
-
+        beg = time.time()
         try:
             if self.log:
                 logging.info("Solving QP...")
@@ -163,7 +163,8 @@ class MinCqLearner(BaseEstimator, ClassifierMixin):
                     str(self), str(e)))
             self.majority_vote = None
         self.cbound_train = self.majority_vote.cbound_value(X, y_reworked)
-
+        end = time.time()
+        self.train_time = end - beg
         return self
 
     def predict(self, X, save_data=True):
@@ -608,6 +609,7 @@ class MinCQ(MinCqLearner, BaseMonoviewClassifier):
             y_rework[np.where(y_rework == 0)] = -1
             interpret_string += "\n Test c_bound value : " + str(
                 self.majority_vote.cbound_value(self.x_test, y_rework))
+        np.savetxt(directory + "times.csv", np.array([self.train_time, 0]))
         return interpret_string
 
     def get_name_for_fusion(self):
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
index 92b96cbf..f9e5ba3b 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
@@ -22,9 +22,9 @@ def parseTheArgs(arguments):
     groupStandard = parser.add_argument_group('Standard arguments')
     groupStandard.add_argument('-log', action='store_true',
                                help='Use option to activate logging to console')
-    groupStandard.add_argument('--name', metavar='STRING', action='store',
+    groupStandard.add_argument('--name', metavar='STRING', nargs='+', action='store',
                                help='Name of Database (default: %(default)s)',
-                               default='Plausible')
+                               default=['Plausible'])
     groupStandard.add_argument('--label', metavar='STRING', action='store',
                                help='Labeling the results directory (default: '
                                     '%(default)s)',
@@ -375,7 +375,7 @@
                                action='store', nargs="+",
                                help='Set the n_max_iterations parameter for '
                                     'CGreed',
-                               default=[100])
+                               default=[10])
 
     groupCBBoost= parser.add_argument_group('CBBoost arguments')
     groupCBBoost.add_argument('--CBB_stumps', nargs="+", metavar='INT', type=int,
@@ -960,6 +960,16 @@ def genDirecortiesNames(directory, statsIter):
     return directories
 
 
+def find_dataset_names(path, type, names):
+    """Browse the dataset directory and extract all the needed dataset names."""
+    available_file_names = [file_name.strip().split(".")[0] for file_name in os.listdir(path) if file_name.endswith(type)]
+    if names == ["all"]:
+        return available_file_names
+    elif len(names) > 1:
+        return [used_name for used_name in available_file_names if used_name in names]
+    else:
+        return names
+
 def genArgumentDictionaries(labelsDictionary, directories, multiclassLabels,
                             labelsCombinations, indicesMulticlass,
                             hyperParamSearch, args, kFolds,
-- 
GitLab
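
Note on the new multi-dataset behaviour: --name now accepts several dataset
names, or the single value "all", and execClassif runs the whole benchmark
once per resolved name. Below is a standalone sketch (not part of the patch)
that replicates the lookup logic of find_dataset_names so it can be tried in
isolation. The scratch directory and the file names (plausible.hdf5, awa.hdf5,
mnist.hdf5) are hypothetical, and the parameter `type` is renamed `file_type`
here to avoid shadowing the Python builtin.

    import os
    import tempfile

    def find_dataset_names(path, file_type, names):
        # Stems of every file in `path` whose name ends with `file_type`.
        available = [f.strip().split(".")[0]
                     for f in os.listdir(path) if f.endswith(file_type)]
        if names == ["all"]:
            return available        # run every dataset found on disk
        elif len(names) > 1:
            # Keep only the requested names that actually exist on disk.
            return [n for n in available if n in names]
        else:
            return names            # a single name is returned unchecked

    # Hypothetical demo: three HDF5 files in a scratch directory.
    with tempfile.TemporaryDirectory() as path:
        for stem in ("plausible", "awa", "mnist"):
            open(os.path.join(path, stem + ".hdf5"), "w").close()
        print(sorted(find_dataset_names(path, ".hdf5", ["all"])))
        print(sorted(find_dataset_names(path, ".hdf5", ["awa", "mnist"])))
        print(find_dataset_names(path, ".hdf5", ["plausible"]))

Two consequences of the branch structure are worth noting: a single --name
value bypasses the existence check entirely (the else branch), and when
several names are given, any name that matches no file is silently dropped.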
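The timing additions in BoostUtils.py, AdaboostPregen.py, MinCQUtils.py and
MinCQ.py all follow one convention: bracket the expensive part of fit() with
time.time() and write train_time next to the ensemble size, so per-voter
training cost can be compared across classifiers. A minimal sketch of that
convention, using a scikit-learn AdaBoost model as a stand-in for the patched
classifiers (the synthetic dataset and the output path are illustrative):

    import time
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import AdaBoostClassifier

    X, y = make_classification(n_samples=200, random_state=0)
    clf = AdaBoostClassifier(n_estimators=50, random_state=0)

    beg = time.time()              # same bracketing as the patched fit()
    clf.fit(X, y)
    train_time = time.time() - beg

    # times_iter.csv convention from the patch: [train_time, number of voters]
    np.savetxt("times_iter.csv",
               np.array([train_time, len(clf.estimator_weights_)]),
               delimiter=',')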