diff --git a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py index 07535d9dfee05bb129322d844474f32f9d616f44..8c73ab9aadb2c8da300010435788e8d82bcc7ef7 100644 --- a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py +++ b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py @@ -9,6 +9,7 @@ import matplotlib import itertools import numpy as np from joblib import Parallel, delayed +from sklearn.tree import DecisionTreeClassifier matplotlib.use( 'Agg') # Anti-Grain Geometry C++ library to make a raster (pixel) image of the figure @@ -20,7 +21,7 @@ from .Multiview.ExecMultiview import ExecMultiview, ExecMultiview_multicore from .Monoview.ExecClassifMonoView import ExecMonoview, ExecMonoview_multicore from .utils import GetMultiviewDb as DB from .ResultAnalysis import \ - getResults # resultAnalysis, analyzeLabels, analyzeIterResults, analyzeIterLabels, genNamesFromRes, + getResults, plot_results_noise # resultAnalysis, analyzeLabels, analyzeIterResults, analyzeIterLabels, genNamesFromRes, from .utils import execution, Dataset, Multiclass # Author-Info @@ -161,8 +162,8 @@ def initMonoviewExps(benchmark, viewsDictionary, nbClass, kwargsInit): argumentDictionaries["Monoview"] += gen_multiple_args_dictionnaries(nbClass, kwargsInit, classifier, viewName, viewIndex) else: arguments = { - "args": {classifier + "KWARGS": kwargsInit[ - classifier + "KWARGSInit"], "feat": viewName, + "args": {classifier + "KWARGS": dict((key, value[0]) for key, value in kwargsInit[ + classifier + "KWARGSInit"].items()), "feat": viewName, "CL_type": classifier, "nbClass": nbClass}, "viewIndex": viewIndex} argumentDictionaries["Monoview"].append(arguments) @@ -183,15 +184,23 @@ def gen_multiple_kwargs_combinations(clKWARGS): keys = clKWARGS.keys() kwargs_combination = [dict((key, value) for key, value in zip(keys, values)) for values in values_cartesian_prod] - return kwargs_combination + + reduce_dict = {DecisionTreeClassifier: "DT", } + reduced_listed_values = [ + [_ if type(_) not in reduce_dict else reduce_dict[type(_)] for _ in + list_] for list_ in listed_values] + reduced_values_cartesian_prod = [_ for _ in itertools.product(*reduced_listed_values)] + reduced_kwargs_combination = [dict((key, value) for key, value in zip(keys, values)) + for values in reduced_values_cartesian_prod] + return kwargs_combination, reduced_kwargs_combination def gen_multiple_args_dictionnaries(nbClass, kwargsInit, classifier, viewName, viewIndex): - multiple_kwargs_list = gen_multiple_kwargs_combinations(kwargsInit[classifier + "KWARGSInit"]) + multiple_kwargs_list, reduced_multiple_kwargs_list = gen_multiple_kwargs_combinations(kwargsInit[classifier + "KWARGSInit"]) multiple_kwargs_dict = dict( - (classifier+"_"+"_".join(map(str,list(dictionary.values()))), dictionary) - for dictionary in multiple_kwargs_list) + (classifier+"_"+"_".join(map(str,list(reduced_dictionary.values()))), dictionary) + for reduced_dictionary, dictionary in zip(reduced_multiple_kwargs_list, multiple_kwargs_list )) args_dictionnaries = [{ "args": {classifier_name + "KWARGS": arguments, "feat": viewName, @@ -560,12 +569,15 @@ def execBenchmark(nbCores, statsIter, nbMulticlass, classificationIndices[0][1]) multiclassGroundTruth = DATASET.get("Labels").value logging.debug("Start:\t Analyzing predictions") - getResults(results, statsIter, nbMulticlass, benchmarkArgumentsDictionaries, + results_mean_stds =getResults(results, statsIter, nbMulticlass, benchmarkArgumentsDictionaries, 
multiclassGroundTruth, metrics, classificationIndices, directories, directory, labelsDictionary, nbExamples, nbLabels) logging.debug("Done:\t Analyzing predictions") - - return results + filename = DATASET.filename + DATASET.close() + if "_temp_" in filename: + os.remove(filename) + return results_mean_stds def execClassif(arguments): @@ -583,285 +595,85 @@ def execClassif(arguments): CL_type = args.CL_type monoviewAlgos = args.CL_algos_monoview multiviewAlgos = args.CL_algos_multiview - - directory = execution.initLogFile(args.name, args.views, args.CL_type, - args.log, args.debug, args.label, - args.res_dir) - randomState = execution.initRandomState(args.randomState, directory) - statsIterRandomStates = execution.initStatsIterRandomStates(statsIter, - randomState) - - getDatabase = execution.getDatabaseFunction(args.name, args.type) - - DATASET, LABELS_DICTIONARY, datasetname = getDatabase(args.views, - args.pathF, args.name, - args.CL_nbClass, - args.CL_classes, - randomState, - args.full, - args.add_noise, - args.noise_std) - args.name = datasetname - - splits = execution.genSplits(DATASET.get("Labels").value, args.CL_split, - statsIterRandomStates) - - multiclassLabels, labelsCombinations, indicesMulticlass = Multiclass.genMulticlassLabels( - DATASET.get("Labels").value, multiclassMethod, splits) - - kFolds = execution.genKFolds(statsIter, args.CL_nbFolds, - statsIterRandomStates) - - datasetFiles = Dataset.initMultipleDatasets(args.pathF, args.name, nbCores) - - # if not views: - # raise ValueError("Empty views list, modify selected views to match dataset " + args.views) - - views, viewsIndices, allViews = execution.initViews(DATASET, args.views) - viewsDictionary = genViewsDictionnary(DATASET, views) - nbViews = len(views) - NB_CLASS = DATASET.get("Metadata").attrs["nbClass"] - - metrics = [metric.split(":") for metric in args.CL_metrics] - if metrics == [[""]]: - metricsNames = [name for _, name, isPackage - in pkgutil.iter_modules( - ['./MonoMultiViewClassifiers/Metrics']) if - not isPackage and name not in ["framework", "log_loss", - "matthews_corrcoef", - "roc_auc_score"]] - metrics = [[metricName] for metricName in metricsNames] - metrics = arangeMetrics(metrics, args.CL_metric_princ) - for metricIndex, metric in enumerate(metrics): - if len(metric) == 1: - metrics[metricIndex] = [metric[0], None] - - benchmark = initBenchmark(CL_type, monoviewAlgos, multiviewAlgos, args) - print(benchmark, "\n") - - initKWARGS = initKWARGSFunc(args, benchmark) - - dataBaseTime = time.time() - start - - argumentDictionaries = initMonoviewExps(benchmark, viewsDictionary, - NB_CLASS, initKWARGS) - print(argumentDictionaries, "\n") - directories = execution.genDirecortiesNames(directory, statsIter) - benchmarkArgumentDictionaries = execution.genArgumentDictionaries( - LABELS_DICTIONARY, directories, multiclassLabels, - labelsCombinations, indicesMulticlass, - hyperParamSearch, args, kFolds, - statsIterRandomStates, metrics, - argumentDictionaries, benchmark, nbViews, - views, viewsIndices) - print(benchmarkArgumentDictionaries, "\n") - nbMulticlass = len(labelsCombinations) - - execBenchmark(nbCores, statsIter, nbMulticlass, - benchmarkArgumentDictionaries, splits, directories, - directory, multiclassLabels, metrics, LABELS_DICTIONARY, - NB_CLASS, DATASET) - - # -# def classifyOneIter_multicore(LABELS_DICTIONARY, argumentDictionaries, nbCores, directory, args, classificationIndices, -# kFolds, -# randomState, hyperParamSearch, metrics, coreIndex, viewsIndices, dataBaseTime, start, -# benchmark, -# 
views): -# """Used to execute mono and multiview classification and result analysis for one random state -# using multicore classification""" -# resultsMonoview = [] -# labelsNames = LABELS_DICTIONARY.values() -# np.savetxt(directory + "train_indices.csv", classificationIndices[0], delimiter=",") -# -# resultsMonoview += [ExecMonoview_multicore(directory, args.name, labelsNames, classificationIndices, kFolds, -# coreIndex, args.type, args.pathF, randomState, -# hyperParamSearch=hyperParamSearch, -# metrics=metrics, nIter=args.CL_HPS_iter, -# **arguments) -# for arguments in argumentDictionaries["Monoview"]] -# monoviewTime = time.time() - dataBaseTime - start -# -# argumentDictionaries = initMultiviewArguments(args, benchmark, views, viewsIndices, argumentDictionaries, -# randomState, directory, resultsMonoview, classificationIndices) -# -# resultsMultiview = [] -# resultsMultiview += [ -# ExecMultiview_multicore(directory, coreIndex, args.name, classificationIndices, kFolds, args.type, -# args.pathF, LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch, -# metrics=metrics, nIter=args.CL_HPS_iter, **arguments) -# for arguments in argumentDictionaries["Multiview"]] -# multiviewTime = time.time() - monoviewTime - dataBaseTime - start -# -# labels = np.array( -# [resultMonoview[1][3] for resultMonoview in resultsMonoview] + [resultMultiview[3] for resultMultiview in -# resultsMultiview]).transpose() -# DATASET = h5py.File(args.pathF + args.name + str(0) + ".hdf5", "r") -# trueLabels = DATASET.get("Labels").value -# times = [dataBaseTime, monoviewTime, multiviewTime] -# results = (resultsMonoview, resultsMultiview) -# labelAnalysis = analyzeLabels(labels, trueLabels, results, directory) -# logging.debug("Start:\t Analyze Iteration Results") -# resultAnalysis(benchmark, results, args.name, times, metrics, directory) -# logging.debug("Done:\t Analyze Iteration Results") -# globalAnalysisTime = time.time() - monoviewTime - dataBaseTime - start - multiviewTime -# totalTime = time.time() - start -# logging.info("Extraction time : " + str(int(dataBaseTime)) + -# "s, Monoview time : " + str(int(monoviewTime)) + -# "s, Multiview Time : " + str(int(multiviewTime)) + -# "s, Iteration Analysis Time : " + str(int(globalAnalysisTime)) + -# "s, Iteration Duration : " + str(int(totalTime)) + "s") -# return results, labelAnalysis -# -# -# def classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directory, args, classificationIndices, kFolds, -# randomState, hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start, -# benchmark, views): -# """Used to execute mono and multiview classification and result analysis for one random state -# classification""" -# #TODO : Clarify this one -# -# -# argumentDictionaries = initMultiviewArguments(args, benchmark, views, viewsIndices, argumentDictionaries, -# randomState, directory, resultsMonoview, classificationIndices) -# -# resultsMultiview = [] -# if nbCores > 1: -# nbExperiments = len(argumentDictionaries["Multiview"]) -# for stepIndex in range(int(math.ceil(float(nbExperiments) / nbCores))): -# resultsMultiview += Parallel(n_jobs=nbCores)( -# delayed(ExecMultiview_multicore)(directory, coreIndex, args.name, classificationIndices, kFolds, -# args.type, -# args.pathF, -# LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch, -# metrics=metrics, nIter=args.CL_HPS_iter, -# **argumentDictionaries["Multiview"][stepIndex * nbCores + coreIndex]) -# for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores))) -# 
else: -# resultsMultiview = [ -# ExecMultiview(directory, DATASET, args.name, classificationIndices, kFolds, 1, args.type, args.pathF, -# LABELS_DICTIONARY, randomState, hyperParamSearch=hyperParamSearch, -# metrics=metrics, nIter=args.CL_HPS_iter, **arguments) for arguments in -# argumentDictionaries["Multiview"]] -# multiviewTime = time.time() - monoviewTime - dataBaseTime - start -# if nbCores > 1: -# logging.debug("Start:\t Deleting " + str(nbCores) + " temporary datasets for multiprocessing") -# datasetFiles = DB.deleteHDF5(args.pathF, args.name, nbCores) -# logging.debug("Start:\t Deleting datasets for multiprocessing") -# labels = np.array( -# [resultMonoview[1][3] for resultMonoview in resultsMonoview] + [resultMultiview[3] for resultMultiview in -# resultsMultiview]).transpose() -# trueLabels = DATASET.get("Labels").value -# times = [dataBaseTime, monoviewTime, multiviewTime] -# results = (resultsMonoview, resultsMultiview) -# labelAnalysis = analyzeLabels(labels, trueLabels, results, directory) -# logging.debug("Start:\t Analyze Iteration Results") -# resultAnalysis(benchmark, results, args.name, times, metrics, directory) -# logging.debug("Done:\t Analyze Iteration Results") -# globalAnalysisTime = time.time() - monoviewTime - dataBaseTime - start - multiviewTime -# totalTime = time.time() - start -# logging.info("Extraction time : " + str(int(dataBaseTime)) + -# "s, Monoview time : " + str(int(monoviewTime)) + -# "s, Multiview Time : " + str(int(multiviewTime)) + -# "s, Iteration Analysis Time : " + str(int(globalAnalysisTime)) + -# "s, Iteration Duration : " + str(int(totalTime)) + "s") -# return results, labelAnalysis -# -# -# -# -# -# -# -# if statsIter > 1: -# logging.debug("Start:\t Benchmark classification") -# for statIterIndex in range(statsIter): -# if not os.path.exists(os.path.dirname(directories[statIterIndex] + "train_labels.csv")): -# try: -# os.makedirs(os.path.dirname(directories[statIterIndex] + "train_labels.csv")) -# except OSError as exc: -# if exc.errno != errno.EEXIST: -# raise -# trainIndices, testIndices = classificationIndices[statIterIndex] -# trainLabels = DATASET.get("Labels").value[trainIndices] -# np.savetxt(directories[statIterIndex] + "train_labels.csv", trainLabels, delimiter=",") -# if nbCores > 1: -# iterResults = [] -# nbExperiments = statsIter*len(multiclassLabels) -# for stepIndex in range(int(math.ceil(float(nbExperiments) / nbCores))): -# iterResults += (Parallel(n_jobs=nbCores)( -# delayed(classifyOneIter_multicore)(LABELS_DICTIONARY, argumentDictionaries, 1, -# directories[coreIndex + stepIndex * nbCores], args, -# classificationIndices[coreIndex + stepIndex * nbCores], -# kFolds[coreIndex + stepIndex * nbCores], -# statsIterRandomStates[coreIndex + stepIndex * nbCores], -# hyperParamSearch, metrics, coreIndex, viewsIndices, dataBaseTime, -# start, benchmark, -# views) -# for coreIndex in range(min(nbCores, nbExperiments - stepIndex * nbCores)))) -# logging.debug("Start:\t Deleting " + str(nbCores) + " temporary datasets for multiprocessing") -# datasetFiles = DB.deleteHDF5(args.pathF, args.name, nbCores) -# logging.debug("Start:\t Deleting datasets for multiprocessing") -# else: -# iterResults = [] -# for iterIndex in range(statsIter): -# if not os.path.exists(os.path.dirname(directories[iterIndex] + "train_labels.csv")): -# try: -# os.makedirs(os.path.dirname(directories[iterIndex] + "train_labels.csv")) -# except OSError as exc: -# if exc.errno != errno.EEXIST: -# raise -# trainIndices, testIndices = classificationIndices[iterIndex] 
-# trainLabels = DATASET.get("Labels").value[trainIndices] -# np.savetxt(directories[iterIndex] + "train_labels.csv", trainLabels, delimiter=",") -# iterResults.append( -# classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories[iterIndex], args, -# classificationIndices[iterIndex], kFolds[iterIndex], statsIterRandomStates[iterIndex], -# hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start, benchmark, -# views)) -# logging.debug("Done:\t Benchmark classification") -# logging.debug("Start:\t Global Results Analysis") -# classifiersIterResults = [] -# iterLabelAnalysis = [] -# for result in iterResults: -# classifiersIterResults.append(result[0]) -# iterLabelAnalysis.append(result[1]) -# -# mono,multi = classifiersIterResults[0] -# classifiersNames = genNamesFromRes(mono, multi) -# analyzeIterLabels(iterLabelAnalysis, directory, classifiersNames) -# analyzeIterResults(classifiersIterResults, args.name, metrics, directory) -# logging.debug("Done:\t Global Results Analysis") -# totalDur = time.time() - start -# m, s = divmod(totalDur, 60) -# h, m = divmod(m, 60) -# d, h = divmod(h, 24) -# # print "%d_%02d_%02d" % (h, m, s) -# logging.info("Info:\t Total duration : " + str(d) + " days, " + str(h) + " hours, " + str(m) + " mins, " + str( -# int(s)) + "secs.") -# -# else: -# logging.debug("Start:\t Benchmark classification") -# if not os.path.exists(os.path.dirname(directories + "train_labels.csv")): -# try: -# os.makedirs(os.path.dirname(directories + "train_labels.csv")) -# except OSError as exc: -# if exc.errno != errno.EEXIST: -# raise -# trainIndices, testIndices = classificationIndices -# trainLabels = DATASET.get("Labels").value[trainIndices] -# np.savetxt(directories + "train_labels.csv", trainLabels, delimiter=",") -# res, labelAnalysis = classifyOneIter(LABELS_DICTIONARY, argumentDictionaries, nbCores, directories, args, classificationIndices, -# kFolds, -# statsIterRandomStates, hyperParamSearch, metrics, DATASET, viewsIndices, dataBaseTime, start, -# benchmark, views) -# logging.debug("Done:\t Benchmark classification") -# totalDur = time.time()-start -# m, s = divmod(totalDur, 60) -# h, m = divmod(m, 60) -# d, h = divmod(h, 24) -# # print "%d_%02d_%02d" % (h, m, s) -# logging.info("Info:\t Total duration : "+str(d)+ " days, "+str(h)+" hours, "+str(m)+" mins, "+str(int(s))+"secs.") -# -# if statsIter > 1: -# pass + dataset_list = execution.find_dataset_names(args.pathF, args.type, args.name) + + if not args.add_noise: + args.noise_std=[0.0] + + for name in dataset_list: + noise_results = [] + for noise_std in args.noise_std: + + directory = execution.initLogFile(name, args.views, args.CL_type, + args.log, args.debug, args.label, + args.res_dir, args.add_noise, noise_std) + randomState = execution.initRandomState(args.randomState, directory) + statsIterRandomStates = execution.initStatsIterRandomStates(statsIter, + randomState) + + getDatabase = execution.getDatabaseFunction(name, args.type) + + DATASET, LABELS_DICTIONARY, datasetname = getDatabase(args.views, + args.pathF, name, + args.CL_nbClass, + args.CL_classes, + randomState, + args.full, + args.add_noise, + noise_std) + args.name = datasetname + + splits = execution.genSplits(DATASET.get("Labels").value, args.CL_split, + statsIterRandomStates) + + multiclassLabels, labelsCombinations, indicesMulticlass = Multiclass.genMulticlassLabels( + DATASET.get("Labels").value, multiclassMethod, splits) + + kFolds = execution.genKFolds(statsIter, args.CL_nbFolds, + statsIterRandomStates) + + 
datasetFiles = Dataset.initMultipleDatasets(args.pathF, args.name, nbCores) + + # if not views: + # raise ValueError("Empty views list, modify selected views to match dataset " + args.views) + + views, viewsIndices, allViews = execution.initViews(DATASET, args.views) + viewsDictionary = genViewsDictionnary(DATASET, views) + nbViews = len(views) + NB_CLASS = DATASET.get("Metadata").attrs["nbClass"] + + metrics = [metric.split(":") for metric in args.CL_metrics] + if metrics == [[""]]: + metricsNames = [name for _, name, isPackage + in pkgutil.iter_modules( + ['./MonoMultiViewClassifiers/Metrics']) if + not isPackage and name not in ["framework", "log_loss", + "matthews_corrcoef", + "roc_auc_score"]] + metrics = [[metricName] for metricName in metricsNames] + metrics = arangeMetrics(metrics, args.CL_metric_princ) + for metricIndex, metric in enumerate(metrics): + if len(metric) == 1: + metrics[metricIndex] = [metric[0], None] + + benchmark = initBenchmark(CL_type, monoviewAlgos, multiviewAlgos, args) + initKWARGS = initKWARGSFunc(args, benchmark) + dataBaseTime = time.time() - start + argumentDictionaries = initMonoviewExps(benchmark, viewsDictionary, + NB_CLASS, initKWARGS) + directories = execution.genDirecortiesNames(directory, statsIter) + benchmarkArgumentDictionaries = execution.genArgumentDictionaries( + LABELS_DICTIONARY, directories, multiclassLabels, + labelsCombinations, indicesMulticlass, + hyperParamSearch, args, kFolds, + statsIterRandomStates, metrics, + argumentDictionaries, benchmark, nbViews, + views, viewsIndices) + nbMulticlass = len(labelsCombinations) + + results_mean_stds = execBenchmark(nbCores, statsIter, nbMulticlass, + benchmarkArgumentDictionaries, splits, directories, + directory, multiclassLabels, metrics, LABELS_DICTIONARY, + NB_CLASS, DATASET) + noise_results.append([noise_std, results_mean_stds]) + plot_results_noise(directory, noise_results, metrics[0][0], name) diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py index 5573f6269d86f4c82cc169efebcfa010db40c2be..29c99f15d90bd48a1a8178ffd8c339649f50736b 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py @@ -851,7 +851,7 @@ def get_accuracy_graph(plotted_data, classifier_name, file_name, # plt.tight_layout() else: ax.legend((scat,), (name,)) - f.savefig(file_name) + f.savefig(file_name, transparent=True) plt.close() @@ -937,6 +937,9 @@ def getInterpretBase(classifier, directory, classifier_name, weights, np.savetxt(directory + "times.csv", np.array([classifier.train_time, classifier.predict_time]), delimiter=',') + np.savetxt(directory + "times_iter.csv", + np.array([classifier.train_time, len(weights_sort)]), + delimiter=',') np.savetxt(directory + "sparsity.csv", np.array([len(weights_sort)]), delimiter=',') get_accuracy_graph(classifier.train_metrics, classifier_name, diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CBBoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CBBoostUtils.py new file mode 100644 index 0000000000000000000000000000000000000000..38b3ab8740f4068e0478eafcb69932cdd55ec3a3 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CBBoostUtils.py @@ -0,0 +1,533 @@ +import logging +import math +import time + +import numpy as np +import numpy.ma as ma +import scipy +from 
sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.validation import check_is_fitted + +from .BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, \ + getInterpretBase, get_accuracy_graph, TreeClassifiersGenerator +from ..MonoviewUtils import change_label_to_minus +from ... import Metrics + + +# Used for CBBoost + +class CBBoostClassifier(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, n_max_iterations=100, estimators_generator="Stumps", + random_state=42, self_complemented=True, twice_the_same=True, + random_start=False, n_stumps=1, c_bound_sol=True, + plotted_metric=Metrics.zero_one_loss, save_train_data=False, + test_graph=True, mincq_tracking=False): + super(CBBoostClassifier, self).__init__() + r""" + + Parameters + ---------- + n_max_iterations : int + Maximum number of iterations for the boosting algorithm. + estimators_generator : object + Sk-learn classifier object used to generate the hypotheses with the data. + random_state : np.random.RandomState or int + The random state, used in order to be reproductible + self_complemented : bool + If True, in the hypotheses generation process, for each hypothesis, it's complement will be generated too. + twice_the_same : bool + If True, the algorithm will be allowed to select twice the same hypothesis in the boosting process. + c_bound_choice : bool + If True, the C-Bound will be used to select the hypotheses. If False, the margin will be the criterion. + n_stumps_per_attribute : int + The number of hypotheses generated by data attribute + use_r : bool + If True, uses edge to compute the performance of a voter. If False, use the error instead. + plotted_metric : Metric module + The metric that will be plotted for each iteration of boosting. + """ + if type(random_state) is int: + self.random_state = np.random.RandomState(random_state) + else: + self.random_state = random_state + self.train_time = 0 + self.train_shape = None + self.step_decisions = None + self.step_prod = None + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.self_complemented = self_complemented + self.twice_the_same = twice_the_same + self.random_start = random_start + self.plotted_metric = plotted_metric + self.n_stumps = n_stumps + self.c_bound_sol = c_bound_sol + self.save_train_data = save_train_data + self.test_graph = test_graph + self.printed_args_name_list = ["n_max_iterations", "self_complemented", + "twice_the_same", + "random_start", + "n_stumps",] + self.mincq_tracking = mincq_tracking + + def fit(self, X, y): + + formatted_X, formatted_y = self.format_X_y(X, y) + + self.init_info_containers() + + # Initialize the weak classifiers ensemble + m, n, y_kernel_matrix = self.init_hypotheses(formatted_X, formatted_y) + + start = time.time() + self.n_total_hypotheses_ = n + self.n_total_examples = m + + # Initialize the majority vote + self.init_boosting(m, formatted_y, y_kernel_matrix) + + self.break_cause = " the maximum number of iterations was attained." + + for k in range(min(n - 1, + self.n_max_iterations - 1 if self.n_max_iterations is not None else np.inf)): + + # Print dynamically the step and the error of the current classifier + self.it = k + print( + "Resp. bound : {}/{}".format( + k + 2, + self.n_max_iterations), + end="\r") + + # Find the best (weight, voter) couple. 
+ self.q, new_voter_index = self._find_new_voter(y_kernel_matrix, + formatted_y) + + if type(self.q) == str: + self.break_cause = new_voter_index # + break + + self.append_new_voter(new_voter_index) + self.weights_.append(self.q) + + voter_perf = self.compute_voter_perf(formatted_y) + + self.update_info_containers(formatted_y, voter_perf, k) + + self.estimators_generator.choose(self.chosen_columns_) + + self.nb_opposed_voters = self.check_opposed_voters() + if self.save_train_data: + self.X_train = self.classification_matrix[:, self.chosen_columns_] + self.raw_weights = self.weights_ + self.y_train = formatted_y + + self.weights_ = np.array(self.weights_)/np.sum(np.array(self.weights_)) + + formatted_y[formatted_y == -1] = 0 + formatted_y = formatted_y.reshape((m,)) + + end = time.time() + self.train_time = end - start + return self + + def predict(self, X): + start = time.time() + check_is_fitted(self, 'weights_') + if scipy.sparse.issparse(X): + logging.warning('Converting sparse matrix to dense matrix.') + X = np.array(X.todense()) + + classification_matrix = self._binary_classification_matrix(X) + margins = np.sum(classification_matrix * self.weights_, axis=1) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + + end = time.time() + self.predict_time = end - start + + # Predict for each step of the boosting process + self.step_predict(classification_matrix) + + return signs_array + + def step_predict(self, classification_matrix): + """Used to predict with each step of the greedy algorithm to analyze its performance increase""" + if classification_matrix.shape != self.train_shape: + self.step_decisions = np.zeros(classification_matrix.shape) + self.mincq_step_decisions = np.zeros(classification_matrix.shape) + self.step_prod = np.zeros(classification_matrix.shape) + for weight_index in range(self.weights_.shape[0] - 1): + margins = np.sum( + classification_matrix[:, :weight_index + 1] * self.weights_[ + :weight_index + 1], + axis=1) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + self.step_decisions[:, weight_index] = signs_array + self.step_prod[:, weight_index] = np.sum( + classification_matrix[:, :weight_index + 1] * self.weights_[ + :weight_index + 1], + axis=1) + if self.mincq_tracking: + if weight_index == 0: + self.mincq_step_decisions[:, weight_index] = signs_array + else: + mincq_margins = np.sum(self.mincq_learners[ + weight_index - 1].majority_vote._weights * classification_matrix[ + :, + :weight_index + 1], + axis=1) + mincq_signs_array = np.array( + [int(x) for x in sign(mincq_margins)]) + mincq_signs_array[mincq_signs_array == -1] = 0 + self.mincq_step_decisions[:, + weight_index] = mincq_signs_array + # self.mincq_step_cbounds = self.mincq_learners[weight_index-1].majority_vote.cbound_value() + + def update_info_containers(self, y, voter_perf, k): + """Is used at each iteration to compute and store all the needed quantities for later analysis""" + self.tau.append( + np.sum(np.multiply(self.previous_vote, self.new_voter)) / float( + self.n_total_examples)) + # print(np.sum(np.multiply(self.previous_vote, self.new_voter))/float(self.n_total_examples)) + self.previous_vote += self.q * self.new_voter + self.norm.append(np.linalg.norm(self.previous_vote) ** 2) + self.previous_votes.append(self.previous_vote) + self.previous_margins.append( + np.sum(np.multiply(y, self.previous_vote)) / float( + self.n_total_examples)) + self.selected_margins.append( + np.sum(np.multiply(y, 
self.new_voter)) / float( + self.n_total_examples)) + train_metric = self.plotted_metric.score(y, np.sign(self.previous_vote)) + self.train_metrics.append(train_metric) + + # Used to compute the optimal c-bound distribution on the chose set + if self.mincq_tracking: + from ...MonoviewClassifiers.MinCQ import MinCqLearner + mincq = MinCqLearner(10e-3, "stumps", n_stumps_per_attribute=1, + self_complemented=False) + training_set = self.classification_matrix[:, self.chosen_columns_] + mincq.fit(training_set, y) + mincq_pred = mincq.predict(training_set) + self.mincq_learners.append(mincq) + self.mincq_train_metrics.append( + self.plotted_metric.score(y, change_label_to_minus(mincq_pred))) + self.mincq_weights.append(mincq.majority_vote._weights) + self.mincq_c_bounds.append( + mincq.majority_vote.cbound_value(training_set, + y.reshape((y.shape[0],)))) + + def compute_voter_perf(self, formatted_y): + """Used to computer the performance (error or edge) of the selected voter""" + epsilon = self._compute_epsilon(formatted_y) + self.voter_perfs.append(epsilon) + return epsilon + + def _compute_epsilon(self, y): + """Updating the error variable, the old fashioned way uses the whole majority vote to update the error""" + ones_matrix = np.zeros(y.shape) + ones_matrix[np.multiply(y, self.new_voter.reshape( + y.shape)) < 0] = 1 # can np.divide if needed + epsilon = np.average(np.multiply(y, self.new_voter.reshape( + y.shape)), axis=0) + return epsilon + + def append_new_voter(self, new_voter_index): + """Used to append the voter to the majority vote""" + self.chosen_columns_.append(new_voter_index) + self.new_voter = self.classification_matrix[:, new_voter_index].reshape( + (self.n_total_examples, 1)) + + def init_boosting(self, m, y, y_kernel_matrix): + """THis initialization corressponds to the first round of boosting with equal weights for each examples and the voter chosen by it's margin.""" + + if self.random_start: + first_voter_index = self.random_state.choice( + np.where(np.sum(y_kernel_matrix, axis=0) > 0)[0]) + else: + first_voter_index, _ = self._find_best_weighted_margin( + y_kernel_matrix) + + self.chosen_columns_.append(first_voter_index) + self.new_voter = np.array(self.classification_matrix[:, + first_voter_index].reshape((m, 1)), copy=True) + + self.previous_vote = self.new_voter + self.norm.append(np.linalg.norm(self.previous_vote) ** 2) + + self.q = 1 + self.weights_.append(self.q) + + self.previous_margins.append( + np.sum(np.multiply(y, self.previous_vote)) / float( + self.n_total_examples)) + self.selected_margins.append(np.sum(np.multiply(y, self.previous_vote))) + self.tau.append( + np.sum(np.multiply(self.previous_vote, self.new_voter)) / float( + self.n_total_examples)) + + train_metric = self.plotted_metric.score(y, np.sign(self.previous_vote)) + self.train_metrics.append(train_metric) + + if self.mincq_tracking: + self.mincq_train_metrics.append(train_metric) + + def format_X_y(self, X, y): + """Formats the data : X -the examples- and y -the labels- to be used properly by the algorithm """ + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + # Initialization + y_neg = change_label_to_minus(y) + y_neg = y_neg.reshape((y.shape[0], 1)) + return X, y_neg + + def init_hypotheses(self, X, y): + """Inintialization for the hyptotheses used to build the boosted vote""" + if self.estimators_generator is "Stumps": + self.estimators_generator = StumpsClassifiersGenerator( + n_stumps_per_attribute=self.n_stumps, + 
self_complemented=self.self_complemented) + if self.estimators_generator is "Trees": + self.estimators_generator = TreeClassifiersGenerator( + n_trees=self.n_stumps, max_depth=self.max_depth, + self_complemented=self.self_complemented) + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + self.train_shape = self.classification_matrix.shape + + m, n = self.classification_matrix.shape + y_kernel_matrix = np.multiply(y, self.classification_matrix) + + return m, n, y_kernel_matrix + + def init_info_containers(self): + """Initialize the containers that will be collected at each iteration for the analysis""" + self.weights_ = [] + self.chosen_columns_ = [] + self.fobidden_columns = [] + self.c_bounds = [] + self.voter_perfs = [] + self.example_weights_ = [] + self.train_metrics = [] + self.bounds = [] + self.disagreements = [] + self.margins = [] + self.previous_votes = [] + self.previous_margins = [] + self.respected_bound = True + self.selected_margins = [] + self.tau = [] + self.norm = [] + self.mincq_train_metrics = [] + self.mincq_c_bounds = [] + self.mincq_weights = [] + self.mincq_learners = [] + self.mincq_step_decisions = [] + + + def _find_best_weighted_margin(self, y_kernel_matrix, upper_bound=1.0, + lower_bound=0.0): + """Finds the new voter by choosing the one that has the best weighted margin between 0.5 and 0.55 + to avoid too god voters that will get all the votes weights""" + pseudo_h_values = ma.array(np.sum(y_kernel_matrix, axis=0), + fill_value=-np.inf) + pseudo_h_values[self.chosen_columns_] = ma.masked + return np.argmax(pseudo_h_values), [0] + + def _find_new_voter(self, y_kernel_matrix, y): + """Here, we solve the two_voters_mincq_problem for each potential new voter, + and select the one that has the smallest minimum""" + m = y_kernel_matrix.shape[0] + previous_sum = np.multiply(y, + self.previous_vote.reshape(m, 1)) + margin_old = np.sum(previous_sum) + + bad_margins = np.where(np.sum(y_kernel_matrix, axis=0) <= 0.0)[0] + + self.B2 = m + self.B1s = np.sum( + 2 * np.multiply(previous_sum, y_kernel_matrix), + axis=0) + self.B0 = np.sum(previous_sum ** 2) + + self.A2s = np.sum(y_kernel_matrix, axis=0) ** 2 + self.A1s = np.sum(y_kernel_matrix, axis=0) * margin_old * 2 + self.A0 = margin_old ** 2 + + C2s = (self.A1s * self.B2 - self.A2s * self.B1s) + C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0) + C0s = self.A0 * self.B1s - self.A1s * self.B0 + + sols = np.zeros(C0s.shape) - 3 + sols[np.where(C2s != 0)[0]] = (-C1s[np.where(C2s != 0)[0]] + np.sqrt( + C1s[np.where(C2s != 0)[0]] * C1s[np.where(C2s != 0)[0]] - 4 * C2s[ + np.where(C2s != 0)[0]] * C0s[np.where(C2s != 0)[0]])) / ( + 2 * C2s[ + np.where(C2s != 0)[0]]) + + masked_c_bounds = self.make_masked_c_bounds(sols, bad_margins) + if masked_c_bounds.mask.all(): + return "No more pertinent voters", 0 + else: + best_hyp_index = np.argmin(masked_c_bounds) + + self.c_bounds.append(masked_c_bounds[best_hyp_index]) + self.margins.append(math.sqrt(self.A2s[best_hyp_index] / m)) + self.disagreements.append(0.5 * self.B1s[best_hyp_index] / m) + return sols[best_hyp_index], best_hyp_index + + def make_masked_c_bounds(self, sols, bad_margins): + c_bounds = self.compute_c_bounds(sols) + trans_c_bounds = self.compute_c_bounds(sols + 1) + masked_c_bounds = ma.array(c_bounds, fill_value=np.inf) + # Masing Maximums + masked_c_bounds[c_bounds >= trans_c_bounds] = ma.masked + # Masking magrins <= 0 + masked_c_bounds[bad_margins] = ma.masked + # Masking weights < 0 (because 
self-complemented) + masked_c_bounds[sols < 0] = ma.masked + # Masking nan c_bounds + masked_c_bounds[np.isnan(c_bounds)] = ma.masked + if not self.twice_the_same: + masked_c_bounds[self.chosen_columns_] = ma.masked + return masked_c_bounds + + def compute_c_bounds(self, sols): + return 1 - (self.A2s * sols ** 2 + self.A1s * sols + self.A0) / (( + self.B2 * sols ** 2 + self.B1s * sols + self.B0) * self.n_total_examples) + + def _cbound(self, sol): + """Computing the objective function""" + return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (( + self.B2 * sol ** 2 + self.B1 * sol + self.B0) * self.n_total_examples) + + def disagreement(self, sol): + return ( + self.B2 * sol ** 2 + self.B1 * sol + self.B0) / self.n_total_examples + + def margin(self, sol): + return ( + self.A2 * sol ** 2 + self.A1 * sol + self.A0) / self.n_total_examples + + def _best_sol(self, sols): + """Return the best min in the two possible sols""" + values = np.array([self._cbound(sol) for sol in sols]) + return sols[np.argmin(values)] + + def get_step_decision_test_graph(self, directory, y_test): + np.savetxt(directory + "y_test_step.csv", self.step_decisions, + delimiter=',') + step_metrics = [] + for step_index in range(self.step_decisions.shape[1] - 1): + step_metrics.append(self.plotted_metric.score(y_test, + self.step_decisions[:, + step_index])) + step_metrics = np.array(step_metrics) + np.savetxt(directory + "step_test_metrics.csv", step_metrics, + delimiter=',') + get_accuracy_graph(step_metrics, self.__class__.__name__, + directory + 'step_test_metrics.png', + self.plotted_metric, set="test") + + if self.mincq_tracking: + step_mincq_test_metrics = [] + for step_index in range(self.step_decisions.shape[1] - 1): + step_mincq_test_metrics.append(self.plotted_metric.score(y_test, + self.mincq_step_decisions[ + :, + step_index])) + np.savetxt(directory + "mincq_step_test_metrics.csv", + step_mincq_test_metrics, + delimiter=',') + get_accuracy_graph(step_metrics, self.__class__.__name__, + directory + 'step_test_metrics_comparaison.png', + self.plotted_metric, step_mincq_test_metrics, + "MinCQ metric", set="test") + + step_cbounds = [] + for step_index in range(self.step_prod.shape[1]): + num = np.sum(y_test * self.step_prod[:, step_index]) ** 2 + den = np.sum((self.step_prod[:, step_index]) ** 2) + step_cbounds.append(1 - num / (den * self.step_prod.shape[0])) + step_cbounds = np.array(step_cbounds) + np.savetxt(directory + "step_test_c_bounds.csv", step_cbounds, + delimiter=',') + get_accuracy_graph(step_cbounds, self.__class__.__name__, + directory + 'step_test_c_bounds.png', + "C_bound", set="test") + + def getInterpretCBBoost(self, directory, y_test=None): + self.directory = directory + """Used to interpret the functionning of the algorithm""" + if self.step_decisions is not None: + self.get_step_decision_test_graph(directory, y_test) + # get_accuracy_graph(self.voter_perfs[:20], self.__class__.__name__, + # directory + 'voter_perfs.png', "Rs") + get_accuracy_graph(self.weights_, self.__class__.__name__, + directory + 'vote_weights.png', "weights", + zero_to_one=False) + get_accuracy_graph(self.c_bounds, self.__class__.__name__, + directory + 'c_bounds.png', "C-Bounds") + if self.mincq_tracking: + get_accuracy_graph(self.c_bounds, self.__class__.__name__, + directory + 'c_bounds_comparaison.png', + "1-var mins", self.mincq_c_bounds, "MinCQ min", + zero_to_one=False) + get_accuracy_graph(self.train_metrics, self.__class__.__name__, + directory + 'train_metrics_comparaison.png', + self.plotted_metric, + 
self.mincq_train_metrics, "MinCQ metrics") + get_accuracy_graph(self.previous_margins, self.__class__.__name__, + directory + 'margins.png', "Margins", + zero_to_one=False) + get_accuracy_graph(self.selected_margins, self.__class__.__name__, + directory + 'selected_margins.png', + "Selected Margins") + self.tau[0] = 0 + get_accuracy_graph(self.tau, self.__class__.__name__, + directory + 'disagreements.png', "disagreements", + zero_to_one=False) + get_accuracy_graph(self.train_metrics[:-1], self.__class__.__name__, + directory + 'c_bounds_train_metrics.png', + self.plotted_metric, self.c_bounds, "C-Bound", + self.bounds[:-1]) + get_accuracy_graph(self.norm, self.__class__.__name__, + directory + 'norms.png', + "squared 2-norm", zero_to_one=False) + interpretString = getInterpretBase(self, directory, + self.__class__.__name__, + self.weights_, self.break_cause) + if self.save_train_data: + np.savetxt(directory + "x_train.csv", self.X_train, delimiter=',') + np.savetxt(directory + "y_train.csv", self.y_train, delimiter=',') + np.savetxt(directory + "raw_weights.csv", self.raw_weights, + delimiter=',') + np.savetxt(directory + "c_bounds.csv", self.c_bounds, delimiter=',') + np.savetxt(directory + "train_metrics.csv", self.train_metrics, + delimiter=',') + np.savetxt(directory + "margins.csv", self.previous_margins, + delimiter=',') + np.savetxt(directory + "disagreements.csv", self.tau, + delimiter=',') + np.savetxt(directory + "disagreements.csv", self.norm, + delimiter=',') + if self.mincq_tracking: + np.savetxt(directory + "mincq_cbounds.csv", self.mincq_c_bounds, + delimiter=',') + np.savetxt(directory + "mincq_train_metrics.csv", + self.mincq_train_metrics, + delimiter=',') + args_dict = dict( + (arg_name, str(self.__dict__[arg_name])) for arg_name in + self.printed_args_name_list) + interpretString += "\n \n With arguments : \n" + u'\u2022 ' + ( + "\n" + u'\u2022 ').join(['%s: \t%s' % (key, value) + for (key, value) in + args_dict.items()]) + if not self.respected_bound: + interpretString += "\n\n The bound was not respected" + + return interpretString diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py index 82522b306b2ab41e111496655000f16a7165aa14..3916a4b2b8c17483acbd08f747d9d45be6fbb0e4 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py @@ -22,7 +22,7 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): c_bound_choice=True, random_start=True, n_stumps=1, use_r=True, c_bound_sol=True, plotted_metric=Metrics.zero_one_loss, save_train_data=True, - test_graph=True, mincq_tracking=True): + test_graph=True, mincq_tracking=False): super(ColumnGenerationClassifierQar, self).__init__() r""" @@ -74,6 +74,15 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): self.mincq_tracking = mincq_tracking def fit(self, X, y): + ones = [] + tows = [] + threes = [] + fours = [] + fives = [] + sixes = [] + sevens = [] + eights = [] + formatted_X, formatted_y = self.format_X_y(X, y) @@ -95,29 +104,24 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): # Print dynamically the step and the error of the current classifier self.it = k - print( - "Resp. 
bound : {}, {}; {}/{}, eps :{}".format( - self.respected_bound, - self.bounds[-1] > self.train_metrics[-1], - k + 2, - self.n_max_iterations, - self.voter_perfs[-1]), - end="\r") + # print( + # "Resp. bound : {}, {}; {}/{}, eps :{}, ".format( + # self.respected_bound, + # self.bounds[-1] > self.train_metrics[-1], + # k + 2, + # self.n_max_iterations, + # self.voter_perfs[-1], + # ), + # end="\r") sol, new_voter_index = self.choose_new_voter(y_kernel_matrix, - formatted_y) - + formatted_y) if type(sol) == str: - self.break_cause = new_voter_index # + self.break_cause = sol # break - self.append_new_voter(new_voter_index) - voter_perf = self.compute_voter_perf(formatted_y) - self.compute_voter_weight(voter_perf, sol) - self.update_example_weights(formatted_y) - self.update_info_containers(formatted_y, voter_perf, k) self.nb_opposed_voters = self.check_opposed_voters() @@ -128,8 +132,11 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): self.raw_weights = self.weights_ self.y_train = formatted_y + # print(self.classification_matrix) + # print(self.weights_, self.break_cause) self.weights_ = np.array(self.weights_) - self.weights_ /= np.sum(self.weights_) + if np.sum(self.weights_) != 1: + self.weights_ /= np.sum(self.weights_) formatted_y[formatted_y == -1] = 0 formatted_y = formatted_y.reshape((m,)) @@ -390,7 +397,8 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): ones_matrix = np.zeros(y.shape) ones_matrix[np.multiply(y, self.new_voter.reshape( y.shape)) < 0] = 1 # can np.divide if needed - epsilon = np.average(ones_matrix, weights=self.example_weights, axis=0) + epsilon = np.average(np.multiply(y, self.new_voter.reshape( + y.shape)), axis=0) return epsilon def _compute_r(self, y): @@ -444,11 +452,26 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): self.A2s = np.sum(weighted_hypothesis, axis=0) ** 2 self.A1s = np.sum(weighted_hypothesis, axis=0) * margin_old * 2 self.A0 = margin_old ** 2 + import matplotlib.pyplot as plt + # plt.plot(self.A2s * 0.5 * self.B1s / m**3) + # plt.plot(np.array([margin_old/m for _ in range(len(self.A2s))])) + # plt.savefig("try.png") + + # print("C2 < 0 :", np.where(np.array([margin_old/m for _ in range(len(self.A2s))]) < np.sqrt(self.A2s) * 0.5 * self.B1s / m**2)[0]) + # print("C1 < 0 :", np.where(np.array([margin_old ** 2 / m for _ in range( + # len(self.A2s))]) < self.A2s * self.B0 / m ** 2)[0]) + # print("Double root:", np.where((0.5 * self.B1s / m)**2 * m > self.B0)[0]) + C2s = (self.A1s * self.B2 - self.A2s * self.B1s) + # print("Wrong C2 :" , np.where(C2s < 0)[0].shape, bad_margins.shape) C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0) + # print("Wrong C2 :", np.where(C1s < 0)[0].shape, bad_margins.shape) C0s = self.A0 * self.B1s - self.A1s * self.B0 + # print(np.where(C2s==0)) + # print(self.chosen_columns_) + sols = np.zeros(C0s.shape) - 3 # sols[np.where(C2s == 0)[0]] = C0s[np.where(C2s == 0)[0]] / C1s[np.where(C2s == 0)[0]] sols[np.where(C2s != 0)[0]] = (-C1s[np.where(C2s != 0)[0]] + np.sqrt( @@ -462,7 +485,6 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): return "No more pertinent voters", 0 else: best_hyp_index = np.argmin(masked_c_bounds) - self.c_bounds.append(masked_c_bounds[best_hyp_index]) self.margins.append(math.sqrt(self.A2s[best_hyp_index] / m)) self.disagreements.append(0.5 * self.B1s[best_hyp_index] / m) diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py 
b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py index c1dfedad46ccda63a5044f5b258bc68c48e9837c..043e095cdc448d1c312ee0dfc234aea5276ba502 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py @@ -7,7 +7,7 @@ Related papers: """ from __future__ import print_function, division, absolute_import - +import time from operator import xor import numpy as np @@ -97,6 +97,8 @@ class MinCqClassifier(VotingClassifier): enumerate(self.estimators_generator.estimators_)] super().fit(X, y) + beg = time.time() + # Preparation and resolution of the quadratic program # logger.info("Preparing and solving QP...") self.weights = self._solve(X, y) @@ -112,6 +114,8 @@ class MinCqClassifier(VotingClassifier): np.sum(np.average( self._binary_classification_matrix(X), axis=1, weights=self.weights) ** 2)) + end = time.time() + self.train_time = end-beg return self def _binary_classification_matrix(self, X): diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx index 8e50ea220339c5d49d4327202517d51eea707a86..f6deb43ea8c5446085180908985397bfc8e325d6 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/_custom_criterion.pyx @@ -1,3 +1,11 @@ +from sklearn.tree._criterion import ClassificationCriterion + +class Cbound(ClassificationCriterion): + def node_impurity(self): + pass + + + # # cython: cdivision=True # # cython: boundscheck=False # # cython: wraparound=False @@ -14,6 +22,22 @@ # # Jacob Schreiber <jmschreiber91@gmail.com> # # Nelson Liu <nelson@nelsonliu.me> # # +# # License: BSD 3 clause# cython: cdivision=True +# # cython: boundscheck=False +# # cython: wraparound=False +# +# # Authors: Gilles Louppe <g.louppe@gmail.com> +# # Peter Prettenhofer <peter.prettenhofer@gmail.com> +# # Brian Holt <bdholt1@gmail.com> +# # Noel Dawe <noel@dawe.me> +# # Satrajit Gosh <satrajit.ghosh@gmail.com> +# # Lars Buitinck +# # Arnaud Joly <arnaud.v.joly@gmail.com> +# # Joel Nothman <joel.nothman@gmail.com> +# # Fares Hedayati <fares.hedayati@gmail.com> +# # Jacob Schreiber <jmschreiber91@gmail.com> +# # Nelson Liu <nelson@nelsonliu.me> +# # # # License: BSD 3 clause # # calloc @@ -76,7 +100,602 @@ # cdef # # -# class CustomCriterion: +# class CustomCriterion(Criterion): +# """Interface for impurity criteria. +# This object stores methods on how to calculate how good a split is using +# different metrics. +# """ +# +# def __dealloc__(self): +# """Destructor.""" +# +# free(self.sum_total) +# free(self.sum_left) +# free(self.sum_right) +# +# def __getstate__(self): +# return {} +# +# def __setstate__(self, d): +# pass +# +# cdef +# int +# init(self, DOUBLE_t * y, SIZE_t +# y_stride, DOUBLE_t * sample_weight, +# double +# weighted_n_samples, SIZE_t * samples, SIZE_t +# start, +# SIZE_t +# end) nogil except -1: +# """Placeholder for a method which will initialize the criterion. +# Returns -1 in case of failure to allocate memory (and raise MemoryError) +# or 0 otherwise. 
+# Parameters +# ---------- +# y : array-like, dtype=DOUBLE_t +# y is a buffer that can store values for n_outputs target variables +# y_stride : SIZE_t +# y_stride is used to index the kth output value as follows: +# y[i, k] = y[i * y_stride + k] +# sample_weight : array-like, dtype=DOUBLE_t +# The weight of each sample +# weighted_n_samples : DOUBLE_t +# The total weight of the samples being considered +# samples : array-like, dtype=DOUBLE_t +# Indices of the samples in X and y, where samples[start:end] +# correspond to the samples in this node +# start : SIZE_t +# The first sample to be used on this node +# end : SIZE_t +# The last sample used on this node +# """ +# +# pass +# # +# # cdef int reset(self) nogil except -1: +# # """Reset the criterion at pos=start. +# # This method must be implemented by the subclass. +# # """ +# # +# # pass +# # +# # cdef int reverse_reset(self) nogil except -1: +# # """Reset the criterion at pos=end. +# # This method must be implemented by the subclass. +# # """ +# # pass +# # +# # cdef int update(self, SIZE_t new_pos) nogil except -1: +# # """Updated statistics by moving samples[pos:new_pos] to the left child. +# # This updates the collected statistics by moving samples[pos:new_pos] +# # from the right child to the left child. It must be implemented by +# # the subclass. +# # Parameters +# # ---------- +# # new_pos : SIZE_t +# # New starting index position of the samples in the right child +# # """ +# # +# # pass +# # +# # cdef double node_impurity(self) nogil: +# # """Placeholder for calculating the impurity of the node. +# # Placeholder for a method which will evaluate the impurity of +# # the current node, i.e. the impurity of samples[start:end]. This is the +# # primary function of the criterion class. +# # """ +# # +# # pass +# # +# # cdef void children_impurity(self, double* impurity_left, +# # double* impurity_right) nogil: +# # """Placeholder for calculating the impurity of children. +# # Placeholder for a method which evaluates the impurity in +# # children nodes, i.e. the impurity of samples[start:pos] + the impurity +# # of samples[pos:end]. +# # Parameters +# # ---------- +# # impurity_left : double pointer +# # The memory address where the impurity of the left child should be +# # stored. +# # impurity_right : double pointer +# # The memory address where the impurity of the right child should be +# # stored +# # """ +# # +# # pass +# # +# # cdef void node_value(self, double* dest) nogil: +# # """Placeholder for storing the node value. +# # Placeholder for a method which will compute the node value +# # of samples[start:end] and save the value into dest. +# # Parameters +# # ---------- +# # dest : double pointer +# # The memory address where the node value should be stored. +# # """ +# # +# # pass +# # +# # cdef double proxy_impurity_improvement(self) nogil: +# # """Compute a proxy of the impurity reduction +# # This method is used to speed up the search for the best split. +# # It is a proxy quantity such that the split that maximizes this value +# # also maximizes the impurity improvement. It neglects all constant terms +# # of the impurity decrease for a given split. +# # The absolute impurity improvement is only computed by the +# # impurity_improvement method once the best split has been found. 
+# # """ +# # cdef double impurity_left +# # cdef double impurity_right +# # self.children_impurity(&impurity_left, &impurity_right) +# # +# # return (- self.weighted_n_right * impurity_right +# # - self.weighted_n_left * impurity_left) +# # +# # cdef double impurity_improvement(self, double impurity) nogil: +# # """Compute the improvement in impurity +# # This method computes the improvement in impurity when a split occurs. +# # The weighted impurity improvement equation is the following: +# # N_t / N * (impurity - N_t_R / N_t * right_impurity +# # - N_t_L / N_t * left_impurity) +# # where N is the total number of samples, N_t is the number of samples +# # at the current node, N_t_L is the number of samples in the left child, +# # and N_t_R is the number of samples in the right child, +# # Parameters +# # ---------- +# # impurity : double +# # The initial impurity of the node before the split +# # Return +# # ------ +# # double : improvement in impurity after the split occurs +# # """ +# # +# # cdef double impurity_left +# # cdef double impurity_right +# # +# # self.children_impurity(&impurity_left, &impurity_right) +# # +# # return ((self.weighted_n_node_samples / self.weighted_n_samples) * +# # (impurity - (self.weighted_n_right / +# # self.weighted_n_node_samples * impurity_right) +# # - (self.weighted_n_left / +# # self.weighted_n_node_samples * impurity_left))) +# # +# # +# # cdef class CustomClassificationCriterion(Criterion): +# # """Abstract criterion for classification.""" +# # +# # def __cinit__(self, SIZE_t n_outputs, +# # np.ndarray[SIZE_t, ndim=1] n_classes): +# # """Initialize attributes for this criterion. +# # Parameters +# # ---------- +# # n_outputs : SIZE_t +# # The number of targets, the dimensionality of the prediction +# # n_classes : numpy.ndarray, dtype=SIZE_t +# # The number of unique classes in each target +# # """ +# # +# # self.y = NULL +# # self.y_stride = 0 +# # self.sample_weight = NULL +# # +# # self.samples = NULL +# # self.start = 0 +# # self.pos = 0 +# # self.end = 0 +# # +# # self.n_outputs = n_outputs +# # self.n_samples = 0 +# # self.n_node_samples = 0 +# # self.weighted_n_node_samples = 0.0 +# # self.weighted_n_left = 0.0 +# # self.weighted_n_right = 0.0 +# # +# # # Count labels for each output +# # self.sum_total = NULL +# # self.sum_left = NULL +# # self.sum_right = NULL +# # self.n_classes = NULL +# # +# # safe_realloc(&self.n_classes, n_outputs) +# # +# # cdef SIZE_t k = 0 +# # cdef SIZE_t sum_stride = 0 +# # +# # # For each target, set the number of unique classes in that target, +# # # and also compute the maximal stride of all targets +# # for k in range(n_outputs): +# # self.n_classes[k] = n_classes[k] +# # +# # if n_classes[k] > sum_stride: +# # sum_stride = n_classes[k] +# # +# # self.sum_stride = sum_stride +# # +# # cdef SIZE_t n_elements = n_outputs * sum_stride +# # self.sum_total = <double*> calloc(n_elements, sizeof(double)) +# # self.sum_left = <double*> calloc(n_elements, sizeof(double)) +# # self.sum_right = <double*> calloc(n_elements, sizeof(double)) +# # +# # if (self.sum_total == NULL or +# # self.sum_left == NULL or +# # self.sum_right == NULL): +# # raise MemoryError() +# # +# # def __dealloc__(self): +# # """Destructor.""" +# # free(self.n_classes) +# # +# # def __reduce__(self): +# # return (type(self), +# # (self.n_outputs, +# # sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)), +# # self.__getstate__()) +# # +# # cdef int init(self, DOUBLE_t* y, SIZE_t y_stride, +# # DOUBLE_t* sample_weight, double weighted_n_samples, 
+# # SIZE_t* samples, SIZE_t start, SIZE_t end) nogil except -1: +# # """Initialize the criterion at node samples[start:end] and +# # children samples[start:start] and samples[start:end]. +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. +# # Parameters +# # ---------- +# # y : array-like, dtype=DOUBLE_t +# # The target stored as a buffer for memory efficiency +# # y_stride : SIZE_t +# # The stride between elements in the buffer, important if there +# # are multiple targets (multi-output) +# # sample_weight : array-like, dtype=DTYPE_t +# # The weight of each sample +# # weighted_n_samples : SIZE_t +# # The total weight of all samples +# # samples : array-like, dtype=SIZE_t +# # A mask on the samples, showing which ones we want to use +# # start : SIZE_t +# # The first sample to use in the mask +# # end : SIZE_t +# # The last sample to use in the mask +# # """ +# # +# # self.y = y +# # self.y_stride = y_stride +# # self.sample_weight = sample_weight +# # self.samples = samples +# # self.start = start +# # self.end = end +# # self.n_node_samples = end - start +# # self.weighted_n_samples = weighted_n_samples +# # self.weighted_n_node_samples = 0.0 +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef double* sum_total = self.sum_total +# # +# # cdef SIZE_t i +# # cdef SIZE_t p +# # cdef SIZE_t k +# # cdef SIZE_t c +# # cdef DOUBLE_t w = 1.0 +# # cdef SIZE_t offset = 0 +# # +# # for k in range(self.n_outputs): +# # memset(sum_total + offset, 0, n_classes[k] * sizeof(double)) +# # offset += self.sum_stride +# # +# # for p in range(start, end): +# # i = samples[p] +# # +# # # w is originally set to be 1.0, meaning that if no sample weights +# # # are given, the default weight of each sample is 1.0 +# # if sample_weight != NULL: +# # w = sample_weight[i] +# # +# # # Count weighted class frequency for each target +# # for k in range(self.n_outputs): +# # c = <SIZE_t> y[i * y_stride + k] +# # sum_total[k * self.sum_stride + c] += w +# # +# # self.weighted_n_node_samples += w +# # +# # # Reset to pos=start +# # self.reset() +# # return 0 +# # +# # cdef int reset(self) nogil except -1: +# # """Reset the criterion at pos=start +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. +# # """ +# # self.pos = self.start +# # +# # self.weighted_n_left = 0.0 +# # self.weighted_n_right = self.weighted_n_node_samples +# # +# # cdef double* sum_total = self.sum_total +# # cdef double* sum_left = self.sum_left +# # cdef double* sum_right = self.sum_right +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t k +# # +# # for k in range(self.n_outputs): +# # memset(sum_left, 0, n_classes[k] * sizeof(double)) +# # memcpy(sum_right, sum_total, n_classes[k] * sizeof(double)) +# # +# # sum_total += self.sum_stride +# # sum_left += self.sum_stride +# # sum_right += self.sum_stride +# # return 0 +# # +# # cdef int reverse_reset(self) nogil except -1: +# # """Reset the criterion at pos=end +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. 
+# # """ +# # self.pos = self.end +# # +# # self.weighted_n_left = self.weighted_n_node_samples +# # self.weighted_n_right = 0.0 +# # +# # cdef double* sum_total = self.sum_total +# # cdef double* sum_left = self.sum_left +# # cdef double* sum_right = self.sum_right +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t k +# # +# # for k in range(self.n_outputs): +# # memset(sum_right, 0, n_classes[k] * sizeof(double)) +# # memcpy(sum_left, sum_total, n_classes[k] * sizeof(double)) +# # +# # sum_total += self.sum_stride +# # sum_left += self.sum_stride +# # sum_right += self.sum_stride +# # return 0 +# # +# # cdef int update(self, SIZE_t new_pos) nogil except -1: +# # """Updated statistics by moving samples[pos:new_pos] to the left child. +# # Returns -1 in case of failure to allocate memory (and raise MemoryError) +# # or 0 otherwise. +# # Parameters +# # ---------- +# # new_pos : SIZE_t +# # The new ending position for which to move samples from the right +# # child to the left child. +# # """ +# # cdef DOUBLE_t* y = self.y +# # cdef SIZE_t pos = self.pos +# # cdef SIZE_t end = self.end +# # +# # cdef double* sum_left = self.sum_left +# # cdef double* sum_right = self.sum_right +# # cdef double* sum_total = self.sum_total +# # +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t* samples = self.samples +# # cdef DOUBLE_t* sample_weight = self.sample_weight +# # +# # cdef SIZE_t i +# # cdef SIZE_t p +# # cdef SIZE_t k +# # cdef SIZE_t c +# # cdef SIZE_t label_index +# # cdef DOUBLE_t w = 1.0 +# # +# # # Update statistics up to new_pos +# # # +# # # Given that +# # # sum_left[x] + sum_right[x] = sum_total[x] +# # # and that sum_total is known, we are going to update +# # # sum_left from the direction that require the least amount +# # # of computations, i.e. from pos to new_pos or from end to new_po. +# # +# # if (new_pos - pos) <= (end - new_pos): +# # for p in range(pos, new_pos): +# # i = samples[p] +# # +# # if sample_weight != NULL: +# # w = sample_weight[i] +# # +# # for k in range(self.n_outputs): +# # label_index = (k * self.sum_stride + +# # <SIZE_t> y[i * self.y_stride + k]) +# # sum_left[label_index] += w +# # +# # self.weighted_n_left += w +# # +# # else: +# # self.reverse_reset() +# # +# # for p in range(end - 1, new_pos - 1, -1): +# # i = samples[p] +# # +# # if sample_weight != NULL: +# # w = sample_weight[i] +# # +# # for k in range(self.n_outputs): +# # label_index = (k * self.sum_stride + +# # <SIZE_t> y[i * self.y_stride + k]) +# # sum_left[label_index] -= w +# # +# # self.weighted_n_left -= w +# # +# # # Update right part statistics +# # self.weighted_n_right = self.weighted_n_node_samples - self.weighted_n_left +# # for k in range(self.n_outputs): +# # for c in range(n_classes[k]): +# # sum_right[c] = sum_total[c] - sum_left[c] +# # +# # sum_right += self.sum_stride +# # sum_left += self.sum_stride +# # sum_total += self.sum_stride +# # +# # self.pos = new_pos +# # return 0 +# # +# # cdef double node_impurity(self) nogil: +# # pass +# # +# # cdef void children_impurity(self, double* impurity_left, +# # double* impurity_right) nogil: +# # pass +# # +# # cdef void node_value(self, double* dest) nogil: +# # """Compute the node value of samples[start:end] and save it into dest. +# # Parameters +# # ---------- +# # dest : double pointer +# # The memory address which we will save the node value into. 
+# # """ +# # +# # cdef double* sum_total = self.sum_total +# # cdef SIZE_t* n_classes = self.n_classes +# # cdef SIZE_t k +# # +# # for k in range(self.n_outputs): +# # memcpy(dest, sum_total, n_classes[k] * sizeof(double)) +# # dest += self.sum_stride +# # sum_total += self.sum_stride +# # +# # cdef class CCriterion(CustomClassificationCriterion): +# # r"""Cross Entropy impurity criterion. +# # This handles cases where the target is a classification taking values +# # 0, 1, ... K-2, K-1. If node m represents a region Rm with Nm observations, +# # then let +# # count_k = 1 / Nm \sum_{x_i in Rm} I(yi = k) +# # be the proportion of class k observations in node m. +# # The cross-entropy is then defined as +# # cross-entropy = -\sum_{k=0}^{K-1} count_k log(count_k) +# # """ +# # +# # cdef double node_impurity(self) nogil: +# # """Evaluate the impurity of the current node, i.e. the impurity of +# # samples[start:end], using the cross-entropy criterion.""" +# # +# # # cdef SIZE_t* n_classes = self.n_classes +# # # cdef double* sum_total = self.sum_total +# # # cdef double entropy = 0.0 +# # # cdef double count_k +# # # cdef SIZE_t k +# # # cdef SIZE_t c +# # # +# # # for k in range(self.n_outputs): +# # # for c in range(n_classes[k]): +# # # count_k = sum_total[c] +# # # if count_k > 0.0: +# # # count_k /= self.weighted_n_node_samples +# # # entropy -= count_k * log(count_k) +# # # +# # # sum_total += self.sum_stride +# # +# # return 1.0 +# # +# # cdef void children_impurity(self, double* impurity_left, +# # double* impurity_right) nogil: +# # """Evaluate the impurity in children nodes +# # i.e. the impurity of the left child (samples[start:pos]) and the +# # impurity the right child (samples[pos:end]). +# # Parameters +# # ---------- +# # impurity_left : double pointer +# # The memory address to save the impurity of the left node +# # impurity_right : double pointer +# # The memory address to save the impurity of the right node +# # """ +# # +# # # cdef SIZE_t* n_classes = self.n_classes +# # # cdef double* sum_left = self.sum_left +# # # cdef double* sum_right = self.sum_right +# # # cdef double entropy_left = 0.0 +# # # cdef double entropy_right = 0.0 +# # # cdef double count_k +# # # cdef SIZE_t k +# # # cdef SIZE_t c +# # # +# # # for k in range(self.n_outputs): +# # # for c in range(n_classes[k]): +# # # count_k = sum_left[c] +# # # if count_k > 0.0: +# # # count_k /= self.weighted_n_left +# # # entropy_left -= count_k * log(count_k) +# # # +# # # count_k = sum_right[c] +# # # if count_k > 0.0: +# # # count_k /= self.weighted_n_right +# # # entropy_right -= count_k * log(count_k) +# # # +# # # sum_left += self.sum_stride +# # # sum_right += self.sum_stride +# # # +# # # impurity_left[0] = entropy_left / self.n_outputs +# # # impurity_right[0] = entropy_right / self.n_outputs +# +# +# calloc +# +# free +# +# memcpy +# +# memset +# +# fabs +# +# malloc +# +# realloc +# # from libc.math cimport log as ln +# +# import numpy as np +# from sklearn.tree import Crit +# +# cimport +# numpy as np +# np.import_array() +# # from sklearn.tree._criterion cimport Criterion, ClassificationCriterion +# +# cdef +# realloc_ptr +# safe_realloc(realloc_ptr * p, size_t +# nelems) nogil except *: +# # sizeof(realloc_ptr[0]) would be more like idiomatic C, but causes Cython +# # 0.20.1 to crash. 
+# cdef +# size_t +# nbytes = nelems * sizeof(p[0][0]) +# if nbytes / sizeof(p[0][0]) != nelems: +# # Overflow in the multiplication +# with gil: +# raise MemoryError("could not allocate (%d * %d) bytes" +# % (nelems, sizeof(p[0][0]))) +# cdef +# realloc_ptr +# tmp = < realloc_ptr > realloc(p[0], nbytes) +# if tmp == NULL: +# with gil: +# raise MemoryError("could not allocate %d bytes" % nbytes) +# p[0] = tmp +# return tmp # for +# +# cdef +# inline +# np.ndarray +# sizet_ptr_to_ndarray(SIZE_t * data, SIZE_t +# size): +# """Return copied data as 1D numpy array of intp's.""" +# cdef +# np.npy_intp +# shape[1] +# shape[0] = < np.npy_intp > size +# return np.PyArray_SimpleNewFromData(1, shape, np.NPY_INTP, data).copy() +# +# cdef +# +# +# class CustomCriterion(Criterion): # """Interface for impurity criteria. # This object stores methods on how to calculate how good a split is using # different metrics. diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py index 76b0e7814bd44fff3021f0b3706a4eb2c0289908..0e6890857c8e9b56801acf2d759f1f9ccb00bb32 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py @@ -56,7 +56,6 @@ def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, labelsString, \ outputFileName = initConstants(args, X, classificationIndices, labelsNames, name, directory) - logging.debug("Done:\t Loading data") logging.debug( @@ -223,11 +222,11 @@ def saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, testFileName = outputFileName + imageName + "-" + str( i) + ".png" if not os.path.isfile(testFileName): - imagesAnalysis[imageName].savefig(testFileName) + imagesAnalysis[imageName].savefig(testFileName, transparent=True) break imagesAnalysis[imageName].savefig( - outputFileName + imageName + '.png') + outputFileName + imageName + '.png', transparent=True) if __name__ == '__main__': diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py index ba1a9088fcfd4aeec3606e70117d4b244a5f9530..086080ee0cb61a7731a533035497a5284b159bdb 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExportResults.py @@ -135,11 +135,11 @@ def showScoreTime(directory, filename, store, resScore, resTime, rangeX, for i in range(1, 20): testFileName = filename + "-" + str(i) + ".png" if not os.path.isfile(directory + testFileName): - plt.savefig(directory + testFileName) + plt.savefig(directory + testFileName, transparent=True) break else: - plt.savefig(file) + plt.savefig(file, transparent=True) else: plt.show() @@ -180,11 +180,11 @@ def showResults(directory, filename, db, feat, score): for i in range(1, 20): testFileName = filename + "-" + str(i) + ".png" if not os.path.isfile(directory + testFileName): - plt.savefig(directory + testFileName) + plt.savefig(directory + testFileName, transparent=True) break else: - plt.savefig(file) + plt.savefig(file, transparent=True) plt.close() @@ -262,11 +262,11 @@ def plot_confusion_matrix(directory, filename, df_confusion, for i in range(1, 20): testFileName = filename + "-" + str(i) + ".png" if not os.path.isfile(directory + testFileName): - plt.savefig(directory + testFileName) + plt.savefig(directory + testFileName, 
transparent=True) break else: - plt.savefig(file) + plt.savefig(file, transparent=True) plt.close() diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py index daa2fff718d2d083adfc89b985b9c1e973356cda..1d6c41290cc0ec639707df8309953712c36e8b7f 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py @@ -190,7 +190,7 @@ class BaseMonoviewClassifier(object): ax.yaxis.set_major_formatter(formatter) plt.bar(x, featureImportancesSorted) plt.title("Importance depending on feature") - fig.savefig(directory + "feature_importances.png") + fig.savefig(directory + "feature_importances.png", transparent=True) plt.close() featuresImportancesDict = dict((featureIndex, featureImportance) for featureIndex, featureImportance in diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py index 1965fa6855ce670e74f6b6a7ce3fcdddc1a864fe..82b53efa4e1b4765ea1e5d463ecc67c910c901ff 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/Adaboost.py @@ -89,7 +89,7 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier): def formatCmdArgs(args): """Used to format kwargs for the parsed args""" kwargsDict = {'n_estimators': args.Ada_n_est, - 'base_estimator': DecisionTreeClassifier(max_depth=1)} + 'base_estimator': [DecisionTreeClassifier(max_depth=1)]} return kwargsDict diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py index 6393b19d998158b7957e56a7290d0afb2e3bad6b..07ae58db2946ba9fb7c7514ef581aff1b9a80932 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py @@ -55,6 +55,7 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier, self.metrics = np.array( [self.plotted_metric.score(change_label_to_zero(pred), y) for pred in self.staged_predict(pregen_X)]) + self.bounds = np.array([np.prod( np.sqrt(1 - 4 * np.square(0.5 - self.estimator_errors_[:i + 1]))) for i in @@ -105,6 +106,8 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier, np.savetxt(directory + "train_metrics.csv", self.metrics, delimiter=',') np.savetxt(directory + "times.csv", np.array([self.train_time, self.pred_time]), delimiter=',') + np.savetxt(directory + "times_iter.csv", + np.array([self.train_time, len(self.estimator_weights_)]), delimiter=',') return interpretString # def pregen_voters(self, X, y=None): @@ -123,7 +126,7 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier, def formatCmdArgs(args): """Used to format kwargs for the parsed args""" kwargsDict = {'n_estimators': args.AdP_n_est, - 'base_estimator': DecisionTreeClassifier(max_depth=1), + 'base_estimator': [DecisionTreeClassifier(max_depth=1)], 'n_stumps': args.AdP_stumps} return kwargsDict diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py index 7d6763e8f752d666828fa9f59b02dd5ef350acaa..de729f2dcc63238bb45cc74daef7f4bd777acc1d 100644 --- 
a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py @@ -24,7 +24,7 @@ class AdaboostPregen10(AdaboostPregen): def formatCmdArgs(args): """Used to format kwargs for the parsed args""" kwargsDict = {'n_estimators': args.AdP_n_est, - 'base_estimator': DecisionTreeClassifier(max_depth=1), + 'base_estimator': [DecisionTreeClassifier(max_depth=1)], } return kwargsDict diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py index 7e4d67377fd43eae8e50cf5a5a5f95599e52fede..fd9401bfa281070ce6c53ba29238f9573b31d744 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py @@ -108,7 +108,7 @@ class AdaboostPregenTree(AdaBoostClassifier, BaseMonoviewClassifier, def formatCmdArgs(args): """Used to format kwargs for the parsed args""" kwargsDict = {'n_estimators': args.AdPT_n_est, - 'base_estimator': DecisionTreeClassifier(max_depth=1), + 'base_estimator': [DecisionTreeClassifier(max_depth=1)], 'n_stumps': args.AdPT_trees, "max_depth": args.AdPT_max_depth} return kwargsDict diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CBBoost.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CBBoost.py new file mode 100644 index 0000000000000000000000000000000000000000..4833778269b71e2c4297d3373c9b58a8f1c7e9cd --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CBBoost.py @@ -0,0 +1,48 @@ +from ..Monoview.Additions.CBBoostUtils import CBBoostClassifier +from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomRandint + + +class CBBoost(CBBoostClassifier, BaseMonoviewClassifier): + + def __init__(self, random_state=None, n_max_iterations=500, n_stumps=1, + **kwargs): + super(CBBoost, self).__init__(n_max_iterations=n_max_iterations, + random_state=random_state, + self_complemented=True, + twice_the_same=True, + random_start=False, + n_stumps=n_stumps, + c_bound_sol=True, + estimators_generator="Stumps", + mincq_tracking=False + ) + self.param_names = ["n_max_iterations", "n_stumps", "random_state"] + self.distribs = [CustomRandint(low=2, high=500), [n_stumps], + [random_state]] + self.classed_params = [] + self.weird_strings = {} + + def canProbas(self): + """Used to know if the classifier can return label probabilities""" + return True + + def getInterpret(self, directory, y_test): + return self.getInterpretCBBoost(directory, y_test) + + def get_name_for_fusion(self): + return "CBB" + + +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {"n_stumps": args.CBB_stumps, + "n_max_iterations": args.CBB_n_iter} + return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py index 89c84deed67d1e0e4cbb1efdb182133161851607..30a42f593b5e161ee299e36771e0c2eb215f7f23 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py +++ 
b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py @@ -13,9 +13,10 @@ class CGDesc(ColumnGenerationClassifierQar, BaseMonoviewClassifier): c_bound_choice=True, random_start=False, n_stumps=n_stumps, - use_r=True, + use_r=False, c_bound_sol=True, - estimators_generator="Stumps" + estimators_generator="Stumps", + mincq_tracking=False, ) self.param_names = ["n_max_iterations", "n_stumps", "random_state"] self.distribs = [CustomRandint(low=2, high=500), [n_stumps], diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py index dcf6c5b642481395855186c3b2d5c260e25859d8..620b3ff76d73b7084a4e115ac79928a4d3a57b21 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py @@ -12,7 +12,7 @@ Related papers: http://graal.ift.ulaval.ca/majorityvote/ """ __author__ = 'Jean-Francis Roy' - +import time import logging from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.metrics.pairwise import rbf_kernel, linear_kernel, \ @@ -142,7 +142,7 @@ class MinCqLearner(BaseEstimator, ClassifierMixin): if self.log: logging.info("Preparing QP...") self._prepare_qp(X, y_reworked) - + beg = time.time() try: if self.log: logging.info("Solving QP...") @@ -163,7 +163,8 @@ class MinCqLearner(BaseEstimator, ClassifierMixin): str(self), str(e))) self.majority_vote = None self.cbound_train = self.majority_vote.cbound_value(X, y_reworked) - + end=time.time() + self.train_time=end-beg return self def predict(self, X, save_data=True): @@ -608,6 +609,7 @@ class MinCQ(MinCqLearner, BaseMonoviewClassifier): y_rework[np.where(y_rework == 0)] = -1 interpret_string += "\n Test c_bound value : " + str( self.majority_vote.cbound_value(self.x_test, y_rework)) + np.savetxt(directory+"times.csv", np.array([self.train_time, 0])) return interpret_string def get_name_for_fusion(self): diff --git a/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py b/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py index a8b973396d3c6e1a9b09a626710b87681d72bcc8..54cd4e8585dccb29b8f0b3cf6fffd33d6a2784d0 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py +++ b/multiview_platform/MonoMultiViewClassifiers/Multiview/ExecMultiview.py @@ -69,11 +69,11 @@ def saveResults(LABELS_DICTIONARY, stringAnalysis, views, classifierModule, testFileName = outputFileName + imageName + "-" + str( i) + ".png" if not os.path.isfile(testFileName): - imagesAnalysis[imageName].savefig(testFileName) + imagesAnalysis[imageName].savefig(testFileName, transparent=True) break imagesAnalysis[imageName].savefig( - outputFileName + imageName + '.png') + outputFileName + imageName + '.png', transparent=True) def ExecMultiview_multicore(directory, coreIndex, name, learningRate, nbFolds, diff --git a/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py b/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py index 34fb2ad53ef8eadceef98f57dac0490c360c3b6c..df79a60e94154361c95bfb8ab09841ca02fe98bc 100644 --- a/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py +++ b/multiview_platform/MonoMultiViewClassifiers/ResultAnalysis.py @@ -5,9 +5,11 @@ import os import time import matplotlib as mpl +from matplotlib.patches import Patch # Import third party modules import matplotlib.pyplot as plt import numpy as np +import pandas as pd # Import own Modules from . 
import Metrics @@ -17,6 +19,49 @@ __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype +def plot_results_noise(directory, noise_results, metric_to_plot, name, width=0.1): + avail_colors = ["tab:blue", "tab:orange", "tab:brown", "tab:gray", + "tab:olive", "tab:red", ] + colors = {} + legend_patches = [] + noise_levels = np.array([noise_level for noise_level, _ in noise_results]) + df = pd.DataFrame(columns=['noise_level', 'classifier_name', 'mean_score', 'score_std'], ) + if len(noise_results)>1: + width = np.min(np.diff(noise_levels)) + for noise_level, noise_result in noise_results: + classifiers_names, meaned_metrics, metric_stds = [], [], [] + for classifier_result in noise_result: + classifier_name = classifier_result[0].split("-")[0] + if classifier_result[1] == metric_to_plot: + classifiers_names.append(classifier_name) + meaned_metrics.append(classifier_result[2]) + metric_stds.append(classifier_result[3]) + if classifier_name not in colors: + try: + colors[classifier_name] = avail_colors.pop(0) + except IndexError: + colors[classifier_name] = "k" + classifiers_names, meaned_metrics, metric_stds = np.array(classifiers_names), np.array(meaned_metrics), np.array(metric_stds) + sorted_indices = np.argsort(-meaned_metrics) + for index in sorted_indices: + row = pd.DataFrame( + {'noise_level':noise_level, 'classifier_name':classifiers_names[index], 'mean_score':meaned_metrics[index], + 'score_std':metric_stds[index]}, index=[0]) + df = pd.concat([df, row]) + plt.bar(noise_level, meaned_metrics[index], yerr=metric_stds[index], width=0.5*width, label=classifiers_names[index], color=colors[classifiers_names[index]]) + for classifier_name, color in colors.items(): + legend_patches.append(Patch(facecolor=color, label=classifier_name)) + plt.legend(handles=legend_patches, loc='lower center', bbox_to_anchor=(0.5, 1.05), ncol=2) + plt.ylabel(metric_to_plot) + plt.title(name) + plt.xticks(noise_levels) + plt.xlabel("Noise level") + plt.savefig(directory+name+"_noise_analysis.png") + plt.close() + df.to_csv(directory+name+"_noise_analysis.csv") + + + def autolabel(rects, ax, set=1, std=None): r"""Used to print the score below the bars. @@ -194,7 +239,7 @@ def sort_by_test_score(train_scores, test_scores, names, train_STDs=None, return sorted_names, sorted_train_scores, sorted_test_scores, sorted_train_STDs, sorted_test_STDs -def plotMetricScores(trainScores, testScores, names, nbResults, metricName, +def plotMetricScores(trainScores, testScores, names, nbResults, metricName, fileName, tag="", train_STDs=None, test_STDs=None): r"""Used to plot and save the score barplot for a specific metric.
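plot_results_noise expects noise_results to be a list of (noise_level, result_rows) pairs, where each row follows the [classifier_name, metric_name, mean_score, score_std] format returned by the publish*MetricsScores helpers further down in this file. A hypothetical call (classifier names, metric name and output directory are illustrative only):

noise_results = [
    (0.0, [["Adaboost-ViewNumber0", "accuracy_score", 0.84, 0.02],
           ["DecisionTree-ViewNumber0", "accuracy_score", 0.79, 0.03]]),
    (0.15, [["Adaboost-ViewNumber0", "accuracy_score", 0.71, 0.05],
            ["DecisionTree-ViewNumber0", "accuracy_score", 0.68, 0.04]]),
]
plot_results_noise("../Results/Plausible/", noise_results,
                   "accuracy_score", "Plausible")

This writes Plausible_noise_analysis.png and Plausible_noise_analysis.csv in the given directory, with one group of bars per noise level.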
@@ -249,7 +294,7 @@ def plotMetricScores(trainScores, testScores, names, nbResults, metricName, plt.tight_layout() except: pass - f.savefig(fileName + '.png') + f.savefig(fileName + '.png', transparent=True) plt.close() import pandas as pd if train_STDs is None: @@ -285,6 +330,7 @@ def publishMetricsGraphs(metricsScores, directory, databaseName, labelsNames): Returns ------- """ + results=[] for metricName, metricScores in metricsScores.items(): logging.debug( "Start:\t Biclass score graph generation for " + metricName) @@ -303,7 +349,8 @@ def publishMetricsGraphs(metricsScores, directory, databaseName, labelsNames): logging.debug( "Done:\t Biclass score graph generation for " + metricName) - + results+=[[classifiersName, metricName, testMean, testSTD] for classifiersName, testMean, testSTD in zip(np.array(metricScores["classifiersNames"]), np.array(metricScores["testScores"]), np.zeros(len(np.array(metricScores["testScores"]))))] + return results def iterCmap(statsIter): r"""Used to generate a colormap that will have a tick for each iteration : the whiter the better. @@ -377,7 +424,7 @@ def publish2Dplot(data, classifiersNames, nbClassifiers, nbExamples, nbCopies, cbar = fig.colorbar(cax, ticks=[-100 * statsIter / 2, 0, statsIter]) cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right']) fig.tight_layout() - fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight") + fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight", transparent=True) plt.close() @@ -405,7 +452,7 @@ def publishErrorsBarPlot(errorOnExamples, nbClassifiers, nbExamples, fileName): plt.bar(x, errorOnExamples) plt.ylim([0, nbClassifiers]) plt.title("Number of classifiers that failed to classify each example") - fig.savefig(fileName + "error_analysis_bar.png") + fig.savefig(fileName + "error_analysis_bar.png", transparent=True) plt.close() @@ -547,7 +594,7 @@ def analyzeBiclass(results, benchmarkArgumentDictionaries, statsIter, metrics): labelsNames = [arguments["LABELS_DICTIONARY"][0], arguments["LABELS_DICTIONARY"][1]] - publishMetricsGraphs(metricsScores, directory, databaseName, + results = publishMetricsGraphs(metricsScores, directory, databaseName, labelsNames) publishExampleErrors(exampleErrors, directory, databaseName, labelsNames) @@ -558,7 +605,7 @@ def analyzeBiclass(results, benchmarkArgumentDictionaries, statsIter, metrics): "exampleErrors": exampleErrors} logging.debug("Done:\t Analzing all biclass resuls") - return biclassResults + return results, biclassResults def genMetricsScoresMulticlass(results, trueLabels, metrics, @@ -612,6 +659,7 @@ def getErrorOnLabelsMulticlass(multiclassResults, multiclassLabels): def publishMulticlassScores(multiclassResults, metrics, statsIter, direcories, databaseName): + results=[] for iterIndex in range(statsIter): directory = direcories[iterIndex] for metric in metrics: @@ -639,6 +687,8 @@ def publishMulticlassScores(multiclassResults, metrics, statsIter, direcories, logging.debug( "Done:\t Multiclass score graph generation for " + metric[0]) + results+=[[classifiersName, metric, testMean, testSTD] for classifiersName, testMean, testSTD in zip(classifiersNames, validationScores, np.zeros(len(validationScores)))] + return results def publishMulticlassExmapleErrors(multiclassResults, directories, @@ -713,12 +763,12 @@ def analyzeMulticlass(results, statsIter, benchmarkArgumentDictionaries, multiclassResults = getErrorOnLabelsMulticlass(multiclassResults, multiclassLabels) - publishMulticlassScores(multiclassResults, metrics, statsIter, 
directories, + results = publishMulticlassScores(multiclassResults, metrics, statsIter, directories, benchmarkArgumentDictionaries[0]["args"].name) publishMulticlassExmapleErrors(multiclassResults, directories, benchmarkArgumentDictionaries[0][ "args"].name) - return multiclassResults + return results, multiclassResults def numpy_mean_and_std(scores_array): @@ -728,6 +778,7 @@ def numpy_mean_and_std(scores_array): def publishIterBiclassMetricsScores(iterResults, directory, labelsDictionary, classifiersDict, dataBaseName, statsIter, minSize=10): + results=[] for labelsCombination, iterResult in iterResults.items(): currentDirectory = directory + labelsDictionary[ int(labelsCombination[0])] + "-vs-" + labelsDictionary[ @@ -754,6 +805,8 @@ def publishIterBiclassMetricsScores(iterResults, directory, labelsDictionary, metricName=metricName, fileName=fileName, tag=" averaged", train_STDs=trainSTDs, test_STDs=testSTDs) + results+=[[classifiersName, metricName, testMean, testSTD] for classifiersName, testMean, testSTD in zip(names, testMeans, testSTDs)] + return results def gen_error_dat_glob(combiResults, statsIter, base_file_name): @@ -796,6 +849,7 @@ def publishIterBiclassExampleErrors(iterResults, directory, labelsDictionary, def publishIterMulticlassMetricsScores(iterMulticlassResults, classifiersNames, dataBaseName, directory, statsIter, minSize=10): + results = [] for metricName, scores in iterMulticlassResults["metricsScores"].items(): trainMeans, trainSTDs = numpy_mean_and_std(scores["trainScores"]) testMeans, testSTDs = numpy_mean_and_std(scores["testScores"]) @@ -812,6 +866,9 @@ def publishIterMulticlassMetricsScores(iterMulticlassResults, classifiersNames, tag=" averaged multiclass", train_STDs=trainSTDs, test_STDs=testSTDs) + results+=[[classifiersName, metricName,testMean, testSTD] for classifiersName, testMean, testSTD in zip(classifiersNames, testMeans, testSTDs)] + return results + def publishIterMulticlassExampleErrors(iterMulticlassResults, directory, classifiersNames, statsIter, minSize=10): @@ -900,13 +957,13 @@ def analyzebiclassIter(biclassResults, metrics, statsIter, directory, classifiersDict[classifierName], :] += errorOnExample[ "errorOnExamples"] - publishIterBiclassMetricsScores(iterBiclassResults, directory, + results = publishIterBiclassMetricsScores(iterBiclassResults, directory, labelsDictionary, classifiersDict, dataBaseName, statsIter) publishIterBiclassExampleErrors(iterBiclassResults, directory, labelsDictionary, classifiersDict, statsIter) - + return results def analyzeIterMulticlass(multiclassResults, directory, statsIter, metrics, dataBaseName, nbExamples): @@ -942,10 +999,11 @@ def analyzeIterMulticlass(multiclassResults, directory, statsIter, metrics, logging.debug("Start:\t Getting mean results for multiclass classification") classifiersNames = np.array(classifiersNames) - publishIterMulticlassMetricsScores(iterMulticlassResults, classifiersNames, + results = publishIterMulticlassMetricsScores(iterMulticlassResults, classifiersNames, dataBaseName, directory, statsIter) publishIterMulticlassExampleErrors(iterMulticlassResults, directory, classifiersNames, statsIter) + return results def getResults(results, statsIter, nbMulticlass, benchmarkArgumentDictionaries, @@ -954,18 +1012,20 @@ def getResults(results, statsIter, nbMulticlass, benchmarkArgumentDictionaries, nbExamples, nbLabels): """Used to analyze the results of the previous benchmarks""" dataBaseName = benchmarkArgumentDictionaries[0]["args"].name - biclassResults = analyzeBiclass(results, 
benchmarkArgumentDictionaries, + results_means_std, biclassResults = analyzeBiclass(results, benchmarkArgumentDictionaries, statsIter, metrics) + if nbMulticlass > 1: - multiclassResults = analyzeMulticlass(results, statsIter, + results_means_std, multiclassResults = analyzeMulticlass(results, statsIter, benchmarkArgumentDictionaries, nbExamples, nbLabels, multiclassLabels, metrics, classificationIndices, directories) if statsIter > 1: - analyzebiclassIter(biclassResults, metrics, statsIter, directory, + results_means_std = analyzebiclassIter(biclassResults, metrics, statsIter, directory, labelsDictionary, dataBaseName, nbExamples) if nbMulticlass > 1: - analyzeIterMulticlass(multiclassResults, directory, statsIter, + results_means_std = analyzeIterMulticlass(multiclassResults, directory, statsIter, metrics, dataBaseName, nbExamples) + return results_means_std diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py b/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py index 1e24d982a3c70d1fa5578d4b714f06bf9c297137..52a9c21c4adc9a9094d5362fdb60edbbe2c308b1 100644 --- a/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py +++ b/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py @@ -82,7 +82,7 @@ def makeMeNoisy(viewData, randomState, percentage=15): def getPlausibleDBhdf5(features, pathF, name, NB_CLASS=3, LABELS_NAME="", randomState=None, full=True, add_noise=False, noise_std=0.15, nbView=3, - nbClass=2, datasetLength=34, randomStateInt=None): + nbClass=2, datasetLength=1000, randomStateInt=None): """Used to generate a plausible dataset to test the algorithms""" randomStateInt = 42 randomState = np.random.RandomState(randomStateInt) @@ -333,25 +333,26 @@ def filterViews(datasetFile, temp_dataset, views, usedIndices): for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): copyhdf5Dataset(datasetFile, temp_dataset, "View" + str(viewIndex), "View" + str(viewIndex), usedIndices) - for askedViewName in views: - for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): - viewName = datasetFile.get("View" + str(viewIndex)).attrs["name"] - if type(viewName) == bytes: - viewName = viewName.decode("utf-8") - if viewName == askedViewName: - copyhdf5Dataset(datasetFile, temp_dataset, - "View" + str(viewIndex), - "View" + str(newViewIndex), usedIndices) - newViewName = \ - temp_dataset.get("View" + str(newViewIndex)).attrs["name"] - if type(newViewName) == bytes: - temp_dataset.get("View" + str(newViewIndex)).attrs[ - "name"] = newViewName.decode("utf-8") - - newViewIndex += 1 - else: - pass - temp_dataset.get("Metadata").attrs["nbView"] = len(views) + else: + for askedViewName in views: + for viewIndex in range(datasetFile.get("Metadata").attrs["nbView"]): + viewName = datasetFile.get("View" + str(viewIndex)).attrs["name"] + if type(viewName) == bytes: + viewName = viewName.decode("utf-8") + if viewName == askedViewName: + copyhdf5Dataset(datasetFile, temp_dataset, + "View" + str(viewIndex), + "View" + str(newViewIndex), usedIndices) + newViewName = \ + temp_dataset.get("View" + str(newViewIndex)).attrs["name"] + if type(newViewName) == bytes: + temp_dataset.get("View" + str(newViewIndex)).attrs[ + "name"] = newViewName.decode("utf-8") + + newViewIndex += 1 + else: + pass + temp_dataset.get("Metadata").attrs["nbView"] = len(views) def copyhdf5Dataset(sourceDataFile, destinationDataFile, sourceDatasetName, @@ -432,11 +433,9 @@ def add_gaussian_noise(dataset_file, random_state, path_f, dataset_name, 
dataset_file.copy("Labels", noisy_dataset) for view_index in range(dataset_file.get("Metadata").attrs["nbView"]): dataset_file.copy("View" + str(view_index), noisy_dataset) - # dataset_file.close() for view_index in range(noisy_dataset.get("Metadata").attrs["nbView"]): view_name = "View" + str(view_index) view_dset = noisy_dataset.get(view_name) - # orig_shape = view_dset.value.shape view_limits = dataset_file[ "Metadata/View" + str(view_index) + "_limits"].value view_ranges = view_limits[:, 1] - view_limits[:, 0] @@ -448,7 +447,10 @@ def add_gaussian_noise(dataset_file, random_state, path_f, dataset_name, noised_data = np.where(noised_data > view_limits[:, 1], view_limits[:, 1], noised_data) noisy_dataset[view_name][...] = noised_data - # final_shape = noised_data.shape + original_dataset_filename = dataset_file.filename + dataset_file.close() + if "_temp_" in original_dataset_filename: + os.remove(original_dataset_filename) return noisy_dataset, dataset_name + "_noised" diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py b/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py index 84e03d89b5d569506dcfe1f277058807107dba35..08b23063f2e1e58a9b536c371b56dc73698e7353 100644 --- a/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py +++ b/multiview_platform/MonoMultiViewClassifiers/utils/HyperParameterSearch.py @@ -146,7 +146,7 @@ def genHeatMaps(params, scoresArray, outputFileName): plt.yticks(np.arange(len(paramArray2Set)), paramArray2Set, rotation=45) plt.title('Validation metric') plt.savefig( - outputFileName + "heat_map-" + paramName1 + "-" + paramName2 + ".png") + outputFileName + "heat_map-" + paramName1 + "-" + paramName2 + ".png", transparent=True) plt.close() # nohup python ~/dev/git/spearmint/spearmint/main.py . 
& diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py index cce4ef6d1f962c1c3bb869171df54fcded42111a..dc8507bbc4f305c693d4bd213811aa9058927002 100644 --- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py +++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py @@ -22,9 +22,9 @@ def parseTheArgs(arguments): groupStandard = parser.add_argument_group('Standard arguments') groupStandard.add_argument('-log', action='store_true', help='Use option to activate logging to console') - groupStandard.add_argument('--name', metavar='STRING', action='store', + groupStandard.add_argument('--name', metavar='STRING', nargs='+', action='store', help='Name of Database (default: %(default)s)', - default='Plausible') + default=['Plausible']) groupStandard.add_argument('--label', metavar='STRING', action='store', help='Labeling the results directory (default: ' '%(default)s)', @@ -64,10 +64,10 @@ def parseTheArgs(arguments): help='Use option to bebug implemented algorithms') groupStandard.add_argument('-add_noise', action='store_true', help='Use option to add noise to the data') - groupStandard.add_argument('--noise_std', metavar='FLOAT', action='store', + groupStandard.add_argument('--noise_std', metavar='FLOAT', nargs="+", action='store', help='The std of the gaussian noise that will ' 'be added to the data.', - type=float, default=0.15) + type=float, default=[0.0]) groupStandard.add_argument('--res_dir', metavar='STRING', action='store', help='The path to the result directory', default="../Results/") @@ -143,227 +143,227 @@ def parseTheArgs(arguments): groupRF = parser.add_argument_group('Random Forest arguments') groupRF.add_argument('--RF_trees', metavar='INT', type=int, action='store', help='Number max trees',nargs="+", - default=25) + default=[25]) groupRF.add_argument('--RF_max_depth', metavar='INT', type=int, action='store',nargs="+", help='Max depth for the trees', - default=5) + default=[5]) groupRF.add_argument('--RF_criterion', metavar='STRING', action='store', help='Criterion for the trees',nargs="+", - default="entropy") + default=["entropy"]) groupSVMLinear = parser.add_argument_group('Linear SVM arguments') groupSVMLinear.add_argument('--SVML_C', metavar='INT', type=int, action='store', nargs="+", help='Penalty parameter used', - default=1) + default=[1]) groupSVMRBF = parser.add_argument_group('SVW-RBF arguments') groupSVMRBF.add_argument('--SVMRBF_C', metavar='INT', type=int, action='store', nargs="+", help='Penalty parameter used', - default=1) + default=[1]) groupSVMPoly = parser.add_argument_group('Poly SVM arguments') groupSVMPoly.add_argument('--SVMPoly_C', metavar='INT', type=int, action='store', nargs="+", help='Penalty parameter used', - default=1) + default=[1]) groupSVMPoly.add_argument('--SVMPoly_deg', nargs="+", metavar='INT', type=int, action='store', help='Degree parameter used', - default=2) + default=[2]) groupAdaboost = parser.add_argument_group('Adaboost arguments') groupAdaboost.add_argument('--Ada_n_est', metavar='INT', type=int, action='store', nargs="+", help='Number of estimators', - default=2) + default=[2]) groupAdaboost.add_argument('--Ada_b_est', metavar='STRING', action='store', help='Estimators',nargs="+", - default='DecisionTreeClassifier') + default=['DecisionTreeClassifier']) groupAdaboostPregen = parser.add_argument_group('AdaboostPregen arguments') groupAdaboostPregen.add_argument('--AdP_n_est', metavar='INT', type=int, action='store',nargs="+", 
help='Number of estimators', - default=100) + default=[100]) groupAdaboostPregen.add_argument('--AdP_b_est', metavar='STRING', action='store',nargs="+", help='Estimators', - default='DecisionTreeClassifier') + default=['DecisionTreeClassifier']) groupAdaboostPregen.add_argument('--AdP_stumps', metavar='INT', type=int, action='store',nargs="+", help='Number of stumps inthe ' 'pregenerated dataset', - default=1) + default=[1]) groupAdaboostGraalpy = parser.add_argument_group( 'AdaboostGraalpy arguments') groupAdaboostGraalpy.add_argument('--AdG_n_iter', metavar='INT', type=int, action='store',nargs="+", help='Number of estimators', - default=100) + default=[100]) groupAdaboostGraalpy.add_argument('--AdG_stumps', metavar='INT', type=int, action='store',nargs="+", help='Number of stumps inthe ' 'pregenerated dataset', - default=1) + default=[1]) groupDT = parser.add_argument_group('Decision Trees arguments') groupDT.add_argument('--DT_depth', metavar='INT', type=int, action='store', help='Determine max depth for Decision Trees',nargs="+", - default=3) + default=[3]) groupDT.add_argument('--DT_criterion', metavar='STRING', action='store', help='Determine max depth for Decision Trees',nargs="+", - default="entropy") + default=["entropy"]) groupDT.add_argument('--DT_splitter', metavar='STRING', action='store', help='Determine criterion for Decision Trees',nargs="+", - default="random") + default=["random"]) groupDTP = parser.add_argument_group('Decision Trees pregen arguments') groupDTP.add_argument('--DTP_depth', metavar='INT', type=int, action='store',nargs="+", help='Determine max depth for Decision Trees', - default=3) + default=[3]) groupDTP.add_argument('--DTP_criterion', metavar='STRING', action='store', help='Determine max depth for Decision Trees',nargs="+", - default="entropy") + default=["entropy"]) groupDTP.add_argument('--DTP_splitter', metavar='STRING', action='store', help='Determine criterion for Decision Trees',nargs="+", - default="random") + default=["random"]) groupDTP.add_argument('--DTP_stumps', metavar='INT', type=int, action='store',nargs="+", help='Determine the number of stumps for Decision ' 'Trees pregen', - default=1) + default=[1]) groupSGD = parser.add_argument_group('SGD arguments') groupSGD.add_argument('--SGD_alpha', metavar='FLOAT', type=float, action='store',nargs="+", - help='Determine alpha for SGDClassifier', default=0.1) + help='Determine alpha for SGDClassifier', default=[0.1]) groupSGD.add_argument('--SGD_loss', metavar='STRING', action='store', help='Determine loss for SGDClassifier',nargs="+", - default='log') + default=['log']) groupSGD.add_argument('--SGD_penalty', metavar='STRING', action='store', help='Determine penalty for SGDClassifier', nargs="+", - default='l2') + default=['l2']) groupKNN = parser.add_argument_group('KNN arguments') groupKNN.add_argument('--KNN_neigh', metavar='INT', type=int, action='store',nargs="+", help='Determine number of neighbors for KNN', - default=1) + default=[1]) groupKNN.add_argument('--KNN_weights', nargs="+", metavar='STRING', action='store', help='Determine number of neighbors for KNN', - default="distance") + default=["distance"]) groupKNN.add_argument('--KNN_algo', metavar='STRING', action='store', help='Determine number of neighbors for KNN', - default="auto",nargs="+", ) + default=["auto"],nargs="+", ) groupKNN.add_argument('--KNN_p', metavar='INT', nargs="+", type=int, action='store', help='Determine number of neighbors for KNN', - default=1) + default=[1]) groupSCM = parser.add_argument_group('SCM 
arguments') groupSCM.add_argument('--SCM_max_rules', metavar='INT', type=int, action='store', nargs="+", - help='Max number of rules for SCM', default=1) + help='Max number of rules for SCM', default=[1]) groupSCM.add_argument('--SCM_p', metavar='FLOAT', type=float, action='store', nargs="+", - help='Max number of rules for SCM', default=1.0) + help='Max number of rules for SCM', default=[1.0]) groupSCM.add_argument('--SCM_model_type', metavar='STRING', action='store', help='Max number of rules for SCM', nargs="+", - default="conjunction") + default=["conjunction"]) groupSCMPregen = parser.add_argument_group('SCMPregen arguments') groupSCMPregen.add_argument('--SCP_max_rules', metavar='INT', type=int, action='store',nargs="+", - help='Max number of rules for SCM', default=1) + help='Max number of rules for SCM', default=[1]) groupSCMPregen.add_argument('--SCP_p', metavar='FLOAT', type=float, action='store',nargs="+", - help='Max number of rules for SCM', default=1.0) + help='Max number of rules for SCM', default=[1.0]) groupSCMPregen.add_argument('--SCP_model_type', metavar='STRING', action='store',nargs="+", help='Max number of rules for SCM', - default="conjunction") + default=["conjunction"]) groupSCMPregen.add_argument('--SCP_stumps', metavar='INT', type=int, action='store',nargs="+", help='Number of stumps per attribute', - default=1) + default=[1]) groupSCMSparsity = parser.add_argument_group('SCMSparsity arguments') groupSCMSparsity.add_argument('--SCS_max_rules', metavar='INT', type=int, action='store',nargs="+", - help='Max number of rules for SCM', default=1) + help='Max number of rules for SCM', default=[1]) groupSCMSparsity.add_argument('--SCS_stumps', metavar='INT', type=int, action='store',nargs="+", - help='Number of stumps', default=1) + help='Number of stumps', default=[1]) groupSCMSparsity.add_argument('--SCS_p', metavar='FLOAT', type=float, action='store',nargs="+", help='Max number of rules for SCM', - default=1.0) + default=[1.0]) groupSCMSparsity.add_argument('--SCS_model_type', metavar='STRING', action='store',nargs="+", help='Max number of rules for SCM', - default="conjunction") + default=["conjunction"]) groupCQBoost = parser.add_argument_group('CQBoost arguments') groupCQBoost.add_argument('--CQB_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for CQBoost', - default=0.001) + default=[0.001]) groupCQBoost.add_argument('--CQB_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for CQBoost', - default=1e-06) + default=[1e-06]) groupCQBoost.add_argument('--CQB_stumps', metavar='INT', type=int, action='store',nargs="+", help='Set the number of stumps for CQBoost', - default=1) + default=[1]) groupCQBoost.add_argument('--CQB_n_iter', metavar='INT', type=int, action='store',nargs="+", help='Set the maximum number of iteration in ' 'CQBoost', - default=None) + default=[None]) groupCQBoostv2 = parser.add_argument_group('CQBoostv2 arguments') groupCQBoostv2.add_argument('--CQB2_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for CQBoostv2', - default=0.002) + default=[0.002]) groupCQBoostv2.add_argument('--CQB2_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for CQBoostv2', - default=1e-08) + default=[1e-08]) groupCQBoostv21 = parser.add_argument_group('CQBoostv21 arguments') groupCQBoostv21.add_argument('--CQB21_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for 
CQBoostv2', - default=0.001) + default=[0.001]) groupCQBoostv21.add_argument('--CQB21_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for CQBoostv2', - default=1e-08) + default=[1e-08]) groupQarBoost = parser.add_argument_group('QarBoost arguments') groupQarBoost.add_argument('--QarB_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for QarBoost', - default=0.001) + default=[0.001]) groupQarBoost.add_argument('--QarB_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for QarBoost', - default=1e-08) + default=[1e-08]) groupCGreed = parser.add_argument_group('CGreed arguments') groupCGreed.add_argument('--CGR_stumps', metavar='INT', type=int, action='store',nargs="+", help='Set the n_stumps_per_attribute parameter ' 'for CGreed', - default=1) + default=[1]) groupCGreed.add_argument('--CGR_n_iter', metavar='INT', type=int, action='store',nargs="+", help='Set the n_max_iterations parameter for ' 'CGreed', - default=100) + default=[100]) groupCGDesc = parser.add_argument_group('CGDesc arguments') groupCGDesc.add_argument('--CGD_stumps', nargs="+", metavar='INT', type=int, @@ -375,82 +375,94 @@ def parseTheArgs(arguments): action='store', nargs="+", help='Set the n_max_iterations parameter for ' 'CGreed', - default=100) + default=[10]) + + groupCBBoost= parser.add_argument_group('CBBoost arguments') + groupCBBoost.add_argument('--CBB_stumps', nargs="+", metavar='INT', type=int, + action='store', + help='Set the n_stumps_per_attribute parameter ' + 'for CBBoost', + default=[1]) + groupCBBoost.add_argument('--CBB_n_iter', metavar='INT', type=int, + action='store', nargs="+", + help='Set the n_max_iterations parameter for ' + 'CBBoost', + default=[100]) groupCGDescTree = parser.add_argument_group('CGDesc arguments') groupCGDescTree.add_argument('--CGDT_trees', metavar='INT', type=int, action='store', nargs="+", help='Set thenumber of trees for CGreed', - default=100) + default=[100]) groupCGDescTree.add_argument('--CGDT_n_iter', metavar='INT', type=int, action='store', nargs="+", help='Set the n_max_iterations parameter for ' 'CGreed', - default=100) + default=[100]) groupCGDescTree.add_argument('--CGDT_max_depth', metavar='INT', type=int, action='store', nargs="+", help='Set the n_max_iterations parameter for CGreed', - default=2) + default=[2]) groupMinCQGraalpyTree = parser.add_argument_group( 'MinCQGraalpyTree arguments') groupMinCQGraalpyTree.add_argument('--MCGT_mu', metavar='FLOAT', type=float, action='store', nargs="+", help='Set the mu_parameter for MinCQGraalpy', - default=0.05) + default=[0.05]) groupMinCQGraalpyTree.add_argument('--MCGT_trees', metavar='INT', type=int, action='store', nargs="+", help='Set the n trees parameter for MinCQGraalpy', - default=100) + default=[100]) groupMinCQGraalpyTree.add_argument('--MCGT_max_depth', metavar='INT', type=int,nargs="+", action='store', help='Set the n_stumps_per_attribute parameter for MinCQGraalpy', - default=2) + default=[2]) groupCQBoostTree = parser.add_argument_group('CQBoostTree arguments') groupCQBoostTree.add_argument('--CQBT_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for CQBoost', - default=0.001) + default=[0.001]) groupCQBoostTree.add_argument('--CQBT_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for CQBoost', - default=1e-06) + default=[1e-06]) groupCQBoostTree.add_argument('--CQBT_trees', metavar='INT', 
type=int, action='store',nargs="+", help='Set the number of trees for CQBoost', - default=100) + default=[100]) groupCQBoostTree.add_argument('--CQBT_max_depth', metavar='INT', type=int, action='store',nargs="+", help='Set the number of stumps for CQBoost', - default=2) + default=[2]) groupCQBoostTree.add_argument('--CQBT_n_iter', metavar='INT', type=int, action='store',nargs="+", help='Set the maximum number of iteration in CQBoostTree', - default=None) + default=[None]) groupSCMPregenTree = parser.add_argument_group('SCMPregenTree arguments') groupSCMPregenTree.add_argument('--SCPT_max_rules', metavar='INT', type=int, action='store',nargs="+", help='Max number of rules for SCM', - default=1) + default=[1]) groupSCMPregenTree.add_argument('--SCPT_p', metavar='FLOAT', type=float, action='store',nargs="+", help='Max number of rules for SCM', - default=1.0) + default=[1.0]) groupSCMPregenTree.add_argument('--SCPT_model_type', metavar='STRING', action='store',nargs="+", help='Max number of rules for SCM', - default="conjunction") + default=["conjunction"]) groupSCMPregenTree.add_argument('--SCPT_trees', metavar='INT', type=int, action='store',nargs="+", help='Number of stumps per attribute', - default=100) + default=[100]) groupSCMPregenTree.add_argument('--SCPT_max_depth', metavar='INT', type=int, action='store',nargs="+", help='Max_depth of the trees', - default=1) + default=[1]) groupSCMSparsityTree = parser.add_argument_group( 'SCMSparsityTree arguments') @@ -458,24 +470,24 @@ def parseTheArgs(arguments): type=int,nargs="+", action='store', help='Max number of rules for SCM', - default=1) + default=[1]) groupSCMSparsityTree.add_argument('--SCST_p', metavar='FLOAT', type=float, action='store',nargs="+", help='Max number of rules for SCM', - default=1.0) + default=[1.0]) groupSCMSparsityTree.add_argument('--SCST_model_type', metavar='STRING', action='store',nargs="+", help='Max number of rules for SCM', - default="conjunction") + default=["conjunction"]) groupSCMSparsityTree.add_argument('--SCST_trees', metavar='INT', type=int, action='store',nargs="+", help='Number of stumps per attribute', - default=100) + default=[100]) groupSCMSparsityTree.add_argument('--SCST_max_depth', metavar='INT', type=int,nargs="+", action='store', help='Max_depth of the trees', - default=1) + default=[1]) groupAdaboostPregenTree = parser.add_argument_group( 'AdaboostPregenTrees arguments') @@ -483,98 +495,98 @@ def parseTheArgs(arguments): type=int,nargs="+", action='store', help='Number of estimators', - default=100) + default=[100]) groupAdaboostPregenTree.add_argument('--AdPT_b_est', metavar='STRING', action='store',nargs="+", help='Estimators', - default='DecisionTreeClassifier') + default=['DecisionTreeClassifier']) groupAdaboostPregenTree.add_argument('--AdPT_trees', metavar='INT', type=int,nargs="+", action='store', help='Number of trees in the pregenerated dataset', - default=100) + default=[100]) groupAdaboostPregenTree.add_argument('--AdPT_max_depth', metavar='INT', type=int,nargs="+", action='store', help='Number of stumps inthe pregenerated dataset', - default=3) + default=[3]) groupLasso = parser.add_argument_group('Lasso arguments') groupLasso.add_argument('--LA_n_iter', metavar='INT', type=int, action='store',nargs="+", help='Set the max_iter parameter for Lasso', - default=1) + default=[1]) groupLasso.add_argument('--LA_alpha', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the alpha parameter for Lasso', - default=1.0) + default=[1.0]) groupGradientBoosting = 
parser.add_argument_group( 'Gradient Boosting arguments') groupGradientBoosting.add_argument('--GB_n_est', metavar='INT', type=int, action='store',nargs="+", help='Set the n_estimators_parameter for Gradient Boosting', - default=100) + default=[100]) groupMinCQ = parser.add_argument_group('MinCQ arguments') groupMinCQ.add_argument('--MCQ_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu_parameter for MinCQ', - default=0.05) + default=[0.05]) groupMinCQ.add_argument('--MCQ_stumps', metavar='INT', type=int, action='store',nargs="+", help='Set the n_stumps_per_attribute parameter for MinCQ', - default=1) + default=[1]) groupMinCQGraalpy = parser.add_argument_group('MinCQGraalpy arguments') groupMinCQGraalpy.add_argument('--MCG_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu_parameter for MinCQGraalpy', - default=0.05) + default=[0.05]) groupMinCQGraalpy.add_argument('--MCG_stumps', metavar='INT', type=int, action='store',nargs="+", help='Set the n_stumps_per_attribute parameter for MinCQGraalpy', - default=1) + default=[1]) groupQarBoostv3 = parser.add_argument_group('QarBoostv3 arguments') groupQarBoostv3.add_argument('--QarB3_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for QarBoostv3', - default=0.001) + default=[0.001]) groupQarBoostv3.add_argument('--QarB3_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for QarBoostv3', - default=1e-08) + default=[1e-08]) groupQarBoostNC = parser.add_argument_group('QarBoostNC arguments') groupQarBoostNC.add_argument('--QarBNC_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for QarBoostNC', - default=0.001) + default=[0.001]) groupQarBoostNC.add_argument('--QarBNC_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for QarBoostNC', - default=1e-08) + default=[1e-08]) groupQarBoostNC2 = parser.add_argument_group('QarBoostNC2 arguments') groupQarBoostNC2.add_argument('--QarBNC2_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for QarBoostNC2', - default=0.001) + default=[0.001]) groupQarBoostNC2.add_argument('--QarBNC2_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for QarBoostNC2', - default=1e-08) + default=[1e-08]) groupQarBoostNC3 = parser.add_argument_group('QarBoostNC3 arguments') groupQarBoostNC3.add_argument('--QarBNC3_mu', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the mu parameter for QarBoostNC3', - default=0.001) + default=[0.001]) groupQarBoostNC3.add_argument('--QarBNC3_epsilon', metavar='FLOAT', type=float, action='store',nargs="+", help='Set the epsilon parameter for QarBoostNC3', - default=1e-08) + default=[1e-08]) # # Multiview args @@ -765,7 +777,7 @@ def getDatabaseFunction(name, type): return getDatabase -def initLogFile(name, views, CL_type, log, debug, label, result_directory): +def initLogFile(name, views, CL_type, log, debug, label, result_directory, add_noise, noise_std): r"""Used to init the directory where the preds will be stored and the log file. First this function will check if the result directory already exists (only one per minute is allowed). @@ -788,11 +800,12 @@ def initLogFile(name, views, CL_type, log, debug, label, result_directory): resultsDirectory : string Reference to the main results directory for the benchmark. 
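Every classifier option above now uses nargs="+" with a list default, so a single run can receive several candidate values per hyperparameter. This is standard argparse behaviour, shown here with the Random Forest flags from this parser (the command line itself is a made-up example):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--RF_trees', metavar='INT', type=int, nargs="+",
                    action='store', help='Number max trees', default=[25])
parser.add_argument('--RF_max_depth', metavar='INT', type=int, nargs="+",
                    action='store', help='Max depth for the trees', default=[5])

args = parser.parse_args("--RF_trees 25 50 100 --RF_max_depth 3 5".split())
print(args.RF_trees)      # [25, 50, 100]
print(args.RF_max_depth)  # [3, 5]

Turning every scalar default into a one-element list keeps the downstream handling uniform whether one value or several are supplied.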
""" + noise_string = "/n_"+str(int(noise_std*100)) if debug: - resultDirectory = result_directory + name + "/debug_started_" + time.strftime( + resultDirectory = result_directory + name + noise_string +"/debug_started_" + time.strftime( "%Y_%m_%d-%H_%M_%S") + "_" + label + "/" else: - resultDirectory = result_directory + name + "/started_" + time.strftime( + resultDirectory = result_directory + name + noise_string+ "/started_" + time.strftime( "%Y_%m_%d-%H_%M") + "_" + label + "/" logFileName = time.strftime("%Y_%m_%d-%H_%M") + "-" + ''.join( CL_type) + "-" + "_".join( @@ -894,6 +907,7 @@ def initViews(DATASET, argViews): Names of all the available views in the dataset. """ NB_VIEW = DATASET.get("Metadata").attrs["nbView"] + print(NB_VIEW) if argViews != [""]: allowedViews = argViews allViews = [str(DATASET.get("View" + str(viewIndex)).attrs["name"]) @@ -947,6 +961,16 @@ def genDirecortiesNames(directory, statsIter): return directories +def find_dataset_names(path, type, names): + """This function goal is to browse the dataset directory and extarcts all the needed dataset names.""" + available_file_names = [file_name.strip().split(".")[0] for file_name in os.listdir(path) if file_name.endswith(type)] + if names == ["all"]: + return available_file_names + elif len(names)>1: + return [used_name for used_name in available_file_names if used_name in names] + else: + return names + def genArgumentDictionaries(labelsDictionary, directories, multiclassLabels, labelsCombinations, indicesMulticlass, hyperParamSearch, args, kFolds,