diff --git a/config_files/config_test.yml b/config_files/config_test.yml index 0a541ce0560b1e94d76d1a20bc71715526877a2e..7ed807b690443cfbf8cc59ae5e744784d2df5042 100644 --- a/config_files/config_test.yml +++ b/config_files/config_test.yml @@ -1,11 +1,11 @@ # The base configuration of the benchmark Base : log: True - name: ["lives_13view", "koukou"] + name: ["plausible",] label: "_" type: ".hdf5" views: - pathf: "/home/baptiste/Documents/Datasets/Alexis/data/" + pathf: "../data/" nice: 0 random_state: 42 nb_cores: 1 @@ -18,18 +18,18 @@ Base : # All the classification-realted configuration options Classification: multiclass_method: "oneVersusOne" - split: 0.9 + split: 0.8 nb_folds: 2 nb_class: 2 - classes: ["EMF", ] - type: ["multiview", "monoview"] - algos_monoview: ["decision_tree", "adaboost", "random_forest" ] + classes: + type: ["monoview",] + algos_monoview: ["gradient_boosting", ] algos_multiview: ["weighted_linear_early_fusion",] stats_iter: 2 metrics: ["accuracy_score", "f1_score"] metric_princ: "f1_score" hps_type: "randomized_search-equiv" - hps_iter: 5 + hps_iter: 1 ##################################### diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py index f98cc1949b9962f8f80ad32d3693faf65ea19f2c..91edf3ddaf5e9e19f6b144fe386979763dea5e65 100644 --- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py +++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py @@ -18,7 +18,7 @@ from . import multiview_classifiers from .multiview.exec_multiview import exec_multiview, exec_multiview_multicore from .monoview.exec_classif_mono_view import exec_monoview, exec_monoview_multicore from .utils.dataset import delete_HDF5 -from .result_analysis import get_results, plot_results_noise, analyze_biclass +from .result_analysis import get_results, plot_results_noise, analyze_iterations from .utils import execution, dataset, multiclass, configuration matplotlib.use( @@ -479,7 +479,7 @@ def arange_metrics(metrics, metric_princ): def benchmark_init(directory, classification_indices, labels, labels_dictionary, - k_folds): + k_folds, dataset_var): """ Initializes the benchmark, by saving the indices of the train examples and the cross validation folds. 
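# --- Illustrative sketch (annotation, not part of the patch) ----------------
# The hunks below change benchmark_init and the monoview loop to read the
# ground-truth labels through the Dataset wrapper (dataset_var) instead of the
# separately threaded `labels` array. ToyDataset is a hypothetical stand-in for
# utils.dataset.Dataset; only the get_labels(example_indices=...) call shape is
# taken from this diff.
import numpy as np


class ToyDataset(object):
    def __init__(self, labels):
        self._labels = np.asarray(labels)

    def get_labels(self, example_indices=None):
        # Return all labels, or only the requested examples.
        if example_indices is None:
            return self._labels
        return self._labels[example_indices]


dataset_var = ToyDataset([0, 1, 1, 0, 1])
train_indices = [0, 2, 3]
train_labels = dataset_var.get_labels(example_indices=train_indices)  # [0 1 0]
# -----------------------------------------------------------------------------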
@@ -513,7 +513,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary, if exc.errno != errno.EEXIST: raise train_indices = classification_indices[0] - train_labels = labels[train_indices] + train_labels = dataset_var.get_labels(example_indices=train_indices) np.savetxt(os.path.join(directory, "train_labels.csv"), train_labels, delimiter=",") np.savetxt(os.path.join(directory, "train_indices.csv"), classification_indices[0], delimiter=",") @@ -656,14 +656,14 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, flag=None, labels=None,): results_monoview, labels_names = benchmark_init(directory, classification_indices, labels, - labels_dictionary, k_folds) + labels_dictionary, k_folds, dataset_var) logging.getLogger('matplotlib.font_manager').disabled = True logging.debug("Start:\t monoview benchmark") traceback_outputs = {} for arguments in argument_dictionaries["monoview"]: try: X = dataset_var.get_v(arguments["view_index"]) - Y = labels + Y = dataset_var.get_labels() results_monoview += [ exec_monoview(directory, X, Y, args["Base"]["name"], labels_names, classification_indices, k_folds, @@ -702,11 +702,9 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, return [flag, results_monoview + results_multiview, traceback_outputs] -def exec_benchmark(nb_cores, stats_iter, nb_multiclass, - benchmark_arguments_dictionaries, classification_indices, - directories, - directory, multi_class_labels, metrics, labels_dictionary, - nb_labels, dataset_var, +def exec_benchmark(nb_cores, stats_iter, + benchmark_arguments_dictionaries, + directory, metrics, dataset_var, # exec_one_benchmark=exec_one_benchmark, # exec_one_benchmark_multicore=exec_one_benchmark_multicore, exec_one_benchmark_mono_core=exec_one_benchmark_mono_core, @@ -771,26 +769,18 @@ def exec_benchmark(nb_cores, stats_iter, nb_multiclass, # else: for arguments in benchmark_arguments_dictionaries: benchmark_results = exec_one_benchmark_mono_core(dataset_var=dataset_var, **arguments) - analyze_biclass([benchmark_results], benchmark_arguments_dictionaries, stats_iter, metrics, example_ids=dataset_var.example_ids) + analyze_iterations([benchmark_results], benchmark_arguments_dictionaries, stats_iter, metrics, example_ids=dataset_var.example_ids, labels=dataset_var.get_labels()) results += [benchmark_results] logging.debug("Done:\t Executing all the needed biclass benchmarks") # Do everything with flagging - nb_examples = len(classification_indices[0][0]) + len( - classification_indices[0][1]) - multiclass_ground_truth = dataset_var.get_labels() logging.debug("Start:\t Analyzing predictions") - results_mean_stds = get_results(results, stats_iter, nb_multiclass, + results_mean_stds = get_results(results, stats_iter, benchmark_arguments_dictionaries, - multiclass_ground_truth, metrics, - classification_indices, - directories, directory, - labels_dictionary, - nb_examples, - nb_labels, - dataset_var.example_ids) + dataset_var.example_ids, + dataset_var.get_labels()) logging.debug("Done:\t Analyzing predictions") delete(benchmark_arguments_dictionaries, nb_cores, dataset_var) return results_mean_stds @@ -856,8 +846,8 @@ def exec_classif(arguments): splits = execution.gen_splits(dataset_var.get_labels(), args["Classification"]["split"], stats_iter_random_states) - multiclass_labels, labels_combinations, indices_multiclass = multiclass.gen_multiclass_labels( - dataset_var.get_labels(), multiclass_method, splits) + # multiclass_labels, labels_combinations, indices_multiclass 
= multiclass.gen_multiclass_labels( + # dataset_var.get_labels(), multiclass_method, splits) k_folds = execution.gen_k_folds(stats_iter, args["Classification"]["nb_folds"], stats_iter_random_states) @@ -894,18 +884,15 @@ def exec_classif(arguments): # NB_CLASS, initKWARGS) directories = execution.gen_direcorties_names(directory, stats_iter) benchmark_argument_dictionaries = execution.gen_argument_dictionaries( - labels_dictionary, directories, multiclass_labels, - labels_combinations, indices_multiclass, + labels_dictionary, directories, + splits, hyper_param_search, args, k_folds, stats_iter_random_states, metrics, - argument_dictionaries, benchmark, nb_views, - views, views_indices) - nb_multiclass = len(labels_combinations) + argument_dictionaries, benchmark, + views, views_indices,) results_mean_stds = exec_benchmark( - nb_cores, stats_iter, nb_multiclass, - benchmark_argument_dictionaries, splits, directories, - directory, multiclass_labels, metrics, labels_dictionary, - nb_class, dataset_var) + nb_cores, stats_iter, + benchmark_argument_dictionaries, directory, metrics, dataset_var) noise_results.append([noise_std, results_mean_stds]) plot_results_noise(directory, noise_results, metrics[0][0], dataset_name) diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/accuracy_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/accuracy_score.py index b316ba43d19ae86c864798a8a66f2c3a670b3c8c..81c5438799ab816250e35c84958107bd3885ef8e 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/accuracy_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/accuracy_score.py @@ -47,7 +47,7 @@ def get_scorer(**kwargs): sample_weight=sample_weight) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except Exception: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/f1_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/f1_score.py index e930aa577b17f60a69bb1d92060b227cd3343a13..3c2029ece644073933463673d9fcc7ea84380904 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/f1_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/f1_score.py @@ -14,7 +14,7 @@ __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype -def score(y_true, y_pred, multiclass=False, **kwargs): +def score(y_true, y_pred, multiclass=True, **kwargs): try: sample_weight = kwargs["0"] except: @@ -57,13 +57,13 @@ def get_scorer(**kwargs): try: average = kwargs["3"] except: - average = "binary" + average = "micro" return make_scorer(metric, greater_is_better=True, sample_weight=sample_weight, labels=labels, pos_label=pos_label, average=average) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except Exception: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/fbeta_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/fbeta_score.py index 725bc1831073d88b4d5125e03f5c838043f96e89..aae0151739ea050f6c8d5ad5e835976503bdf0b6 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/fbeta_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/fbeta_score.py @@ -10,7 +10,7 @@ __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype -def score(y_true, y_pred, multiclass=False, **kwargs): +def score(y_true, y_pred, multiclass=True, **kwargs): try: sample_weight = kwargs["0"] except Exception: @@ -60,13 +60,13 @@ def 
get_scorer(**kwargs): try: average = kwargs["4"] except Exception: - average = "binary" + average = "micro" return make_scorer(metric, greater_is_better=True, beta=beta, sample_weight=sample_weight, labels=labels, pos_label=pos_label, average=average) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except Exception: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/framework.py b/multiview_platform/mono_multi_view_classifiers/metrics/framework.py index 8bdf418859db8bb06961f9a1c3295e611b460174..6351bac8d319fc599e72e63bb683ea8ebdd6df48 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/framework.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/framework.py @@ -58,7 +58,7 @@ def get_scorer(**kwargs): return scorer -def getConfig(**kwargs): +def get_config(**kwargs): """Get the metric's configuration as a string. Parameters diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/generic_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/generic_score.py index 7535f186a9d4acecc65703f116482d2d0d2d2477..9a004452b1737234ca1a775a150405dbb2e3c0ed 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/generic_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/generic_score.py @@ -293,7 +293,7 @@ def get_scorer(type='f1_score', **kwargs): return scorer -def getConfig(type='f1_score', **kwargs): +def get_config(type='f1_score', **kwargs): _type_names = ['accuracy_score', 'f1_score', 'fbeta_score', 'hamming_loss', 'jaccard_similarity_score', 'precision_score', 'recall_score', 'roc_auc_score', 'zero_one_loss', 'zero_one_loss', 'framework'] diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/hamming_loss.py b/multiview_platform/mono_multi_view_classifiers/metrics/hamming_loss.py index f6fe0df043d384c335e6279679bdcb2654ffca91..4e1e1ef888ebb8375852d7c91ffc98f8640eba8c 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/hamming_loss.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/hamming_loss.py @@ -26,7 +26,7 @@ def get_scorer(**kwargs): return make_scorer(metric, greater_is_better=False, classes=classes) -def getConfig(**kwargs): +def get_config(**kwargs): try: classes = kwargs["0"] except Exception: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/jaccard_similarity_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/jaccard_similarity_score.py index 72eaca5f16f22a99e3d3750798dd9b9845e1d7bf..98018b66aa49376cd3132a7cc5b0d02faa656d89 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/jaccard_similarity_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/jaccard_similarity_score.py @@ -27,7 +27,7 @@ def get_scorer(**kwargs): sample_weight=sample_weight) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except Exception: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/log_loss.py b/multiview_platform/mono_multi_view_classifiers/metrics/log_loss.py index 3cd89924d7af9c1f1fcc9fafb67991be9af7b07a..f3868882a84ed0a79d1a89daa6004136ddd47e9a 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/log_loss.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/log_loss.py @@ -35,7 +35,7 @@ def get_scorer(**kwargs): sample_weight=sample_weight, eps=eps) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except Exception: diff --git 
a/multiview_platform/mono_multi_view_classifiers/metrics/matthews_corrcoef.py b/multiview_platform/mono_multi_view_classifiers/metrics/matthews_corrcoef.py index 6f0bdb01bd7c8796819bf1b77eba81e46400e539..80307efb356f7564ee9e065f8f31e4397c6ae302 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/matthews_corrcoef.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/matthews_corrcoef.py @@ -18,6 +18,6 @@ def get_scorer(**kwargs): return make_scorer(metric, greater_is_better=True) -def getConfig(**kwargs): +def get_config(**kwargs): config_string = "Matthews correlation coefficient (higher is better)" return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/precision_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/precision_score.py index 6afd4ea90a8f03a59ef8303fbcf3d94a4a81012a..11ebc14e7470e38a905ae59bbc204af16b1b8d78 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/precision_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/precision_score.py @@ -55,7 +55,7 @@ def get_scorer(**kwargs): average=average) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/recall_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/recall_score.py index a6dae59f9031168a296c15fd182490bb38cd45ee..4ad1236896c622addfe7cd15961e135813ea0bc6 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/recall_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/recall_score.py @@ -57,7 +57,7 @@ def get_scorer(**kwargs): average=average) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except Exception: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/roc_auc_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/roc_auc_score.py index b1fccd09b89d665635945d72417f5129ca55baaf..35bd0c397dcd57369df2c5614e500c7b8b8def5e 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/roc_auc_score.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/roc_auc_score.py @@ -44,7 +44,7 @@ def get_scorer(**kwargs): sample_weight=sample_weight, average=average) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except: diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/zero_one_loss.py b/multiview_platform/mono_multi_view_classifiers/metrics/zero_one_loss.py index 1b293d79fb90f9c05aa8f68f45230aad6d4f262f..5a4afb4c80df6a635aef6bcfc052338fbefa581c 100644 --- a/multiview_platform/mono_multi_view_classifiers/metrics/zero_one_loss.py +++ b/multiview_platform/mono_multi_view_classifiers/metrics/zero_one_loss.py @@ -28,7 +28,7 @@ def get_scorer(**kwargs): sample_weight=sample_weight) -def getConfig(**kwargs): +def get_config(**kwargs): try: sample_weight = kwargs["0"] except Exception: diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/analyze_result.py b/multiview_platform/mono_multi_view_classifiers/monoview/analyze_result.py index e5d00b67c649c6c609e06daaa93b2f400bec61e4..8a6ba10001d3f63587072c706dc3167effb7f97f 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/analyze_result.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/analyze_result.py @@ -36,13 +36,13 @@ def getDBConfigString(name, feat, classification_indices, shape, def getClassifierConfigString(gridSearch, nbCores, nIter, 
clKWARGS, classifier, output_file_name, y_test): classifierConfigString = "Classifier configuration : \n" - classifierConfigString += "\t- " + classifier.getConfig()[5:] + "\n" + classifierConfigString += "\t- " + classifier.get_config()[5:] + "\n" classifierConfigString += "\t- Executed on " + str(nbCores) + " core(s) \n" if gridSearch: classifierConfigString += "\t- Got configuration using randomized search with " + str( nIter) + " iterations \n" classifierConfigString += "\n\n" - classifierInterpretString = classifier.getInterpret(output_file_name, y_test) + classifierInterpretString = classifier.get_interpret(output_file_name, y_test) return classifierConfigString, classifierInterpretString @@ -55,7 +55,7 @@ def getMetricScore(metric, y_train, y_train_pred, y_test, y_test_pred): metricKWARGS = {} metricScoreTrain = metricModule.score(y_train, y_train_pred) metricScoreTest = metricModule.score(y_test, y_test_pred) - metricScoreString = "\tFor " + metricModule.getConfig( + metricScoreString = "\tFor " + metricModule.get_config( **metricKWARGS) + " : " metricScoreString += "\n\t\t- Score on train : " + str(metricScoreTrain) metricScoreString += "\n\t\t- Score on test : " + str(metricScoreTest) diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py index cb360ad64eeb81bafb3eed86b76bfce1a8ab141b..2d3b436aa3d35d6cc4a2f03240c4a32aef12acf4 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py @@ -18,6 +18,7 @@ from .analyze_result import execute from .. import monoview_classifiers from ..utils.dataset import extract_subset, Dataset from ..utils import hyper_parameter_search +from ..utils.multiclass import get_mc_estim # Author-Info __author__ = "Nikolas Huelsmann, Baptiste BAUVIN" @@ -45,10 +46,10 @@ def exec_monoview_multicore(directory, name, labels_names, classification_indice **args) -def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, - KFolds, nbCores, databaseType, path, - randomState, hyper_param_search="randomized_search", - metrics=[["accuracy_score", None]], n_iter=30, view_name="", **args): +def exec_monoview(directory, X, Y, name, labels_names, classification_indices, + KFolds, nbCores, databaseType, path, + random_state, hyper_param_search="randomized_search", + metrics=[["accuracy_score", None]], n_iter=30, view_name="", **args): logging.debug("Start:\t Loading data") kwargs, \ t_start, \ @@ -57,7 +58,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, X, \ learningRate, \ labelsString, \ - outputFileName = initConstants(args, X, classificationIndices, labels_names, + outputFileName = initConstants(args, X, classification_indices, labels_names, name, directory, view_name) logging.debug("Done:\t Loading data") @@ -69,8 +70,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, + str(nbCores) + ", algorithm : " + classifier_name) logging.debug("Start:\t Determine Train/Test split") - X_train, y_train, X_test, y_test, X_test_multiclass = init_train_test(X, Y, - classificationIndices) + X_train, y_train, X_test, y_test = init_train_test(X, Y, classification_indices) logging.debug("Info:\t Shape X_train:" + str( X_train.shape) + ", Length of y_train:" + str(len(y_train))) @@ -79,17 +79,21 @@ def exec_monoview(directory, X, Y, name, labels_names, 
classificationIndices, logging.debug("Done:\t Determine Train/Test split") logging.debug("Start:\t Generate classifier args") - classifierModule = getattr(monoview_classifiers, classifier_name) - classifier_class_name = classifierModule.classifier_class_name - clKWARGS, testFoldsPreds = getHPs(classifierModule, hyper_param_search, - n_iter, classifier_name, classifier_class_name, - X_train, y_train, - randomState, outputFileName, - KFolds, nbCores, metrics, kwargs) + classifier_module = getattr(monoview_classifiers, classifier_name) + classifier_class_name = classifier_module.classifier_class_name + cl_kwargs, testFoldsPreds = getHPs(classifier_module, hyper_param_search, + n_iter, classifier_name, classifier_class_name, + X_train, y_train, + random_state, outputFileName, + KFolds, nbCores, metrics, kwargs) logging.debug("Done:\t Generate classifier args") logging.debug("Start:\t Training") - classifier = getattr(classifierModule, classifier_class_name)(randomState, **clKWARGS) + + classifier = get_mc_estim(getattr(classifier_module, + classifier_class_name) + (random_state, **cl_kwargs), + random_state) classifier.fit(X_train, y_train) # NB_CORES=nbCores, logging.debug("Done:\t Training") @@ -100,15 +104,11 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, #Filling the full prediction in the right order full_pred = np.zeros(Y.shape, dtype=int) - 100 - for trainIndex, index in enumerate(classificationIndices[0]): + for trainIndex, index in enumerate(classification_indices[0]): full_pred[index] = y_train_pred[trainIndex] - for testIndex, index in enumerate(classificationIndices[1]): + for testIndex, index in enumerate(classification_indices[1]): full_pred[index] = y_test_pred[testIndex] - if X_test_multiclass != []: - y_test_multiclass_pred = classifier.predict(X_test_multiclass) - else: - y_test_multiclass_pred = [] logging.debug("Done:\t Predicting") @@ -119,12 +119,11 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, logging.debug("Start:\t Getting results") stringAnalysis, \ imagesAnalysis, \ - metricsScores = execute(name, classificationIndices, KFolds, nbCores, + metricsScores = execute(name, classification_indices, KFolds, nbCores, hyper_parameter_search, metrics, n_iter, view_name, classifier_name, - clKWARGS, labels_names, X.shape, + cl_kwargs, labels_names, X.shape, y_train, y_train_pred, y_test, y_test_pred, t_end, - randomState, classifier, outputFileName) - # cl_desc = [value for key, value in sorted(clKWARGS.items())] + random_state, classifier, outputFileName) logging.debug("Done:\t Getting results") logging.debug("Start:\t Saving preds") @@ -136,47 +135,41 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, if testFoldsPreds is None: testFoldsPreds = y_train_pred return monoview_utils.MonoviewResult(viewIndex, classifier_name, view_name, metricsScores, - full_pred, clKWARGS, - y_test_multiclass_pred, testFoldsPreds, classifier, X_train.shape[1]) - # return viewIndex, [CL_type, view_name, metricsScores, full_labels_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds] + full_pred, cl_kwargs, + testFoldsPreds, classifier, X_train.shape[1]) -def initConstants(args, X, classificationIndices, labels_names, +def initConstants(args, X, classification_indices, labels_names, name, directory, view_name): try: kwargs = args["args"] except KeyError: kwargs = args t_start = time.time() - CL_type = kwargs["classifier_name"] - learningRate = float(len(classificationIndices[0])) / ( - 
len(classificationIndices[0]) + len(classificationIndices[1])) - labelsString = "-".join(labels_names) - CL_type_string = CL_type - timestr = time.strftime("%Y_%m_%d-%H_%M_%S") - outputFileName = os.path.join(directory, CL_type_string, view_name, timestr + "-results-" + CL_type_string + "-" + labelsString + \ - '-learnRate_{0:.2f}'.format( - learningRate) + '-' + name + "-" + view_name + "-") - if not os.path.exists(os.path.dirname(outputFileName)): + cl_type = kwargs["classifier_name"] + learning_rate = float(len(classification_indices[0])) / ( + len(classification_indices[0]) + len(classification_indices[1])) + labels_string = "-".join(labels_names) + cl_type_string = cl_type + output_file_name = os.path.join(directory, cl_type_string, view_name, + cl_type_string + '-' + name + "-" + + view_name + "-") + if not os.path.exists(os.path.dirname(output_file_name)): try: - os.makedirs(os.path.dirname(outputFileName)) + os.makedirs(os.path.dirname(output_file_name)) except OSError as exc: if exc.errno != errno.EEXIST: raise - return kwargs, t_start, view_name, CL_type, X, learningRate, labelsString, outputFileName + return kwargs, t_start, view_name, cl_type, X, learning_rate, labels_string, output_file_name def init_train_test(X, Y, classificationIndices): - trainIndices, testIndices, testIndicesMulticlass = classificationIndices + trainIndices, testIndices = classificationIndices X_train = extract_subset(X, trainIndices) X_test = extract_subset(X, testIndices) - if np.array(testIndicesMulticlass).size != 0: - X_test_multiclass = extract_subset(X, testIndicesMulticlass) - else: - X_test_multiclass = [] y_train = Y[trainIndices] y_test = Y[testIndices] - return X_train, y_train, X_test, y_test, X_test_multiclass + return X_train, y_train, X_test, y_test def getHPs(classifierModule, hyper_param_search, nIter, classifier_module_name, diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py index f2ce0112d63cd7f1c8d807fd3edd4ab3eeaa65cb..04683dadf177da8dc79b3d8136d5942538b7f278 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py @@ -1,6 +1,7 @@ import pickle import matplotlib.pyplot as plt +from abc import abstractmethod import numpy as np from matplotlib.ticker import FuncFormatter from scipy.stats import uniform, randint @@ -135,7 +136,7 @@ class BaseMonoviewClassifier(BaseEstimator, ):#ClassifierMixin): else: return [()] - def genDistribs(self): + def gen_distribs(self): return dict((param_name, distrib) for param_name, distrib in zip(self.param_names, self.distribs)) @@ -144,7 +145,7 @@ class BaseMonoviewClassifier(BaseEstimator, ):#ClassifierMixin): [param_name + " : " + self.to_str(param_name) for param_name in self.param_names]) - def getConfig(self): + def get_config(self): if self.param_names: return "\n\t\t- " + self.__class__.__name__ + "with " + self.params_to_string() else: @@ -190,12 +191,40 @@ class BaseMonoviewClassifier(BaseEstimator, ):#ClassifierMixin): featureImportance) + "\n" return interpretString + @abstractmethod + def fit(self, X, y): + pass + + @abstractmethod + def predict(self, X): + pass + def get_name_for_fusion(self): return self.__class__.__name__[:4] - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test): return "" + def accepts_multi_class(self, random_state, n_samples=10, dim=2, + n_classes=3): + if 
int(n_samples / n_classes) < 1: + raise ValueError( + "n_samples ({}) / n_classe ({}) must be over 1".format( + n_samples, + n_classes)) + fake_mc_X = random_state.random_integers(low=0, high=100, + size=(n_samples, dim)) + fake_mc_y = [class_index + for _ in range(int(n_samples / n_classes)) + for class_index in range(n_classes)] + fake_mc_y += [0 for _ in range(n_samples % n_classes)] + try: + self.fit(fake_mc_X, fake_mc_y) + self.predict(fake_mc_X) + return True + except ValueError: + return False + def get_names(classed_list): return np.array([object_.__class__.__name__ for object_ in classed_list]) @@ -208,15 +237,13 @@ def percent(x, pos): class MonoviewResult(object): def __init__(self, view_index, classifier_name, view_name, metrics_scores, - full_labels_pred, - classifier_config, y_test_multiclass_pred, test_folds_preds, classifier, n_features): + full_labels_pred, classifier_config, test_folds_preds, classifier, n_features): self.view_index = view_index self.classifier_name = classifier_name self.view_name = view_name self.metrics_scores = metrics_scores self.full_labels_pred = full_labels_pred self.classifier_config = classifier_config - self.y_test_multiclass_pred = y_test_multiclass_pred self.test_folds_preds = test_folds_preds self.clf = classifier self.n_features = n_features @@ -251,6 +278,8 @@ def get_accuracy_graph(plotted_data, classifier_name, file_name, f.savefig(file_name, transparent=True) plt.close() + + # def isUseful(labelSupports, index, CLASS_LABELS, labelDict): # if labelSupports[labelDict[CLASS_LABELS[index]]] != 0: # labelSupports[labelDict[CLASS_LABELS[index]]] -= 1 diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py index 637f12e179319eb402d4d256bf968206b18ffb3e..367e8a8c18886d53e572fc6e32b8eb8f12065119 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py @@ -57,7 +57,6 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier): if isinstance(base_estimator, str): if base_estimator == "DecisionTreeClassifier": base_estimator = DecisionTreeClassifier() - super(Adaboost, self).__init__( random_state=random_state, n_estimators=n_estimators, @@ -137,7 +136,7 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier): [step_pred for step_pred in self.staged_predict(X)]) return pred - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test): interpretString = "" interpretString += self.get_feature_importance(directory) interpretString += "\n\n Estimator error | Estimator weight\n" diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/decision_tree.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/decision_tree.py index 7fe9bdadd3b2ed9e3b65ce49aa6da8e0ba00ffa8..378acf85a4905629cc25dacf9bf7b97b58f4e6be 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/decision_tree.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/decision_tree.py @@ -31,7 +31,7 @@ class DecisionTree(DecisionTreeClassifier, BaseMonoviewClassifier): # """Used to know if the classifier can return label probabilities""" # return True - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test): interpretString = "" interpretString += self.get_feature_importance(directory) return 
interpretString diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/gradient_boosting.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/gradient_boosting.py index 441aaebf49f1040254ae4ca77ee4fecd8c0d0d18..01fe8bdc97e3b4c52325147115674601e638e0b7 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/gradient_boosting.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/gradient_boosting.py @@ -72,23 +72,26 @@ class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier): # """Used to know if the classifier can return label probabilities""" # return False - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test, multi_class=False): interpretString = "" - interpretString += self.get_feature_importance(directory) - step_test_metrics = np.array( - [self.plotted_metric.score(y_test, step_pred) for step_pred in - self.step_predictions]) - get_accuracy_graph(step_test_metrics, "AdaboostClassic", - directory + "test_metrics.png", - self.plotted_metric_name, set="test") - get_accuracy_graph(self.metrics, "AdaboostClassic", - directory + "metrics.png", self.plotted_metric_name) - np.savetxt(directory + "test_metrics.csv", step_test_metrics, - delimiter=',') - np.savetxt(directory + "train_metrics.csv", self.metrics, delimiter=',') - np.savetxt(directory + "times.csv", - np.array([self.train_time, self.pred_time]), delimiter=',') - return interpretString + if multi_class: + return interpretString + else: + interpretString += self.get_feature_importance(directory) + step_test_metrics = np.array( + [self.plotted_metric.score(y_test, step_pred) for step_pred in + self.step_predictions]) + get_accuracy_graph(step_test_metrics, "AdaboostClassic", + directory + "test_metrics.png", + self.plotted_metric_name, set="test") + get_accuracy_graph(self.metrics, "AdaboostClassic", + directory + "metrics.png", self.plotted_metric_name) + np.savetxt(directory + "test_metrics.csv", step_test_metrics, + delimiter=',') + np.savetxt(directory + "train_metrics.csv", self.metrics, delimiter=',') + np.savetxt(directory + "times.csv", + np.array([self.train_time, self.pred_time]), delimiter=',') + return interpretString # def formatCmdArgs(args): diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/knn.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/knn.py index 197b9a7cd1d4d53dcbf9ff47bbfec4defe97600d..8a9ad08f5d72e1ea0b41f27a6ebe8104e94fa1c7 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/knn.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/knn.py @@ -46,7 +46,7 @@ class KNN(KNeighborsClassifier, BaseMonoviewClassifier): # """Used to know if the classifier can return label probabilities""" # return True - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test): interpretString = "" return interpretString diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/lasso.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/lasso.py index ca85cf799394b0c136452e592f13b4ab7e3aea9c..a166d934819fcb10e6fc198d5276d53e26735e46 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/lasso.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/lasso.py @@ -82,7 +82,7 @@ class Lasso(LassoSK, BaseMonoviewClassifier): # """ # return False - def getInterpret(self, 
directory, y_test): + def get_interpret(self, directory, y_test): """ return the interpreted string diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/random_forest.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/random_forest.py index 91ebe9dfd19fca77db12e2ca2b8146d1b0f4b288..3fc9721004f4b75d8d6d7290f03b7c62dce68d0d 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/random_forest.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/random_forest.py @@ -73,7 +73,7 @@ class RandomForest(RandomForestClassifier, BaseMonoviewClassifier): # """ # return True - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test): """ Parameters diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py index 1023f6b33c99ee0d41b6dbc401ed305b689f7ac4..78ff0ead71834086886a760ed528d3ba2184dc0a 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py @@ -65,7 +65,7 @@ class SGD(SGDClassifier, BaseMonoviewClassifier): # # return True - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test): """ Parameters diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_linear.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_linear.py index 13b0d6aedfe23cba01ce53cb39ed490b267f70ae..cd3a157b8ca98c74e9305fa1583e0fbc41faed5f 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_linear.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_linear.py @@ -36,7 +36,7 @@ class SVMLinear(SVCClassifier, BaseMonoviewClassifier): self.param_names = ["C", "random_state"] self.distribs = [CustomUniform(loc=0, state=1), [random_state]] - def getInterpret(self, directory, y_test): + def get_interpret(self, directory, y_test): interpret_string = "" # self.feature_importances_ = (self.coef_/np.sum(self.coef_)).reshape((self.coef_.shape[1],)) return interpret_string diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py index aa305849e6903b42bf63eb9e7b440ec3a20f85c6..80e6ef39522a7ea4cc045f764ba34fff5ab648c0 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py @@ -27,7 +27,7 @@ def printMetricScore(metricScores, metric_list): enumerate(metric[1])) else: metric_kwargs = {} - metric_score_string += "\tFor " + metric_module.getConfig( + metric_score_string += "\tFor " + metric_module.get_config( **metric_kwargs) + " : " metric_score_string += "\n\t\t- Score on train : " + str( metricScores[metric[0]][0]) @@ -161,7 +161,7 @@ def execute(classifier, trainLabels, labels_dictionary.values()) + "\n\t-Views : " + ', '.join( views) + "\n\t-" + str( KFolds.n_splits) + \ - " folds\n\nClassification configuration : \n\t-Algorithm used : " + classifier_name + " with : " + classifier.getConfig() + " folds\n\nClassification configuration : \n\t-Algorithm used : " + classifier_name + " with : " + classifier.get_config() metricsScores = getMetricsScores(metric_list, trainLabels, testLabels, validationIndices, learningIndices, labels) diff 
--git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py index abea7e64e38475d671e89dd4bafba594d3892b1a..2e4f5bc850784cba8c6b3b79d8fcd6ec1cb57e5f 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py @@ -100,12 +100,9 @@ def save_results(classifier, labels_dictionary, string_analysis, views, classifi logging.info(string_analysis) # views_string = "-".join(views) views_string = "mv" - labels_string = "-".join(labels_set) - timestr = time.strftime("%Y_%m_%d-%H_%M_%S") cl_type_string = classifier.short_name output_file_name = os.path.join(directory, cl_type_string, - timestr + "-results-" + cl_type_string + "-" + views_string + '-' + labels_string + \ - '-learnRate_{0:.2f}'.format(learning_rate) + '-' + name) + cl_type_string+"-"+views_string+'-'+name) if not os.path.exists(os.path.dirname(output_file_name)): try: os.makedirs(os.path.dirname(output_file_name)) @@ -252,7 +249,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, k_folds logging.info("Info:\t Extraction duration " + str(extraction_time) + "s") logging.debug("Start:\t Getting train/test split") - learning_indices, validation_indices, test_indices_multiclass = classification_indices + learning_indices, validation_indices = classification_indices logging.debug("Done:\t Getting train/test split") logging.debug("Start:\t Getting classifiers modules") @@ -266,7 +263,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, k_folds logging.debug("Start:\t Optimizing hyperparameters") if hyper_param_search != "None": classifier_config = hyper_parameter_search.search_best_settings( - dataset_var, labels, classifier_module, classifier_name, + dataset_var, dataset_var.get_labels(), classifier_module, classifier_name, metrics[0], learning_indices, k_folds, random_state, directory, nb_cores=nb_cores, views_indices=views_indices, searching_tool=hyper_param_search, n_iter=n_iter, @@ -277,7 +274,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, k_folds logging.debug("Done:\t Optimizing hyperparameters") logging.debug("Start:\t Fitting classifier") - classifier.fit(dataset_var, labels, train_indices=learning_indices, + classifier.fit(dataset_var, dataset_var.get_labels(), train_indices=learning_indices, view_indices=views_indices) logging.debug("Done:\t Fitting classifier") @@ -286,17 +283,11 @@ def exec_multiview(directory, dataset_var, name, classification_indices, k_folds view_indices=views_indices) test_labels = classifier.predict(dataset_var, example_indices=validation_indices, view_indices=views_indices) - full_labels = np.zeros(labels.shape, dtype=int) - 100 + full_labels = np.zeros(dataset_var.get_labels().shape, dtype=int) - 100 for train_index, index in enumerate(learning_indices): full_labels[index] = train_labels[train_index] for test_index, index in enumerate(validation_indices): full_labels[index] = test_labels[test_index] - if test_indices_multiclass != []: - test_labels_multiclass = classifier.predict(dataset_var, - example_indices=test_indices_multiclass, - view_indices=views_indices) - else: - test_labels_multiclass = [] logging.info("Done:\t Pertidcting") classification_time = time.time() - t_start @@ -323,7 +314,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, k_folds logging.debug("Start:\t Saving preds") 
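# --- Illustrative sketch (annotation, not part of the patch) ----------------
# With the multiclass-only test split removed above, full_labels is rebuilt
# from just the train and validation predictions, keeping -100 as the sentinel
# for examples that belong to neither split. The toy values below are made up
# for illustration; the loop in the hunk above does the same thing.
import numpy as np

learning_indices, validation_indices = [0, 2, 4], [1, 5]
train_pred, test_pred = np.array([1, 0, 1]), np.array([0, 1])

full_labels = np.zeros(6, dtype=int) - 100
full_labels[learning_indices] = train_pred
full_labels[validation_indices] = test_pred
# full_labels -> array([   1,    0,    0, -100,    1,    1])
# -----------------------------------------------------------------------------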
return MultiviewResult(cl_type, classifier_config, metrics_scores, - full_labels, test_labels_multiclass) + full_labels) # return CL_type, classificationKWARGS, metricsScores, fullLabels, testLabelsMulticlass diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py index 3b22712392c326b58f0c0f57508c1f24a24ac723..da79c6cc36e20197074498683c6ced4bbdcc606c 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py @@ -8,12 +8,11 @@ from .. import monoview_classifiers class MultiviewResult(object): def __init__(self, classifier_name, classifier_config, - metrics_scores, full_labels, test_labels_multiclass): + metrics_scores, full_labels): self.classifier_name = classifier_name self.classifier_config = classifier_config self.metrics_scores = metrics_scores self.full_labels_pred = full_labels - self.y_test_multiclass_pred = test_labels_multiclass def get_classifier_name(self): try: diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis.py index 6ff0f76b7e528f56a127318b6cd6aa882bc68b4b..3f1b93dbf9a25d9fa753341c48063977ae4149d2 100644 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis.py +++ b/multiview_platform/mono_multi_view_classifiers/result_analysis.py @@ -24,6 +24,7 @@ from .multiview.multiview_utils import MultiviewResult __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype + def save_dict_to_text(dictionnary, output_file): #TODO : smarter way must exist output_file.write("Failed algorithms : \n\t"+ ",\n\t".join(dictionnary.keys())+".\n\n\n") @@ -473,10 +474,7 @@ def sort_by_test_score(train_scores, test_scores, names, train_STDs=None, return sorted_names, sorted_train_scores, sorted_test_scores, sorted_train_STDs, sorted_test_STDs - - - -def publishMetricsGraphs(metrics_scores, directory, database_name, labels_names): +def publish_metrics_graphs(metrics_scores, directory, database_name, labels_names): r"""Used to sort the results (names and both scores) in descending test score order. 
Parameters @@ -520,8 +518,7 @@ def init_plot(results, metric_name, metric_dataframe, nb_results = metric_dataframe.shape[1] - file_name = os.path.join(directory, time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join( + file_name = os.path.join(directory, database_name + "-" + "_vs_".join( labels_names) + "-" + metric_name) results += [[classifiers_name, metric_name, testMean, testSTD] @@ -578,11 +575,10 @@ def gen_error_data(example_errors): return nb_classifiers, nb_examples, classifiers_names, data_2d, error_on_examples -def publishExampleErrors(example_errors, directory, databaseName, labels_names, example_ids, labels): +def publish_example_errors(example_errors, directory, databaseName, labels_names, example_ids, labels): logging.debug("Start:\t Biclass Label analysis figure generation") - base_file_name = os.path.join(directory, time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + "_vs_".join( + base_file_name = os.path.join(directory, databaseName + "-" + "_vs_".join( labels_names) + "-") nb_classifiers, nb_examples, classifiers_names, \ @@ -601,13 +597,13 @@ def publishExampleErrors(example_errors, directory, databaseName, labels_names, logging.debug("Done:\t Biclass Label analysis figures generation") -def publish_feature_importances(feature_importances, directory, database_name, labels_names, feature_stds=None): +def publish_feature_importances(feature_importances, directory, database_name, feature_stds=None): for view_name, feature_importance in feature_importances.items(): if not os.path.exists(os.path.join(directory, "feature_importances")): os.mkdir(os.path.join(directory, "feature_importances")) - file_name = os.path.join(directory, "feature_importances" , time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join( - labels_names) + "-" + view_name + "-feature_importances") + file_name = os.path.join(directory, "feature_importances" , + database_name + "-" + view_name + + "-feature_importances") if feature_stds is not None: feature_std = feature_stds[view_name] feature_std.to_csv(file_name+"_dataframe_stds.csv") @@ -640,7 +636,7 @@ def publish_feature_importances(feature_importances, directory, database_name, l del fig -def get_arguments(benchmark_argument_dictionaries, flag): +def get_arguments(benchmark_argument_dictionaries, iter_index): r"""Used to get the arguments passed to the benchmark executing function corresponding to the flag of a biclass experimentation. @@ -657,7 +653,7 @@ def get_arguments(benchmark_argument_dictionaries, flag): All the arguments passed to the benchmark executing function for the needed experimentation. 
""" for benchmarkArgumentDictionary in benchmark_argument_dictionaries: - if benchmarkArgumentDictionary["flag"] == flag: + if benchmarkArgumentDictionary["flag"] == iter_index: return benchmarkArgumentDictionary @@ -687,21 +683,21 @@ def get_feature_importances(result, feature_names=None): return feature_importances -def publish_tracebacks(directory, database_name, labels_names, tracebacks, flag): +def publish_tracebacks(directory, database_name, labels_names, tracebacks, iter_index): if tracebacks: - with open(os.path.join(directory, time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join( - labels_names) + "-iter"+str(flag[0])+"-"+str(flag[1][0])+"vs"+ - str(flag[1][1])+"-tacebacks.txt"), "w") as traceback_file: + with open(os.path.join(directory, database_name + + "-iter"+str(iter_index) + + "-tacebacks.txt"), + "w") as traceback_file: failed_list = save_dict_to_text(tracebacks, traceback_file) - flagged_list = [_ + "-iter"+str(flag[0])+"-"+str(flag[1][0])+"vs"+ - str(flag[1][1]) for _ in failed_list] + flagged_list = [_ + "-iter"+str(iter_index) for _ in failed_list] else: flagged_list = {} return flagged_list -def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metrics, example_ids): +def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, + metrics, example_ids, labels): r"""Used to extract and format the results of the different biclass experimentations performed. Parameters @@ -728,16 +724,17 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric label combination, regrouping the scores for each metrics and the information useful to plot errors on examples. """ logging.debug("Srart:\t Analzing all biclass resuls") - biclass_results = {} + iter_results = {"metrics_scores": [i for i in range(stats_iter)], + "example_errors": [i for i in range(stats_iter)], + "feature_importances": [i for i in range(stats_iter)]} flagged_tracebacks_list = [] fig_errors = [] - for flag, result, tracebacks in results: - iteridex, [classifierPositive, classifierNegative] = flag + for iter_index, result, tracebacks in results: - arguments = get_arguments(benchmark_argument_dictionaries, flag) + arguments = get_arguments(benchmark_argument_dictionaries, iter_index) metrics_scores = get_metrics_scores_biclass(metrics, result) - example_errors = get_example_errors_biclass(arguments["labels"], result) + example_errors = get_example_errors_biclass(labels, result) feature_importances = get_feature_importances(result) directory = arguments["directory"] @@ -745,318 +742,299 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric labels_names = [arguments["labels_dictionary"][0], arguments["labels_dictionary"][1]] - flagged_tracebacks_list += publish_tracebacks(directory, database_name, labels_names, tracebacks, flag) - results = publishMetricsGraphs(metrics_scores, directory, database_name, - labels_names) - publishExampleErrors(example_errors, directory, database_name, - labels_names, example_ids, arguments["labels"]) - publish_feature_importances(feature_importances, directory, database_name, labels_names) - - - if not str(classifierPositive) + str(classifierNegative) in biclass_results: - biclass_results[str(classifierPositive) + str(classifierNegative)] = {} - biclass_results[str(classifierPositive) + str(classifierNegative)][ - "metrics_scores"] = [i for i in range(stats_iter)] - biclass_results[str(classifierPositive) + str(classifierNegative)][ - "example_errors"] = [i for i in 
range(stats_iter)] - biclass_results[str(classifierPositive) + str(classifierNegative)][ - "feature_importances"] = [i for i in range(stats_iter)] - biclass_results[str(classifierPositive) + str(classifierNegative)]["metrics_scores"][iteridex] = metrics_scores - biclass_results[str(classifierPositive) + str(classifierNegative)]["example_errors"][iteridex] = example_errors - biclass_results[str(classifierPositive) + str(classifierNegative)]["feature_importances"][iteridex] = feature_importances - biclass_results[str(classifierPositive) + str(classifierNegative)]["labels"] = arguments["labels"] - - logging.debug("Done:\t Analzing all biclass resuls") - - return results, biclass_results, flagged_tracebacks_list - - -def gen_metrics_scores_multiclass(results, true_labels, metrics_list, - arguments_dictionaries): - """Used to add all the metrics scores to the multiclass result structure for each clf and each iteration""" - - logging.debug("Start:\t Getting multiclass scores for each metric") - - for metric in metrics_list: - metric_module = getattr(metrics, metric[0]) - for iter_index, iter_results in enumerate(results): - - for argumentsDictionary in arguments_dictionaries: - if argumentsDictionary["flag"][0] == iter_index: - classification_indices = argumentsDictionary[ - "classification_indices"] - train_indices, test_indices, multiclass_test_indices = classification_indices - - for classifier_name, resultDictionary in iter_results.items(): - if not "metrics_scores" in resultDictionary: - results[iter_index][classifier_name]["metrics_scores"] = {} - train_score = metric_module.score(true_labels[train_indices], - resultDictionary["labels"][ - train_indices], - multiclass=True) - test_score = metric_module.score( - true_labels[multiclass_test_indices], - resultDictionary["labels"][multiclass_test_indices], - multiclass=True) - results[iter_index][classifier_name]["metrics_scores"][ - metric[0]] = [train_score, test_score] - logging.debug("Done:\t Getting multiclass scores for each metric") - return results - - -def get_error_on_labels_multiclass(multiclass_results, multiclass_labels): - """Used to add all the arrays showing on which example there is an error for each clf and each iteration""" - - logging.debug("Start:\t Getting errors on each example for each classifier") + flagged_tracebacks_list += publish_tracebacks(directory, database_name, labels_names, tracebacks, iter_index) + res = publish_metrics_graphs(metrics_scores, directory, database_name, + labels_names) + publish_example_errors(example_errors, directory, database_name, + labels_names, example_ids, labels) + publish_feature_importances(feature_importances, directory, + database_name) - for iter_index, iter_results in enumerate(multiclass_results): - for classifier_name, classifier_results in iter_results.items(): - error_on_examples = classifier_results["labels"] == multiclass_labels - multiclass_results[iter_index][classifier_name][ - "error_on_examples"] = error_on_examples.astype(int) - logging.debug("Done:\t Getting errors on each example for each classifier") - return multiclass_results - - -def publishMulticlassScores(multiclass_results, metrics, stats_iter, direcories, - databaseName): - results=[] - for iter_index in range(stats_iter): - directory = direcories[iter_index] - for metric in metrics: - logging.debug( - "Start:\t Multiclass score graph generation for " + metric[0]) - classifiers_names = np.array([classifier_name for classifier_name in - multiclass_results[iter_index].keys()]) - train_scores = 
np.array([multiclass_results[iter_index][ - classifier_name]["metrics_scores"][ - metric[0]][0] - for classifier_name in classifiers_names]) - validationScores = np.array([multiclass_results[iter_index][ - classifier_name]["metrics_scores"][ - metric[0]][1] - for classifier_name in - classifiers_names]) - - nbResults = classifiers_names.shape[0] - fileName = os.path.join(directory , time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + metric[ - 0]) - - plot_metric_scores(train_scores, validationScores, classifiers_names, - nbResults, metric[0], fileName, tag=" multiclass") - - logging.debug( - "Done:\t Multiclass score graph generation for " + metric[0]) - results+=[[classifiersName, metric, testMean, testSTD] for classifiersName, testMean, testSTD in zip(classifiers_names, validationScores, np.zeros(len(validationScores)))] - return results + iter_results["metrics_scores"][iter_index] = metrics_scores + iter_results["example_errors"][iter_index] = example_errors + iter_results["feature_importances"][iter_index] = feature_importances + iter_results["labels"] = labels + logging.debug("Done:\t Analzing all biclass resuls") -def publishMulticlassExmapleErrors(multiclass_results, directories, - databaseName, example_ids, multiclass_labels): - for iter_index, multiclass_result in enumerate(multiclass_results): - directory = directories[iter_index] - logging.debug("Start:\t Multiclass Label analysis figure generation") - - base_file_name = os.path.join(directory, time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-") - nb_classifiers, nb_examples, classifiers_names, data, error_on_examples = gen_error_data( - dict((key, multiclass_result[key]['error_on_examples']) - for key in multiclass_result.keys()),) - plot_2d(data, classifiers_names, nb_classifiers, nb_examples, - base_file_name, example_ids=example_ids, labels=multiclass_labels) - - plot_errors_bar(error_on_examples, nb_classifiers, nb_examples, - base_file_name) - - logging.debug("Done:\t Multiclass Label analysis figure generation") - - -def analyzeMulticlass(results, stats_iter, benchmark_argument_dictionaries, - nb_examples, nb_labels, multiclass_labels, - metrics, classification_indices, directories, example_ids): - """Used to transform one versus one results in multiclass results and to publish it""" - multiclass_results = [{} for _ in range(stats_iter)] - - for flag, result, tracebacks in results: - iter_index = flag[0] - classifierPositive = flag[1][0] - classifierNegative = flag[1][1] - - for benchmarkArgumentDictionary in benchmark_argument_dictionaries: - if benchmarkArgumentDictionary["flag"] == flag: - trainIndices, testIndices, testMulticlassIndices = \ - benchmarkArgumentDictionary["classification_indices"] - - for classifierResult in result: - classifier_name = classifierResult.get_classifier_name() - if classifier_name not in multiclass_results[iter_index]: - multiclass_results[iter_index][classifier_name] = np.zeros( - (nb_examples, nb_labels), dtype=int) - for exampleIndex in trainIndices: - label = classifierResult.full_labels_pred[exampleIndex] - if label == 1: - multiclass_results[iter_index][classifier_name][ - exampleIndex, classifierPositive] += 1 - else: - multiclass_results[iter_index][classifier_name][ - exampleIndex, classifierNegative] += 1 - for multiclassIndex, exampleIndex in enumerate( - testMulticlassIndices): - label = classifierResult.y_test_multiclass_pred[multiclassIndex] - if label == 1: - multiclass_results[iter_index][classifier_name][ - exampleIndex, classifierPositive] += 
1 - else: - multiclass_results[iter_index][classifier_name][ - exampleIndex, classifierNegative] += 1 - - for iter_index, multiclassiterResult in enumerate(multiclass_results): - for key, value in multiclassiterResult.items(): - multiclass_results[iter_index][key] = { - "labels": np.argmax(value, axis=1)} - - multiclass_results = gen_metrics_scores_multiclass(multiclass_results, - multiclass_labels, metrics, - benchmark_argument_dictionaries) - multiclass_results = get_error_on_labels_multiclass(multiclass_results, - multiclass_labels) - - results = publishMulticlassScores(multiclass_results, metrics, stats_iter, directories, - benchmark_argument_dictionaries[0]["args"]["Base"]["name"]) - publishMulticlassExmapleErrors(multiclass_results, directories, - benchmark_argument_dictionaries[0][ - "args"]["Base"]["name"], example_ids, multiclass_labels) - - return results, multiclass_results + return res, iter_results, flagged_tracebacks_list + + +# def gen_metrics_scores_multiclass(results, true_labels, metrics_list, +# arguments_dictionaries): +# """Used to add all the metrics scores to the multiclass result structure for each clf and each iteration""" +# +# logging.debug("Start:\t Getting multiclass scores for each metric") +# +# for metric in metrics_list: +# metric_module = getattr(metrics, metric[0]) +# for iter_index, iter_results in enumerate(results): +# +# for argumentsDictionary in arguments_dictionaries: +# if argumentsDictionary["flag"][0] == iter_index: +# classification_indices = argumentsDictionary[ +# "classification_indices"] +# train_indices, test_indices, multiclass_test_indices = classification_indices +# +# for classifier_name, resultDictionary in iter_results.items(): +# if not "metrics_scores" in resultDictionary: +# results[iter_index][classifier_name]["metrics_scores"] = {} +# train_score = metric_module.score(true_labels[train_indices], +# resultDictionary["labels"][ +# train_indices], +# multiclass=True) +# test_score = metric_module.score( +# true_labels[multiclass_test_indices], +# resultDictionary["labels"][multiclass_test_indices], +# multiclass=True) +# results[iter_index][classifier_name]["metrics_scores"][ +# metric[0]] = [train_score, test_score] +# logging.debug("Done:\t Getting multiclass scores for each metric") +# return results + + +# def get_error_on_labels_multiclass(multiclass_results, multiclass_labels): +# """Used to add all the arrays showing on which example there is an error for each clf and each iteration""" +# +# logging.debug("Start:\t Getting errors on each example for each classifier") +# +# for iter_index, iter_results in enumerate(multiclass_results): +# for classifier_name, classifier_results in iter_results.items(): +# error_on_examples = classifier_results["labels"] == multiclass_labels +# multiclass_results[iter_index][classifier_name][ +# "error_on_examples"] = error_on_examples.astype(int) +# +# logging.debug("Done:\t Getting errors on each example for each classifier") +# +# return multiclass_results + + +# def publishMulticlassScores(multiclass_results, metrics, stats_iter, direcories, +# databaseName): +# results=[] +# for iter_index in range(stats_iter): +# directory = direcories[iter_index] +# for metric in metrics: +# logging.debug( +# "Start:\t Multiclass score graph generation for " + metric[0]) +# classifiers_names = np.array([classifier_name for classifier_name in +# multiclass_results[iter_index].keys()]) +# train_scores = np.array([multiclass_results[iter_index][ +# classifier_name]["metrics_scores"][ +# metric[0]][0] +# 
for classifier_name in classifiers_names]) +# validationScores = np.array([multiclass_results[iter_index][ +# classifier_name]["metrics_scores"][ +# metric[0]][1] +# for classifier_name in +# classifiers_names]) +# +# nbResults = classifiers_names.shape[0] +# fileName = os.path.join(directory , time.strftime( +# "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + metric[ +# 0]) +# +# plot_metric_scores(train_scores, validationScores, classifiers_names, +# nbResults, metric[0], fileName, tag=" multiclass") +# +# logging.debug( +# "Done:\t Multiclass score graph generation for " + metric[0]) +# results+=[[classifiersName, metric, testMean, testSTD] for classifiersName, testMean, testSTD in zip(classifiers_names, validationScores, np.zeros(len(validationScores)))] +# return results + + +# def publishMulticlassExmapleErrors(multiclass_results, directories, +# databaseName, example_ids, multiclass_labels): +# for iter_index, multiclass_result in enumerate(multiclass_results): +# directory = directories[iter_index] +# logging.debug("Start:\t Multiclass Label analysis figure generation") +# +# base_file_name = os.path.join(directory, time.strftime( +# "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-") +# nb_classifiers, nb_examples, classifiers_names, data, error_on_examples = gen_error_data( +# dict((key, multiclass_result[key]['error_on_examples']) +# for key in multiclass_result.keys()),) +# plot_2d(data, classifiers_names, nb_classifiers, nb_examples, +# base_file_name, example_ids=example_ids, labels=multiclass_labels) +# +# plot_errors_bar(error_on_examples, nb_classifiers, nb_examples, +# base_file_name) +# +# logging.debug("Done:\t Multiclass Label analysis figure generation") + +# +# def analyzeMulticlass(results, stats_iter, benchmark_argument_dictionaries, +# nb_examples, nb_labels, multiclass_labels, +# metrics, classification_indices, directories, example_ids): +# """Used to transform one versus one results in multiclass results and to publish it""" +# multiclass_results = [{} for _ in range(stats_iter)] +# +# for flag, result, tracebacks in results: +# iter_index = flag[0] +# classifierPositive = flag[1][0] +# classifierNegative = flag[1][1] +# +# for benchmarkArgumentDictionary in benchmark_argument_dictionaries: +# if benchmarkArgumentDictionary["flag"] == flag: +# trainIndices, testIndices, testMulticlassIndices = \ +# benchmarkArgumentDictionary["classification_indices"] +# +# for classifierResult in result: +# classifier_name = classifierResult.get_classifier_name() +# if classifier_name not in multiclass_results[iter_index]: +# multiclass_results[iter_index][classifier_name] = np.zeros( +# (nb_examples, nb_labels), dtype=int) +# for exampleIndex in trainIndices: +# label = classifierResult.full_labels_pred[exampleIndex] +# if label == 1: +# multiclass_results[iter_index][classifier_name][ +# exampleIndex, classifierPositive] += 1 +# else: +# multiclass_results[iter_index][classifier_name][ +# exampleIndex, classifierNegative] += 1 +# for multiclassIndex, exampleIndex in enumerate( +# testMulticlassIndices): +# label = classifierResult.y_test_multiclass_pred[multiclassIndex] +# if label == 1: +# multiclass_results[iter_index][classifier_name][ +# exampleIndex, classifierPositive] += 1 +# else: +# multiclass_results[iter_index][classifier_name][ +# exampleIndex, classifierNegative] += 1 +# +# for iter_index, multiclassiterResult in enumerate(multiclass_results): +# for key, value in multiclassiterResult.items(): +# multiclass_results[iter_index][key] = { +# "labels": 
np.argmax(value, axis=1)} +# +# multiclass_results = gen_metrics_scores_multiclass(multiclass_results, +# multiclass_labels, metrics, +# benchmark_argument_dictionaries) +# multiclass_results = get_error_on_labels_multiclass(multiclass_results, +# multiclass_labels) +# +# results = publishMulticlassScores(multiclass_results, metrics, stats_iter, directories, +# benchmark_argument_dictionaries[0]["args"]["Base"]["name"]) +# publishMulticlassExmapleErrors(multiclass_results, directories, +# benchmark_argument_dictionaries[0][ +# "args"]["Base"]["name"], example_ids, multiclass_labels) +# +# return results, multiclass_results def numpy_mean_and_std(scores_array): return np.mean(scores_array, axis=1), np.std(scores_array, axis=1) -def publish_iter_biclass_metrics_scores(iter_results, directory, labels_dictionary, - data_base_name, stats_iter, - min_size=10): +def publish_all_metrics_scores(iter_results, directory, + data_base_name, stats_iter, + min_size=10): results=[] - for labels_combination, iter_result in iter_results.items(): - current_directory = os.path.join(directory, labels_dictionary[ - int(labels_combination[0])] + "-vs-" + labels_dictionary[ - int(labels_combination[1])]) - if not os.path.exists(os.path.dirname(os.path.join(current_directory, "a"))): - try: - os.makedirs(os.path.dirname(os.path.join(current_directory, "a"))) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise - - for metric_name, scores in iter_result.items(): - train = np.array(scores["mean"].loc["train"]) - test = np.array(scores["mean"].loc["test"]) - names = np.array(scores["mean"].columns) - train_std = np.array(scores["std"].loc["train"]) - test_std = np.array(scores["std"].loc["test"]) - # trainMeans, trainSTDs = numpy_mean_and_std(scores["train_scores"]) - # testMeans, testSTDs = numpy_mean_and_std(scores["test_scores"]) - - # names = np.array([name for name in classifiers_dict.keys()]) - fileName = os.path.join(current_directory, time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + data_base_name + "-Mean_on_" + str( - stats_iter) + "_iter-" + metric_name) - nbResults = names.shape[0] - - plot_metric_scores(train, test, names, nbResults, - metric_name, fileName, tag=" averaged", - train_STDs=train_std, test_STDs=test_std) - results+=[[classifier_name, metric_name, test_mean, test_std] for classifier_name, test_mean, test_std in zip(names, test, test_std)] + if not os.path.exists(os.path.dirname(os.path.join(directory, "a"))): + try: + os.makedirs(os.path.dirname(os.path.join(directory, "a"))) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise + + for metric_name, scores in iter_results.items(): + train = np.array(scores["mean"].loc["train"]) + test = np.array(scores["mean"].loc["test"]) + names = np.array(scores["mean"].columns) + train_std = np.array(scores["std"].loc["train"]) + test_std = np.array(scores["std"].loc["test"]) + + file_name = os.path.join(directory, data_base_name + "-Mean_on_" + str( + stats_iter) + "_iter-" + metric_name) + nbResults = names.shape[0] + + plot_metric_scores(train, test, names, nbResults, + metric_name, file_name, tag=" averaged", + train_STDs=train_std, test_STDs=test_std) + results+=[[classifier_name, metric_name, test_mean, test_std] + for classifier_name, test_mean, test_std + in zip(names, test, test_std)] return results -def gen_error_data_glob(combi_results, stats_iter): - nb_examples = next(iter(combi_results.values())).shape[0] - nb_classifiers = len(combi_results) +def gen_error_data_glob(iter_results, stats_iter): + nb_examples = 
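The refactored publish_all_metrics_scores above consumes, for each metric, a pair of DataFrames ("mean" and "std") indexed by "train"/"test" with one column per classifier. A minimal sketch of that structure and of the arrays the function extracts from it; the classifier and metric names here are invented for illustration, not taken from the platform:

    import numpy as np
    import pandas as pd

    index = ["train", "test"]
    columns = ["decision_tree-ViewNumber0", "weighted_linear_early_fusion"]
    iter_results = {
        "accuracy_score": {
            "mean": pd.DataFrame([[0.95, 0.97], [0.81, 0.84]],
                                 index=index, columns=columns),
            "std": pd.DataFrame([[0.01, 0.02], [0.03, 0.02]],
                                index=index, columns=columns),
        }
    }

    for metric_name, scores in iter_results.items():
        train = np.array(scores["mean"].loc["train"])   # mean train scores per classifier
        test = np.array(scores["mean"].loc["test"])     # mean test scores per classifier
        names = np.array(scores["mean"].columns)        # classifier names
        test_std = np.array(scores["std"].loc["test"])  # error bars for the averaged plot
        print(metric_name, dict(zip(names, zip(test, test_std))))
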
next(iter(iter_results.values())).shape[0] + nb_classifiers = len(iter_results) data = np.zeros((nb_examples, nb_classifiers), dtype=int) classifier_names = [] - for clf_index, (classifier_name, error_data) in enumerate(combi_results.items()): + for clf_index, (classifier_name, error_data) in enumerate(iter_results.items()): data[:, clf_index] = error_data classifier_names.append(classifier_name) error_on_examples = -1 * np.sum(data, axis=1) + (nb_classifiers * stats_iter) return nb_examples, nb_classifiers, data, error_on_examples, classifier_names -def publish_iter_biclass_example_errors(iter_results, directory, - labels_dictionary, stats_iter, - example_ids, labels): - for labels_combination, combi_results in iter_results.items(): - base_file_name = os.path.join(directory, labels_dictionary[ - int(labels_combination[0])] + "-vs-" + - labels_dictionary[ - int(labels_combination[1])], time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-") - - logging.debug( - "Start:\t Global biclass label analysis figure generation") - - nbExamples, nbClassifiers, data, \ - error_on_examples, classifier_names = gen_error_data_glob(combi_results, - stats_iter) - - np.savetxt(base_file_name + "clf_errors.csv", data, delimiter=",") - np.savetxt(base_file_name + "example_errors.csv", error_on_examples, - delimiter=",") - - plot_2d(data, classifier_names, nbClassifiers, nbExamples, - base_file_name, stats_iter=stats_iter, example_ids=example_ids, labels=labels[labels_combination]) - plot_errors_bar(error_on_examples, nbClassifiers * stats_iter, - nbExamples, base_file_name) +def publish_all_example_errors(iter_results, directory, + stats_iter, + example_ids, labels): - logging.debug( - "Done:\t Global biclass label analysis figures generation") - - -def publish_iter_multiclass_metrics_scores(iter_multiclass_results, classifiers_names, - data_base_name, directory, stats_iter, - min_size=10): - results = [] - for metric_name, scores in iter_multiclass_results["metrics_scores"].items(): - trainMeans, trainSTDs = numpy_mean_and_std(scores["train_scores"]) - testMeans, testSTDs = numpy_mean_and_std(scores["test_scores"]) - - nb_results = classifiers_names.shape[0] - - file_name = os.path.join(directory, time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + data_base_name + "-Mean_on_" + str( - stats_iter) + "_iter-" + metric_name + ".png") + logging.debug( + "Start:\t Global biclass label analysis figure generation") - plot_metric_scores(trainMeans, testMeans, classifiers_names, nb_results, - metric_name, file_name, tag=" averaged multiclass", - train_STDs=trainSTDs, test_STDs=testSTDs) + nbExamples, nbClassifiers, data, \ + error_on_examples, classifier_names = gen_error_data_glob(iter_results, + stats_iter) - results+=[[classifiers_name, metric_name,testMean, testSTD] for classifiers_name, testMean, testSTD in zip(classifiers_names, testMeans, testSTDs)] - return results + np.savetxt(directory + "clf_errors.csv", data, delimiter=",") + np.savetxt(directory + "example_errors.csv", error_on_examples, + delimiter=",") + plot_2d(data, classifier_names, nbClassifiers, nbExamples, + directory, stats_iter=stats_iter, example_ids=example_ids, labels=labels) + plot_errors_bar(error_on_examples, nbClassifiers * stats_iter, + nbExamples, directory) -def publish_iter_multiclass_example_errors(iter_multiclass_results, directory, - classifiers_names, stats_iter, example_ids, multiclass_labels, min_size=10): logging.debug( - "Start:\t Global multiclass label analysis figures generation") - base_file_name = os.path.join(directory, 
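gen_error_data_glob, completed just above, stacks one per-example column per classifier (each value being the number of iterations in which that example was predicted correctly) and derives error_on_examples as nb_classifiers * stats_iter minus the total number of correct predictions, so larger values flag examples that are misclassified most often. A standalone restatement of that arithmetic on toy data (the arrays are made up):

    import numpy as np

    stats_iter = 2
    # Assumed format: per-classifier counts of correct predictions over the iterations.
    iter_results = {
        "gradient_boosting-ViewNumber0": np.array([2, 2, 1, 0, 2]),
        "weighted_linear_early_fusion":  np.array([2, 1, 1, 1, 2]),
    }

    nb_examples = next(iter(iter_results.values())).shape[0]
    nb_classifiers = len(iter_results)
    data = np.zeros((nb_examples, nb_classifiers), dtype=int)
    classifier_names = []
    for clf_index, (classifier_name, error_data) in enumerate(iter_results.items()):
        data[:, clf_index] = error_data
        classifier_names.append(classifier_name)

    # Misclassification count per example, all classifiers and iterations combined.
    error_on_examples = -1 * np.sum(data, axis=1) + nb_classifiers * stats_iter
    print(error_on_examples)  # [0 1 2 3 0]
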
time.strftime("%Y_%m_%d-%H_%M_%S") + "-") - nb_examples, nb_classifiers, data, error_on_examples, classifiers_names = gen_error_data_glob( - dict((clf_name, combi_res) - for clf_name, combi_res - in zip(classifiers_names, - iter_multiclass_results["error_on_examples"])), - stats_iter) - - plot_2d(data, classifiers_names, nb_classifiers, nb_examples, - base_file_name, stats_iter=stats_iter, example_ids=example_ids, labels=multiclass_labels) - - plot_errors_bar(error_on_examples, nb_classifiers * stats_iter, nb_examples, - base_file_name) - - logging.debug("Done:\t Global multiclass label analysis figures generation") + "Done:\t Global biclass label analysis figures generation") + + +# def publish_iter_multiclass_metrics_scores(iter_multiclass_results, classifiers_names, +# data_base_name, directory, stats_iter, +# min_size=10): +# results = [] +# for metric_name, scores in iter_multiclass_results["metrics_scores"].items(): +# trainMeans, trainSTDs = numpy_mean_and_std(scores["train_scores"]) +# testMeans, testSTDs = numpy_mean_and_std(scores["test_scores"]) +# +# nb_results = classifiers_names.shape[0] +# +# file_name = os.path.join(directory, data_base_name + "-Mean_on_" + str( +# stats_iter) + "_iter-" + metric_name + ".png") +# +# plot_metric_scores(trainMeans, testMeans, classifiers_names, nb_results, +# metric_name, file_name, tag=" averaged multiclass", +# train_STDs=trainSTDs, test_STDs=testSTDs) +# +# results+=[[classifiers_name, metric_name,testMean, testSTD] for classifiers_name, testMean, testSTD in zip(classifiers_names, testMeans, testSTDs)] +# return results + + +# def publish_iter_multiclass_example_errors(iter_multiclass_results, directory, +# classifiers_names, stats_iter, example_ids, multiclass_labels, min_size=10): +# logging.debug( +# "Start:\t Global multiclass label analysis figures generation") +# nb_examples, nb_classifiers, data, error_on_examples, classifiers_names = gen_error_data_glob( +# dict((clf_name, combi_res) +# for clf_name, combi_res +# in zip(classifiers_names, +# iter_multiclass_results["error_on_examples"])), +# stats_iter) +# +# plot_2d(data, classifiers_names, nb_classifiers, nb_examples, +# directory, stats_iter=stats_iter, +# example_ids=example_ids, labels=multiclass_labels) +# +# plot_errors_bar(error_on_examples, nb_classifiers * stats_iter, nb_examples, +# directory) +# +# logging.debug("Done:\t Global multiclass label analysis figures generation") def gen_classifiers_dict(results, metrics): @@ -1112,116 +1090,111 @@ def format_previous_results(biclass_results): arrays for each classifier """ - metrics_analysis = dict((key, {}) for key in biclass_results.keys()) - error_analysis = dict((key, {}) for key in biclass_results.keys()) - feature_importances_analysis = dict((key, {}) for key in biclass_results.keys()) - feature_importances_stds = dict((key, {}) for key in biclass_results.keys()) - labels = dict((key,"") for key in biclass_results.keys()) - for label_combination, biclass_result in biclass_results.items(): - - metric_concat_dict = {} - for iter_index, metrics_score in enumerate( - biclass_result["metrics_scores"]): - for metric_name, dataframe in metrics_score.items(): - if metric_name not in metric_concat_dict: - metric_concat_dict[metric_name] = dataframe - else: - metric_concat_dict[metric_name] = pd.concat( - [metric_concat_dict[metric_name], dataframe]) - - for metric_name, dataframe in metric_concat_dict.items(): - metrics_analysis[label_combination][metric_name] = {} - metrics_analysis[label_combination][metric_name][ - "mean"] 
= dataframe.groupby(dataframe.index).mean() - metrics_analysis[label_combination][metric_name][ - "std"] = dataframe.groupby(dataframe.index).std(ddof=0) - - importance_concat_dict = {} - for iter_index, view_feature_importances in enumerate(biclass_result["feature_importances"]): - for view_name, feature_importances in view_feature_importances.items(): - if view_name not in importance_concat_dict: - importance_concat_dict[view_name] = feature_importances - else: - importance_concat_dict[view_name] = pd.concat( - [importance_concat_dict[view_name], feature_importances]) - - for view_name, dataframe in importance_concat_dict.items(): - feature_importances_analysis[label_combination][view_name] = dataframe.groupby(dataframe.index).mean() - - feature_importances_stds[label_combination][view_name] = dataframe.groupby(dataframe.index).std(ddof=0) - - labels[label_combination] = biclass_result["labels"] - - added_example_errors = {} - for example_errors in biclass_result["example_errors"]: - for classifier_name, errors in example_errors.items(): - if classifier_name not in added_example_errors: - added_example_errors[classifier_name] = errors - else: - added_example_errors[classifier_name] += errors - error_analysis[label_combination] = added_example_errors - return metrics_analysis, error_analysis, feature_importances_analysis, feature_importances_stds, labels - - -def analyzebiclass_iter(biclass_results, stats_iter, directory, - labels_dictionary, data_base_name, example_ids): + metrics_analysis = {} + feature_importances_analysis = {} + feature_importances_stds = {} + # labels = dict((key,"") for key in biclass_results.keys()) + # for biclass_result in biclass_results.items(): + + metric_concat_dict = {} + for iter_index, metrics_score in enumerate( + biclass_results["metrics_scores"]): + for metric_name, dataframe in metrics_score.items(): + if metric_name not in metric_concat_dict: + metric_concat_dict[metric_name] = dataframe + else: + metric_concat_dict[metric_name] = pd.concat( + [metric_concat_dict[metric_name], dataframe]) + + for metric_name, dataframe in metric_concat_dict.items(): + metrics_analysis[metric_name] = {} + metrics_analysis[metric_name][ + "mean"] = dataframe.groupby(dataframe.index).mean() + metrics_analysis[metric_name][ + "std"] = dataframe.groupby(dataframe.index).std(ddof=0) + + importance_concat_dict = {} + for iter_index, view_feature_importances in enumerate(biclass_results["feature_importances"]): + for view_name, feature_importances in view_feature_importances.items(): + if view_name not in importance_concat_dict: + importance_concat_dict[view_name] = feature_importances + else: + importance_concat_dict[view_name] = pd.concat( + [importance_concat_dict[view_name], feature_importances]) + + for view_name, dataframe in importance_concat_dict.items(): + feature_importances_analysis[view_name] = dataframe.groupby(dataframe.index).mean() + + feature_importances_stds[view_name] = dataframe.groupby(dataframe.index).std(ddof=0) + + + added_example_errors = {} + for example_errors in biclass_results["example_errors"]: + for classifier_name, errors in example_errors.items(): + if classifier_name not in added_example_errors: + added_example_errors[classifier_name] = errors + else: + added_example_errors[classifier_name] += errors + error_analysis = added_example_errors + return metrics_analysis, error_analysis, feature_importances_analysis, feature_importances_stds, biclass_results["labels"] + + +def analyze_all(biclass_results, stats_iter, directory, data_base_name, + 
example_ids): """Used to format the results in order to plot the mean results on the iterations""" - metrics_analysis, error_analysis, feature_improtances, feature_improtances_stds, labels = format_previous_results(biclass_results) - - results = publish_iter_biclass_metrics_scores(metrics_analysis, - directory, labels_dictionary, - data_base_name, stats_iter) - publish_iter_biclass_example_errors(error_analysis, directory, - labels_dictionary, - stats_iter, example_ids, labels) - for label_combination, feature_improtances_view in feature_improtances.items(): - labels = [labels_dictionary[ - int(label_combination[0])], labels_dictionary[ - int(label_combination[1])]] - publish_feature_importances(feature_improtances_view, os.path.join(directory,"-vs-".join(labels)+"/"), - data_base_name, labels, feature_improtances_stds[label_combination]) + metrics_analysis, error_analysis, \ + feature_importances, feature_importances_stds, \ + labels = format_previous_results(biclass_results) + + results = publish_all_metrics_scores(metrics_analysis, + directory, + data_base_name, stats_iter) + publish_all_example_errors(error_analysis, directory,stats_iter, + example_ids, labels) + publish_feature_importances(feature_importances, directory, + data_base_name, feature_importances_stds) return results -def analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics, - data_base_name, nb_examples, example_ids, multiclass_labels): - """Used to mean the multiclass results on the iterations executed with different random states""" - - logging.debug("Start:\t Getting mean results for multiclass classification") - iter_multiclass_results = {} - nb_classifiers = len(multiclass_results[0]) - iter_multiclass_results["error_on_examples"] = np.zeros( - (nb_classifiers, nb_examples), dtype=int) - iter_multiclass_results["metrics_scores"] = {} - classifiers_names = [] - for iter_index, multiclass_result in enumerate(multiclass_results): - for classifier_name, classifier_results in multiclass_result.items(): - if classifier_name not in classifiers_names: - classifiers_names.append(classifier_name) - classifier_index = classifiers_names.index(classifier_name) - for metric in metrics: - if metric[0] not in iter_multiclass_results["metrics_scores"]: - iter_multiclass_results["metrics_scores"][metric[0]] = { - "train_scores": - np.zeros((nb_classifiers, stats_iter)), - "test_scores": - np.zeros((nb_classifiers, stats_iter))} - iter_multiclass_results["metrics_scores"][metric[0]][ - "train_scores"][classifier_index, iter_index] = \ - classifier_results["metrics_scores"][metric[0]][0] - iter_multiclass_results["metrics_scores"][metric[0]]["test_scores"][ - classifier_index, iter_index] = \ - classifier_results["metrics_scores"][metric[0]][1] - iter_multiclass_results["error_on_examples"][classifier_index, :] += \ - classifier_results["error_on_examples"] - logging.debug("Start:\t Getting mean results for multiclass classification") - - classifiers_names = np.array(classifiers_names) - results = publish_iter_multiclass_metrics_scores( - iter_multiclass_results, classifiers_names, - data_base_name, directory, stats_iter) - publish_iter_multiclass_example_errors(iter_multiclass_results, directory, - classifiers_names, stats_iter, example_ids, multiclass_labels) - return results +# def analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics, +# data_base_name, nb_examples, example_ids, multiclass_labels): +# """Used to mean the multiclass results on the iterations executed with different random 
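format_previous_results merges the per-iteration metric DataFrames by concatenating them and grouping on the shared train/test index, which yields the mean and the ddof=0 standard deviation that analyze_all then hands to the publishing helpers. A small sketch of that aggregation step in isolation (the data is invented):

    import numpy as np
    import pandas as pd

    rng = np.random.RandomState(42)
    columns = ["gradient_boosting-ViewNumber0"]
    # One metrics DataFrame per statistical iteration, as stored in iter_results["metrics_scores"].
    per_iter_scores = [
        pd.DataFrame(rng.uniform(0, 1, (2, 1)), index=["train", "test"], columns=columns)
        for _ in range(2)
    ]

    concatenated = pd.concat(per_iter_scores)
    mean_df = concatenated.groupby(concatenated.index).mean()      # mean over iterations
    std_df = concatenated.groupby(concatenated.index).std(ddof=0)  # population std over iterations
    print(mean_df.loc["test"])
    print(std_df.loc["test"])
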
states""" +# +# logging.debug("Start:\t Getting mean results for multiclass classification") +# iter_multiclass_results = {} +# nb_classifiers = len(multiclass_results[0]) +# iter_multiclass_results["error_on_examples"] = np.zeros( +# (nb_classifiers, nb_examples), dtype=int) +# iter_multiclass_results["metrics_scores"] = {} +# classifiers_names = [] +# for iter_index, multiclass_result in enumerate(multiclass_results): +# for classifier_name, classifier_results in multiclass_result.items(): +# if classifier_name not in classifiers_names: +# classifiers_names.append(classifier_name) +# classifier_index = classifiers_names.index(classifier_name) +# for metric in metrics: +# if metric[0] not in iter_multiclass_results["metrics_scores"]: +# iter_multiclass_results["metrics_scores"][metric[0]] = { +# "train_scores": +# np.zeros((nb_classifiers, stats_iter)), +# "test_scores": +# np.zeros((nb_classifiers, stats_iter))} +# iter_multiclass_results["metrics_scores"][metric[0]][ +# "train_scores"][classifier_index, iter_index] = \ +# classifier_results["metrics_scores"][metric[0]][0] +# iter_multiclass_results["metrics_scores"][metric[0]]["test_scores"][ +# classifier_index, iter_index] = \ +# classifier_results["metrics_scores"][metric[0]][1] +# iter_multiclass_results["error_on_examples"][classifier_index, :] += \ +# classifier_results["error_on_examples"] +# logging.debug("Start:\t Getting mean results for multiclass classification") +# +# classifiers_names = np.array(classifiers_names) +# results = publish_iter_multiclass_metrics_scores( +# iter_multiclass_results, classifiers_names, +# data_base_name, directory, stats_iter) +# publish_iter_multiclass_example_errors(iter_multiclass_results, directory, +# classifiers_names, stats_iter, example_ids, multiclass_labels) +# return results def save_failed(failed_list, directory): @@ -1230,32 +1203,20 @@ def save_failed(failed_list, directory): failed_file.write(", \n".join(failed_list)+".") -def get_results(results, stats_iter, nb_multiclass, benchmark_argument_dictionaries, - multiclass_labels, metrics, - classification_indices, directories, directory, labels_dictionary, - nb_examples, nb_labels, example_ids): +def get_results(results, stats_iter, benchmark_argument_dictionaries, + metrics, directory, example_ids, labels): """Used to analyze the results of the previous benchmarks""" data_base_name = benchmark_argument_dictionaries[0]["args"]["Base"]["name"] - results_means_std, biclass_results, flagged_failed = analyze_biclass(results, benchmark_argument_dictionaries, - stats_iter, metrics, example_ids) + results_means_std, biclass_results, flagged_failed = analyze_iterations(results, benchmark_argument_dictionaries, + stats_iter, metrics, example_ids, labels) if flagged_failed: save_failed(flagged_failed, directory) - if nb_multiclass > 1: - results_means_std, multiclass_results = analyzeMulticlass(results, stats_iter, - benchmark_argument_dictionaries, - nb_examples, nb_labels, - multiclass_labels, metrics, - classification_indices, - directories, example_ids) if stats_iter > 1: - results_means_std = analyzebiclass_iter( + results_means_std = analyze_all( biclass_results, stats_iter, directory, - labels_dictionary, data_base_name, example_ids) - if nb_multiclass > 1: - results_means_std = analyze_iter_multiclass(multiclass_results, directory, stats_iter, - metrics, data_base_name, nb_examples, example_ids, multiclass_labels) + data_base_name, example_ids) return results_means_std diff --git 
a/multiview_platform/mono_multi_view_classifiers/utils/base.py b/multiview_platform/mono_multi_view_classifiers/utils/base.py new file mode 100644 index 0000000000000000000000000000000000000000..74e8e593f712b402a4589a0a246dc95b507f2095 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/utils/base.py @@ -0,0 +1,116 @@ +import numpy as np +import pickle +from sklearn.base import BaseEstimator +from matplotlib.ticker import FuncFormatter +import matplotlib.pyplot as plt + + +class BaseClassifier(BaseEstimator, ): + + def genBestParams(self, detector): + return dict( + (param_name, detector.best_params_[param_name]) for param_name in + self.param_names) + + def genParamsFromDetector(self, detector): + if self.classed_params: + classed_dict = dict((classed_param, get_names( + detector.cv_results_["param_" + classed_param])) + for classed_param in self.classed_params) + if self.param_names: + return [(param_name, + np.array(detector.cv_results_["param_" + param_name])) + if param_name not in self.classed_params else ( + param_name, classed_dict[param_name]) + for param_name in self.param_names] + else: + return [()] + + def gen_distribs(self): + return dict((param_name, distrib) for param_name, distrib in + zip(self.param_names, self.distribs)) + + def params_to_string(self): + return ", ".join( + [param_name + " : " + self.to_str(param_name) for param_name in + self.param_names]) + + def getConfig(self): + if self.param_names: + return "\n\t\t- " + self.__class__.__name__ + "with " + self.params_to_string() + else: + return "\n\t\t- " + self.__class__.__name__ + "with no config." + + def to_str(self, param_name): + if param_name in self.weird_strings: + if self.weird_strings[param_name] == "class_name": + return self.get_params()[param_name].__class__.__name__ + else: + return self.weird_strings[param_name]( + self.get_params()[param_name]) + else: + return str(self.get_params()[param_name]) + + def get_feature_importance(self, directory, nb_considered_feats=50): + """Used to generate a graph and a pickle dictionary representing feature importances""" + featureImportances = self.feature_importances_ + sortedArgs = np.argsort(-featureImportances) + featureImportancesSorted = featureImportances[sortedArgs][ + :nb_considered_feats] + featureIndicesSorted = sortedArgs[:nb_considered_feats] + fig, ax = plt.subplots() + x = np.arange(len(featureIndicesSorted)) + formatter = FuncFormatter(percent) + ax.yaxis.set_major_formatter(formatter) + plt.bar(x, featureImportancesSorted) + plt.title("Importance depending on feature") + fig.savefig(directory + "feature_importances.png", transparent=True) + plt.close() + featuresImportancesDict = dict((featureIndex, featureImportance) + for featureIndex, featureImportance in + enumerate(featureImportances) + if featureImportance != 0) + with open(directory + 'feature_importances.pickle', 'wb') as handle: + pickle.dump(featuresImportancesDict, handle) + interpretString = "Feature importances : \n" + for featureIndex, featureImportance in zip(featureIndicesSorted, + featureImportancesSorted): + if featureImportance > 0: + interpretString += "- Feature index : " + str(featureIndex) + \ + ", feature importance : " + str( + featureImportance) + "\n" + return interpretString + + def get_name_for_fusion(self): + return self.__class__.__name__[:4] + + def getInterpret(self, directory, y_test): + return "" + + def accepts_multi_class(self, random_state, n_samples=10, dim=2, + n_classes=3): + if int(n_samples / n_classes) < 1: + raise ValueError( + 
"n_samples ({}) / n_classe ({}) must be over 1".format( + n_samples, + n_classes)) + fake_mc_X = random_state.random_integers(low=0, high=100, + size=(n_samples, dim)) + fake_mc_y = [class_index + for _ in range(int(n_samples / n_classes)) + for class_index in range(n_classes)] + fake_mc_y += [0 for _ in range(n_samples % n_classes)] + try: + self.fit(fake_mc_X, fake_mc_y) + self.predict(fake_mc_X) + return True + except ValueError: + return False + + +def get_names(classed_list): + return np.array([object_.__class__.__name__ for object_ in classed_list]) + +def percent(x, pos): + """Used to print percentage of importance on the y axis""" + return '%1.1f %%' % (x * 100) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/utils/execution.py b/multiview_platform/mono_multi_view_classifiers/utils/execution.py index 45ef0a698dda02695f702594f64374db9b1386cd..ec5775af94fea8e48fa57e78924c4a9bf8eb78fe 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/execution.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/execution.py @@ -326,12 +326,12 @@ def find_dataset_names(path, type, names): return names -def gen_argument_dictionaries(labels_dictionary, directories, multiclass_labels, - labels_combinations, indices_multiclass, +def gen_argument_dictionaries(labels_dictionary, directories, + splits, hyper_param_search, args, k_folds, stats_iter_random_states, metrics, argument_dictionaries, - benchmark, nb_views, views, views_indices): + benchmark, views, views_indices): r"""Used to generate a dictionary for each benchmark. One for each label combination (if multiclass), for each statistical iteration, generates an dictionary with @@ -379,30 +379,20 @@ def gen_argument_dictionaries(labels_dictionary, directories, multiclass_labels, """ benchmark_argument_dictionaries = [] - for combination_index, labels_combination in enumerate(labels_combinations): - for iter_index, iterRandomState in enumerate(stats_iter_random_states): - benchmark_argument_dictionary = { - "labels_dictionary": {0: labels_dictionary[labels_combination[0]], - 1: labels_dictionary[ - labels_combination[1]]}, - "directory": os.path.join(directories[iter_index], - labels_dictionary[labels_combination[0]] + - "-vs-" + - labels_dictionary[labels_combination[1]]), - "classification_indices": [ - indices_multiclass[combination_index][0][iter_index], - indices_multiclass[combination_index][1][iter_index], - indices_multiclass[combination_index][2][iter_index]], - "args": args, - "labels": multiclass_labels[combination_index], - "k_folds": k_folds[iter_index], - "random_state": iterRandomState, - "hyper_param_search": hyper_param_search, - "metrics": metrics, - "argument_dictionaries": argument_dictionaries, - "benchmark": benchmark, - "views": views, - "views_indices": views_indices, - "flag": [iter_index, labels_combination]} - benchmark_argument_dictionaries.append(benchmark_argument_dictionary) + for iter_index, iterRandomState in enumerate(stats_iter_random_states): + benchmark_argument_dictionary = { + "labels_dictionary": labels_dictionary, + "directory": directories[iter_index], + "classification_indices": splits[iter_index], + "args": args, + "k_folds": k_folds[iter_index], + "random_state": iterRandomState, + "hyper_param_search": hyper_param_search, + "metrics": metrics, + "argument_dictionaries": argument_dictionaries, + "benchmark": benchmark, + "views": views, + "views_indices": views_indices, + "flag": iter_index} + 
benchmark_argument_dictionaries.append(benchmark_argument_dictionary) return benchmark_argument_dictionaries diff --git a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py index f23ddcc7b6a4dfcf1a5f5644eeedc11f0b54614b..295e272ce3896d9e1b31520007951726f6b90497 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py @@ -6,7 +6,7 @@ import numpy as np from scipy.stats import randint, uniform from sklearn.model_selection import RandomizedSearchCV - +from .multiclass import get_mc_estim from .. import metrics @@ -112,7 +112,8 @@ def randomized_search(X, y, framework, random_state, output_file_name, classifie equivalent_draws=True): estimator = getattr(classifier_module, classifier_name)(random_state=random_state, **classifier_kwargs) - params_dict = estimator.genDistribs() + params_dict = estimator.gen_distribs() + estimator = get_mc_estim(estimator, random_state) if params_dict: metric_module = getattr(metrics, metric[0]) if metric[1] is not None: @@ -126,15 +127,15 @@ def randomized_search(X, y, framework, random_state, output_file_name, classifie [min(nb_possible_combination, n_iter) for nb_possible_combination in nb_possible_combinations]) random_search = MultiviewCompatibleRandomizedSearchCV(estimator, - n_iter=int(np.sum(min_list)), - param_distributions=params_dict, - refit=True, - n_jobs=nb_cores, scoring=scorer, - cv=folds, random_state=random_state, - learning_indices=learning_indices, - view_indices=view_indices, - framework = framework, - equivalent_draws=equivalent_draws) + n_iter=int(np.sum(min_list)), + param_distributions=params_dict, + refit=True, + n_jobs=nb_cores, scoring=scorer, + cv=folds, random_state=random_state, + learning_indices=learning_indices, + view_indices=view_indices, + framework = framework, + equivalent_draws=equivalent_draws) random_search.fit(X, y) best_params = random_search.best_params_ if "random_state" in best_params: diff --git a/multiview_platform/mono_multi_view_classifiers/utils/multiclass.py b/multiview_platform/mono_multi_view_classifiers/utils/multiclass.py index 2e525f2983472feb1b78089bb06f5f7ddd55314d..e7b377883e737d9a0f2172449edda5f474cc797a 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/multiclass.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/multiclass.py @@ -1,7 +1,9 @@ import itertools - +from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier import numpy as np +from .base import BaseClassifier + def gen_multiclass_labels(labels, multiclass_method, splits): r"""Used to gen the train/test splits and to set up the framework of the adaptation of a multiclass dataset @@ -97,3 +99,38 @@ def is_biclass(multiclass_preds): return True else: return False + + +def get_mc_estim(estimator, random_state): + # print(estimator.accepts_multi_class(random_state)) + if not estimator.accepts_multi_class(random_state): + if hasattr(estimator, "predict_proba"): + estimator = OVRWrapper(estimator) + print(estimator.get_params().keys()) + else: + estimator = OneVsOneClassifier(estimator) + return estimator + +class MCWrapper(): + + def set_params(self, **params): + self.estimator.set_params(**params) + return self + + def get_config(self): + return self.estimator.get_config() + + def get_interpret(self, output_file_name, y_test): + return self.estimator.get_interpret(output_file_name, 
y_test, + multi_class=True) + +# +# +class OVRWrapper(MCWrapper, OneVsOneClassifier): + + pass + + +class OVOWrapper(MCWrapper, BaseClassifier): + + pass diff --git a/multiview_platform/mono_multi_view_classifiers/utils/multiview_result_analysis.py b/multiview_platform/mono_multi_view_classifiers/utils/multiview_result_analysis.py index 372f62116eb4d2305f7cf5df4596fd02c26a3bc2..df3e59727fadf66b6c2836f59afc546a57cd69e4 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/multiview_result_analysis.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/multiview_result_analysis.py @@ -14,7 +14,7 @@ def print_metric_score(metric_scores, metrics): enumerate(metric[1])) else: metric_kwargs = {} - metric_score_string += "\tFor " + metric_module.getConfig( + metric_score_string += "\tFor " + metric_module.get_config( **metric_kwargs) + " : " metric_score_string += "\n\t\t- Score on train : " + str( metric_scores[metric[0]][0]) diff --git a/multiview_platform/tests/test_ResultAnalysis.py b/multiview_platform/tests/test_ResultAnalysis.py index 90f45733b7fab2ad7f7807adb7f1a820c03413ad..26b32efb6c965aafd7c9d17673efb75ac3bf81b1 100644 --- a/multiview_platform/tests/test_ResultAnalysis.py +++ b/multiview_platform/tests/test_ResultAnalysis.py @@ -47,20 +47,30 @@ class Test_get_metrics_scores_biclass(unittest.TestCase): np.testing.assert_array_equal(np.array(metrics_scores["f1_score"].columns), np.array(["ada-0"])) - def multiple_monoview_classifiers(self): + def test_multiple_monoview_classifiers(self): metrics = [["accuracy_score"], ["f1_score"]] - results = [MonoviewResult(0, - "ada", - "0", - {"accuracy_score": [0.9, 0.95], - "f1_score": [0.91, 0.96]} - , "", "", "", ""), - MonoviewResult(0, - "dt", - "1", - {"accuracy_score": [0.8, 0.85], - "f1_score": [0.81, 0.86]} - , "", "", "", "") + results = [MonoviewResult(view_index=0, + classifier_name="ada", + view_name="0", + metrics_scores={"accuracy_score": [0.9, 0.95], + "f1_score": [0.91, 0.96]}, + full_labels_pred="", + classifier_config="", + y_test_multiclass_pred="", + test_folds_preds="", + classifier="", + n_features=""), + MonoviewResult(view_index=0, + classifier_name="dt", + view_name="1", + metrics_scores={"accuracy_score": [0.8, 0.85], + "f1_score": [0.81, 0.86]}, + full_labels_pred="", + classifier_config="", + y_test_multiclass_pred="", + test_folds_preds="", + classifier="", + n_features="") ] metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, results) @@ -82,16 +92,21 @@ class Test_get_metrics_scores_biclass(unittest.TestCase): np.array(metrics_scores["f1_score"].columns), np.array(["ada-0", "dt-1"])) - def mutiview_result(self): + def test_mutiview_result(self): metrics = [["accuracy_score"], ["f1_score"]] results = [MultiviewResult("mv", "", {"accuracy_score": [0.7, 0.75], "f1_score": [0.71, 0.76]}, "", ""), - MonoviewResult(0, - "dt", - "1", - {"accuracy_score": [0.8, 0.85], - "f1_score": [0.81, 0.86]} - , "", "", "", "") + MonoviewResult(view_index=0, + classifier_name="dt", + view_name="1", + metrics_scores={"accuracy_score": [0.8, 0.85], + "f1_score": [0.81, 0.86]}, + full_labels_pred="", + classifier_config="", + y_test_multiclass_pred="", + test_folds_preds="", + classifier="", + n_features="") ] metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, results) @@ -126,7 +141,8 @@ class Test_get_example_errors_biclass(unittest.TestCase): "1", {"accuracy_score": [0.8, 0.85], "f1_score": [0.81, 0.86]} - , np.array([0,0,1,1,0,0,1,1,0]), "", "", "", "", "") + , 
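get_mc_estim, defined in the multiclass.py hunk above, only wraps estimators that cannot handle more than two classes, using a one-vs-rest style wrapper when predict_proba is available and one-vs-one otherwise. A minimal sketch of that selection logic using the plain sklearn meta-estimators, without the platform's MCWrapper mixin; the function name and the hard-coded accepts_multi_class flag are illustrative only:

    from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.linear_model import LogisticRegression

    def wrap_if_needed(estimator, accepts_multi_class):
        # Natively multiclass estimators are returned unchanged.
        if accepts_multi_class:
            return estimator
        if hasattr(estimator, "predict_proba"):
            return OneVsRestClassifier(estimator)  # probability-aware branch
        return OneVsOneClassifier(estimator)       # decision-function-only branch

    print(wrap_if_needed(LogisticRegression(), accepts_multi_class=True).__class__.__name__)
    print(wrap_if_needed(LinearSVC(), accepts_multi_class=False).__class__.__name__)
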
np.array([0,0,1,1,0,0,1,1,0]), "", "", + "", "", "") ] example_errors = result_analysis.get_example_errors_biclass(ground_truth, results) @@ -143,7 +159,8 @@ class Test_init_plot(unittest.TestCase): results = [] metric_name = "acc" data = np.random.RandomState(42).uniform(0,1,(2,2)) - metric_dataframe = pd.DataFrame(index=["train", "test"], columns=["dt-1", "mv"], data=data) + metric_dataframe = pd.DataFrame(index=["train", "test"], + columns=["dt-1", "mv"], data=data) directory = "dir" database_name = 'db' labels_names = ['lb1', "lb2"] @@ -160,7 +177,8 @@ class Test_init_plot(unittest.TestCase): np.testing.assert_array_equal(test, data[1, :]) np.testing.assert_array_equal(classifier_names, np.array(["dt-1", "mv"])) self.assertEqual(nb_results, 2) - self.assertEqual(results, [["dt-1", "acc", data[1,0], 0], ["mv", "acc", data[1,1], 0]]) + self.assertEqual(results, [["dt-1", "acc", data[1,0], 0], + ["mv", "acc", data[1,1], 0]]) class Test_gen_error_data(unittest.TestCase): @@ -182,7 +200,7 @@ class Test_gen_error_data(unittest.TestCase): class Test_format_previous_results(unittest.TestCase): def test_simple(self): - biclass_results = {"01":{"metrics_scores":[], "example_errors":[], "feature_importances":[], "labels":[]}} + biclass_results = {"metrics_scores":[], "example_errors":[], "feature_importances":[], "labels":[]} random_state = np.random.RandomState(42) # Gen metrics data @@ -192,8 +210,8 @@ class Test_format_previous_results(unittest.TestCase): columns=["ada-1", "mv"]) metric_2_df = pd.DataFrame(data=metrics_2_data, index=["train", "test"], columns=["ada-1", "mv"]) - biclass_results["01"]["metrics_scores"].append({"acc": metric_1_df}) - biclass_results["01"]["metrics_scores"].append({"acc": metric_2_df}) + biclass_results["metrics_scores"].append({"acc": metric_1_df}) + biclass_results["metrics_scores"].append({"acc": metric_2_df}) # Gen error data ada_error_data_1 = random_state.randint(0,2,7) @@ -202,12 +220,12 @@ class Test_format_previous_results(unittest.TestCase): mv_error_data_1 = random_state.randint(0, 2, 7) mv_error_data_2 = random_state.randint(0, 2, 7) mv_sum = mv_error_data_1+mv_error_data_2 - biclass_results["01"]["example_errors"].append({}) - biclass_results["01"]["example_errors"].append({}) - biclass_results["01"]["example_errors"][0]["ada-1"] = ada_error_data_1 - biclass_results["01"]["example_errors"][0]["mv"] = mv_error_data_1 - biclass_results["01"]["example_errors"][1]["ada-1"] = ada_error_data_2 - biclass_results["01"]["example_errors"][1]["mv"] = mv_error_data_2 + biclass_results["example_errors"].append({}) + biclass_results["example_errors"].append({}) + biclass_results["example_errors"][0]["ada-1"] = ada_error_data_1 + biclass_results["example_errors"][0]["mv"] = mv_error_data_1 + biclass_results["example_errors"][1]["ada-1"] = ada_error_data_2 + biclass_results["example_errors"][1]["mv"] = mv_error_data_2 # Running the function metric_analysis, error_analysis, feature_importances, feature_stds,labels = result_analysis.format_previous_results(biclass_results) @@ -223,16 +241,16 @@ class Test_format_previous_results(unittest.TestCase): columns=["ada-1", "mvm"]) # Testing - np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["train"], + np.testing.assert_array_equal(metric_analysis["acc"]["mean"].loc["train"], mean_df.loc["train"]) - np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["test"], + np.testing.assert_array_equal(metric_analysis["acc"]["mean"].loc["test"], mean_df.loc["test"]) - 
np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["train"], + np.testing.assert_array_equal(metric_analysis["acc"]["std"].loc["train"], std_df.loc["train"]) - np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["test"], + np.testing.assert_array_equal(metric_analysis["acc"]["std"].loc["test"], std_df.loc["test"]) - np.testing.assert_array_equal(ada_sum, error_analysis["01"]["ada-1"]) - np.testing.assert_array_equal(mv_sum, error_analysis["01"]["mv"]) + np.testing.assert_array_equal(ada_sum, error_analysis["ada-1"]) + np.testing.assert_array_equal(mv_sum, error_analysis["mv"]) class Test_gen_error_data_glob(unittest.TestCase):