diff --git a/config_files/config_test.yml b/config_files/config_test.yml index ac44a417ed854d4072fee8913c681493ebeed117..572fa46ece7ff760a92c41337f27ad5da72c34bb 100644 --- a/config_files/config_test.yml +++ b/config_files/config_test.yml @@ -1,14 +1,14 @@ # The base configuration of the benchmark log: True -name: ["digits",] +name: ["generated_dset",] label: "_" file_type: ".hdf5" views: -pathf: "/home/baptiste/Documents/Datasets/Digits/" +pathf: "/home/baptiste/Documents/Gitwork/multiview_generator/demo/" nice: 0 random_state: 42 nb_cores: 1 -full: False +full: True debug: True add_noise: False noise_std: 0.0 @@ -17,76 +17,63 @@ track_tracebacks: False # All the classification-realted configuration options multiclass_method: "oneVersusOne" -split: 0.49 +split: 0.25 nb_folds: 2 -nb_class: 2 +nb_class: classes: -type: ["multiview"] +type: ["monoview", "multiview"] algos_monoview: ["decision_tree" ] -algos_multiview: ["weighted_linear_early_fusion",] -stats_iter: 2 +algos_multiview: ["weighted_linear_early_fusion","weighted_linear_late_fusion", "mumbo", "mucombo"] +stats_iter: 15 metrics: ["accuracy_score", "f1_score"] metric_princ: "accuracy_score" -hps_type: "None" +hps_type: "Random" hps_args: - n_iter: 4 + n_iter: 10 equivalent_draws: False - decision_tree: - max_depth: [1,2,3,4,5] - - - weighted_linear_early_fusion: - view_weights: [null] - monoview_classifier: ["decision_tree"] - monoview_classifier__max_depth: [1,2] - +weighted_linear_early_fusion: + view_weights: null + monoview_classifier_name: "decision_tree" + monoview_classifier_config: + decision_tree: + max_depth: 12 + criterion: "gini" + splitter: "best" +weighted_linear_late_fusion: + weights: null + classifiers_names: "decision_tree" + classifier_configs: + decision_tree: + max_depth: 3 + criterion: "gini" + splitter: "best" +decision_tree: + max_depth: 3 ###################################### ## The Monoview Classifier arguments # ###################################### -<<<<<<< HEAD - mumbo: base_estimator__criterion: 'gini' - base_estimator__max_depth: 4 + base_estimator__max_depth: 3 base_estimator__random_state: None base_estimator__splitter: 'best' best_view_mode: 'edge' - n_estimators: 50 -======= -weighted_linear_early_fusion: - monoview_classifier_config: - init: CustomDecisionTree - init__ccp_alpha: 0.0 - init__class_weight: null - init__criterion: gini - init__max_depth: 1 - init__max_features: null - init__max_leaf_nodes: null - init__min_impurity_decrease: 0.0 - init__min_impurity_split: null - init__min_samples_leaf: 1 - init__min_samples_split: 2 - init__min_weight_fraction_leaf: 0.0 - init__presort: deprecated - init__random_state: null - init__splitter: best - loss: exponential - max_depth: 5 - n_estimators: 412 - monoview_classifier_name: gradient_boosting - view_weights: - - 0.16666666666666669 - - 0.16666666666666669 - - 0.16666666666666669 - - 0.16666666666666669 - - 0.16666666666666669 - - 0.16666666666666669 ->>>>>>> develop + base_estimator: 'decision_tree' + n_estimators: 10 + +mucombo: + base_estimator__criterion: 'gini' + base_estimator__max_depth: 3 + base_estimator__random_state: None + base_estimator__splitter: 'best' + best_view_mode: 'edge' + base_estimator: 'decision_tree' + n_estimators: 10 # #random_forest: # n_estimators: [25] @@ -116,10 +103,6 @@ weighted_linear_early_fusion: # n_iterations: [50] # n_stumps: [1] # -#decision_tree: -# max_depth: [2] -# criterion: ["gini"] -# splitter: ["best"] # #decision_tree_pregen: # max_depth: [10] diff --git 
a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py index acb47b7511685a5eb947d9c9e97e68de4afaae0f..2860d588471ec22904db1ada443c9557a573dce1 100644 --- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py +++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py @@ -907,7 +907,6 @@ def exec_classif(arguments): args["add_noise"], noise_std) args["name"] = datasetname - splits = execution.gen_splits(dataset_var.get_labels(), args["split"], stats_iter_random_states) diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/analyze_result.py b/multiview_platform/mono_multi_view_classifiers/monoview/analyze_result.py deleted file mode 100644 index 04c405b0d7edeedf50238534d6c54a648da333ee..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview/analyze_result.py +++ /dev/null @@ -1,103 +0,0 @@ -# from datetime import timedelta as hms -# -# from .. import metrics -# from ..utils.base import get_metric -# -# -# def get_db_config_string(name, feat, classification_indices, shape, -# class_labels_names, k_folds): -# """ -# -# Parameters -# ---------- -# name -# feat -# classification_indices -# shape -# class_labels_names -# k_folds -# -# Returns -# ------- -# -# """ -# learning_rate = float(len(classification_indices[0])) / ( -# len(classification_indices[0]) + len(classification_indices[1])) -# db_config_string = "Database configuration : \n" -# db_config_string += "\t- Database name : " + name + "\n" -# db_config_string += "\t- View name : " + feat + "\t View shape : " + str( -# shape) + "\n" -# db_config_string += "\t- Learning Rate : " + str(learning_rate) + "\n" -# db_config_string += "\t- Labels used : " + ", ".join( -# class_labels_names) + "\n" -# db_config_string += "\t- Number of cross validation folds : " + str( -# k_folds.n_splits) + "\n\n" -# return db_config_string -# -# -# def get_classifier_config_string(grid_search, nb_cores, n_iter, cl_kwargs, -# classifier, -# output_file_name, y_test): -# classifier_config_string = "Classifier configuration : \n" -# classifier_config_string += "\t- " + classifier.get_config()[5:] + "\n" -# classifier_config_string += "\t- Executed on " + str( -# nb_cores) + " core(s) \n" -# if grid_search: -# classifier_config_string += "\t- Got configuration using randomized search with " + str( -# n_iter) + " iterations \n" -# classifier_config_string += "\n\n" -# classifier_interpret_string = classifier.get_interpretation( -# output_file_name, -# y_test) -# return classifier_config_string, classifier_interpret_string -# -# -# def get_metric_score(metric, y_train, y_train_pred, y_test, y_test_pred): -# metric_module = getattr(metrics, metric[0]) -# if metric[1] is not None: -# metric_kwargs = dict((index, metricConfig) for index, metricConfig in -# enumerate(metric[1])) -# else: -# metric_kwargs = {} -# metric_score_train = metric_module.score(y_train, y_train_pred) -# metric_score_test = metric_module.score(y_test, y_test_pred) -# metric_score_string = "\tFor " + metric_module.get_config( -# **metric_kwargs) + " : " -# metric_score_string += "\n\t\t- Score on train : " + str(metric_score_train) -# metric_score_string += "\n\t\t- Score on test : " + str(metric_score_test) -# metric_score_string += "\n" -# return metric_score_string, [metric_score_train, metric_score_test] -# -# -# def execute(name, learning_rate, k_folds, nb_cores, grid_search, metrics_list, -# n_iter, -# feat, cl_type, cl_kwargs, 
class_labels_names, -# shape, y_train, y_train_pred, y_test, y_test_pred, time, -# random_state, classifier, output_file_name): -# metric_module, metric_kwargs = get_metric(metrics_list) -# train_score = metric_module.score(y_train, y_train_pred) -# test_score = metric_module.score(y_test, y_test_pred) -# string_analysis = "Classification on " + name + " database for " + feat + " with " + cl_type + ".\n\n" -# string_analysis += metrics_list[0][0] + " on train : " + str( -# train_score) + "\n" + \ -# metrics_list[0][0] + " on test : " + str( -# test_score) + "\n\n" -# string_analysis += get_db_config_string(name, feat, learning_rate, shape, -# class_labels_names, k_folds) -# classifier_config_string, classifier_intepret_string = get_classifier_config_string( -# grid_search, nb_cores, n_iter, cl_kwargs, classifier, output_file_name, -# y_test) -# string_analysis += classifier_config_string -# metrics_scores = {} -# for metric in metrics_list: -# metric_string, metric_score = get_metric_score(metric, y_train, -# y_train_pred, y_test, -# y_test_pred) -# string_analysis += metric_string -# metrics_scores[metric[0]] = metric_score -# string_analysis += "\n\n Classification took " + str(hms(seconds=int(time))) -# string_analysis += "\n\n Classifier Interpretation : \n" -# string_analysis += classifier_intepret_string -# -# image_analysis = {} -# return string_analysis, image_analysis, metrics_scores diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py index 0e1cd183b0e19ce4bf5b1d0dd015cce845376388..3ad80ca2ac5dab2de39af6b358286c90823857f3 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py @@ -143,15 +143,14 @@ def exec_monoview(directory, X, Y, database_name, labels_names, classification_i metrics_list=metrics, n_iter=n_iter, class_label_names=labels_names, - train_pred=train_pred, - test_pred=test_pred, + pred=full_pred, directory=directory, base_file_name=base_file_name, labels=Y, database_name=database_name, nb_cores=nb_cores, duration=whole_duration) - string_analysis, images_analysis, metrics_scores = result_analyzer.analyze() + string_analysis, images_analysis, metrics_scores, class_metrics_scores = result_analyzer.analyze() logging.debug("Done:\t Getting results") logging.debug("Start:\t Saving preds") @@ -163,7 +162,7 @@ def exec_monoview(directory, X, Y, database_name, labels_names, classification_i return MonoviewResult(view_index, classifier_name, view_name, metrics_scores, full_pred, cl_kwargs, classifier, X_train.shape[1], - hyper_param_duration, fit_duration, pred_duration) + hyper_param_duration, fit_duration, pred_duration, class_metrics_scores) def init_constants(args, X, classification_indices, labels_names, diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py index 84540ab38d984bdafba615f235477e690d0a1ce4..a84fe0ef6772ba4fa254a24c4a0582617cc05315 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py @@ -159,7 +159,7 @@ class MonoviewResult(object): def __init__(self, view_index, classifier_name, view_name, metrics_scores, full_labels_pred, classifier_config, classifier, n_features, hps_duration, 
fit_duration, - pred_duration): + pred_duration, class_metric_scores): self.view_index = view_index self.classifier_name = classifier_name self.view_name = view_name @@ -171,6 +171,7 @@ class MonoviewResult(object): self.hps_duration = hps_duration self.fit_duration = fit_duration self.pred_duration = pred_duration + self.class_metric_scores = class_metric_scores def get_classifier_name(self): return self.classifier_name + "-" + self.view_name @@ -208,11 +209,11 @@ class MonoviewResultAnalyzer(ResultAnalyser): def __init__(self, view_name, classifier_name, shape, classifier, classification_indices, k_folds, hps_method, metrics_list, - n_iter, class_label_names, train_pred, test_pred, + n_iter, class_label_names, pred, directory, base_file_name, labels, database_name, nb_cores, duration): ResultAnalyser.__init__(self, classifier, classification_indices, k_folds, hps_method, metrics_list, n_iter, - class_label_names, train_pred, test_pred, + class_label_names, pred, directory, base_file_name, labels, database_name, nb_cores, duration) self.view_name = view_name diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py deleted file mode 100644 index b24cdc3fcfb17c01e975bca843ce8653b02efc31..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py +++ /dev/null @@ -1,152 +0,0 @@ -# from .. import metrics -# -# from ..utils.base import get_metric -# -# # Author-Info -# __author__ = "Baptiste Bauvin" -# __status__ = "Prototype" # Production, Development, Prototype -# -# -# def print_metric_score(metric_scores, metric_list): -# """ -# this function print the metrics scores -# -# Parameters -# ---------- -# metric_scores : the score of metrics -# -# metric_list : list of metrics -# -# Returns -# ------- -# metric_score_string string constaining all metric results -# """ -# metric_score_string = "\n\n" -# for metric in metric_list: -# metric_module = getattr(metrics, metric[0]) -# if metric[1] is not None: -# metric_kwargs = dict( -# (index, metricConfig) for index, metricConfig in -# enumerate(metric[1])) -# else: -# metric_kwargs = {} -# metric_score_string += "\tFor " + metric_module.get_config( -# **metric_kwargs) + " : " -# metric_score_string += "\n\t\t- Score on train : " + str( -# metric_scores[metric[0]][0]) -# metric_score_string += "\n\t\t- Score on test : " + str( -# metric_scores[metric[0]][1]) -# metric_score_string += "\n\n" -# return metric_score_string -# -# -# def get_total_metric_scores(metric, train_labels, test_labels, -# validation_indices, -# learning_indices, labels): -# """ -# -# Parameters -# ---------- -# -# metric : -# -# train_labels : labels of train -# -# test_labels : labels of test -# -# validation_indices : -# -# learning_indices : -# -# labels : -# -# Returns -# ------- -# list of [train_score, test_score] -# """ -# metric_module = getattr(metrics, metric[0]) -# if metric[1] is not None: -# metric_kwargs = dict((index, metricConfig) for index, metricConfig in -# enumerate(metric[1])) -# else: -# metric_kwargs = {} -# train_score = metric_module.score(labels[learning_indices], train_labels, -# **metric_kwargs) -# test_score = metric_module.score(labels[validation_indices], test_labels, -# **metric_kwargs) -# return [train_score, test_score] -# -# -# def get_metrics_scores(metrics, train_labels, test_labels, -# validation_indices, learning_indices, labels): -# metrics_scores = {} -# for metric 
in metrics: -# metrics_scores[metric[0]] = get_total_metric_scores(metric, -# train_labels, -# test_labels, -# validation_indices, -# learning_indices, -# labels) -# return metrics_scores -# -# -# def execute(classifier, pred_train_labels, pred_test_labels, -# classification_indices, labels_dictionary, views, name, k_folds, -# metrics_list, labels, directory): -# """ -# -# Parameters -# ---------- -# classifier : classifier used -# -# pred_train_labels : labels of train -# -# pred_test_labels : labels of test -# -# classification_indices -# -# labels_dictionary -# -# views -# -# name -# -# k_folds -# -# metrics_list -# -# labels -# -# Returns -# ------- -# return tuple of (string_analysis, images_analysis, metricsScore) -# """ -# classifier_name = classifier.short_name -# learning_indices, validation_indices = classification_indices -# metric_module, metric_kwargs = get_metric(metrics_list) -# score_on_train = metric_module.score(labels[learning_indices], -# pred_train_labels, -# **metric_kwargs) -# score_on_test = metric_module.score(labels[validation_indices], -# pred_test_labels, **metric_kwargs) -# -# string_analysis = "\t\tResult for multiview classification with " + classifier_name + \ -# "\n\n" + metrics_list[0][0] + " :\n\t-On Train : " + str( -# score_on_train) + "\n\t-On Test : " + str( -# score_on_test) + \ -# "\n\nDataset info :\n\t-Database name : " + name + "\n\t-Labels : " + \ -# ', '.join( -# labels_dictionary.values()) + "\n\t-Views : " + ', '.join( -# views) + "\n\t-" + str( -# k_folds.n_splits) + \ -# " folds\n\nClassification configuration : \n\t-Algorithm used : " + classifier_name + " with : " + classifier.get_config() -# -# metrics_scores = get_metrics_scores(metrics_list, pred_train_labels, -# pred_test_labels, -# validation_indices, learning_indices, -# labels) -# string_analysis += print_metric_score(metrics_scores, metrics_list) -# string_analysis += "\n\n Interpretation : \n\n" + classifier.get_interpretation( -# directory, labels) -# images_analysis = {} -# return string_analysis, images_analysis, metrics_scores diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py index f1b23d9a5ceb3e6bf26505e05841b943d43d61f1..deebc7255491ab89688e7d33062b174fdd68d08d 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py @@ -311,9 +311,9 @@ def exec_multiview(directory, dataset_var, name, classification_indices, example_indices=validation_indices, view_indices=views_indices) pred_duration = time.monotonic() - pred_beg - full_labels = np.zeros(dataset_var.get_labels().shape, dtype=int) - 100 - full_labels[learning_indices] = train_pred - full_labels[validation_indices] = test_pred + full_pred = np.zeros(dataset_var.get_labels().shape, dtype=int) - 100 + full_pred[learning_indices] = train_pred + full_pred[validation_indices] = test_pred logging.info("Done:\t Pertidcting") whole_duration = time.time() - t_start @@ -332,15 +332,14 @@ def exec_multiview(directory, dataset_var, name, classification_indices, metrics_list=metrics, n_iter=n_iter, class_label_names=list(labels_dictionary.values()), - train_pred=train_pred, - test_pred=test_pred, + pred=full_pred, directory=directory, base_file_name=base_file_name, labels=labels, database_name=dataset_var.get_name(), nb_cores=nb_cores, duration=whole_duration) - string_analysis, images_analysis, metrics_scores = 
result_analyzer.analyze() + string_analysis, images_analysis, metrics_scores, class_metrics_scores = result_analyzer.analyze() logging.info("Done:\t Result Analysis for " + cl_type) logging.debug("Start:\t Saving preds") @@ -348,5 +347,5 @@ def exec_multiview(directory, dataset_var, name, classification_indices, logging.debug("Start:\t Saving preds") return MultiviewResult(cl_type, classifier_config, metrics_scores, - full_labels, hps_duration, fit_duration, - pred_duration) + full_pred, hps_duration, fit_duration, + pred_duration, class_metrics_scores) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py index 46ffe37a843c90b39d9bfede14a191484de78f1c..38ffe337bde061d5d61fc1ab289e0b2ba7c18ddf 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py @@ -1,6 +1,8 @@ from abc import abstractmethod import numpy as np +from sklearn.tree import DecisionTreeClassifier +from sklearn.base import BaseEstimator from .. import monoview_classifiers from ..utils.base import BaseClassifier, ResultAnalyser @@ -31,6 +33,34 @@ class BaseMultiviewClassifier(BaseClassifier): self.weird_strings = {} self.used_views = None + def set_base_estim_from_dict(self, base_estim_dict, **kwargs): + if base_estim_dict is None: + base_estimator = DecisionTreeClassifier() + elif isinstance(base_estim_dict, str) and kwargs is not None: + estim_name = base_estim_dict + estim_module = getattr(monoview_classifiers, estim_name) + estim_class = getattr(estim_module, + estim_module.classifier_class_name) + base_estim_params = {} + for key, value in kwargs.items(): + key, delim, sub_key = key.partition('__') + if key == "base_estimator": + base_estim_params[sub_key] = value + base_estimator = estim_class(**base_estim_params) + elif isinstance(base_estim_dict, dict): + estim_name = next(iter(base_estim_dict)) + estim_module = getattr(monoview_classifiers, estim_name) + estim_class = getattr(estim_module, + estim_module.classifier_class_name) + base_estimator = estim_class(**base_estim_dict[estim_name]) + elif isinstance(base_estim_dict, BaseEstimator): + base_estimator = base_estim_dict + else: + raise ValueError("base_estimator should be either None, a dictionary" + " or a BaseEstimator child object, " + "here it is {}".format(type(base_estim_dict))) + return base_estimator + @abstractmethod def fit(self, X, y, train_indices=None, view_indices=None): pass @@ -152,7 +182,7 @@ from .. 
import multiview_classifiers class MultiviewResult(object): def __init__(self, classifier_name, classifier_config, metrics_scores, full_labels, hps_duration, fit_duration, - pred_duration): + pred_duration, class_metric_scores): self.classifier_name = classifier_name self.classifier_config = classifier_config self.metrics_scores = metrics_scores @@ -160,6 +190,7 @@ class MultiviewResult(object): self.hps_duration = hps_duration self.fit_duration = fit_duration self.pred_duration = pred_duration + self.class_metric_scores = class_metric_scores def get_classifier_name(self): try: @@ -177,13 +208,13 @@ class MultiviewResultAnalyzer(ResultAnalyser): def __init__(self, view_names, classifier, classification_indices, k_folds, hps_method, metrics_list, n_iter, class_label_names, - train_pred, test_pred, directory, base_file_name, labels, + pred, directory, base_file_name, labels, database_name, nb_cores, duration): if hps_method.endswith("equiv"): n_iter = n_iter*len(view_names) ResultAnalyser.__init__(self, classifier, classification_indices, k_folds, hps_method, metrics_list, n_iter, class_label_names, - train_pred, test_pred, directory, + pred, directory, base_file_name, labels, database_name, nb_cores, duration) self.classifier_name = classifier.short_name diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mucombo.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mucombo.py index d787e41cf99c7b20d105e74c6b6a83462a4275a6..46ab01cdd8a5633c7ff4976450ea19a4f2f57843 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mucombo.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mucombo.py @@ -5,6 +5,7 @@ from multimodal.boosting.cumbo import MuCumboClassifier from ..multiview.multiview_utils import BaseMultiviewClassifier from ..utils.hyper_parameter_search import CustomRandint from ..utils.dataset import get_examples_views_indices +from ..utils.base import base_boosting_estimators classifier_class_name = "MuCumbo" @@ -13,13 +14,14 @@ class MuCumbo(BaseMultiviewClassifier, MuCumboClassifier): def __init__(self, base_estimator=None, n_estimators=50, - random_state=None,): + random_state=None,**kwargs): BaseMultiviewClassifier.__init__(self, random_state) + base_estimator = self.set_base_estim_from_dict(base_estimator, **kwargs) MuCumboClassifier.__init__(self, base_estimator=base_estimator, n_estimators=n_estimators, random_state=random_state,) self.param_names = ["base_estimator", "n_estimators", "random_state",] - self.distribs = [[DecisionTreeClassifier(max_depth=1)], + self.distribs = [base_boosting_estimators, CustomRandint(5,200), [random_state],] def fit(self, X, y, train_indices=None, view_indices=None): diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py index 024c0e7cd75aaee29c9cd61e5d67a1394e79ab27..0fc63fb4416bbd68a7af74e6d6cc6e4620dde32e 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py @@ -1,5 +1,4 @@ from sklearn.tree import DecisionTreeClassifier -from sklearn.base import BaseEstimator import numpy as np import os @@ -19,9 +18,9 @@ class Mumbo(BaseMultiviewClassifier, MumboClassifier): def __init__(self, base_estimator=None, n_estimators=50, random_state=None, - best_view_mode="edge"): + best_view_mode="edge", **kwargs): 
BaseMultiviewClassifier.__init__(self, random_state) - base_estimator = self.set_base_estim_from_dict(base_estimator) + base_estimator = self.set_base_estim_from_dict(base_estimator, **kwargs) MumboClassifier.__init__(self, base_estimator=base_estimator, n_estimators=n_estimators, random_state=random_state, @@ -30,23 +29,6 @@ class Mumbo(BaseMultiviewClassifier, MumboClassifier): self.distribs = [base_boosting_estimators, CustomRandint(5,200), [random_state], ["edge", "error"]] - def set_base_estim_from_dict(self, base_estim_dict): - if base_estim_dict is None: - base_estimator = DecisionTreeClassifier() - elif isinstance(base_estim_dict, dict): - estim_name = next(iter(base_estim_dict)) - estim_module = getattr(monoview_classifiers, estim_name) - estim_class = getattr(estim_module, - estim_module.classifier_class_name) - base_estimator = estim_class(**base_estim_dict[estim_name]) - elif isinstance(base_estim_dict, BaseEstimator): - base_estimator = base_estim_dict - else: - raise ValueError("base_estimator should be either None, a dictionary" - " or a BaseEstimator child object, " - "here it is {}".format(type(base_estim_dict))) - return base_estimator - def set_params(self, base_estimator=None, **params): """ Sets the base estimator from a dict. diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/execution.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/execution.py index c62425c945d6f26747d67d860b89155913a33fb8..88392cd673ef4a701b8e796f513d27f47db6001d 100644 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/execution.py +++ b/multiview_platform/mono_multi_view_classifiers/result_analysis/execution.py @@ -12,7 +12,7 @@ def analyze(results, stats_iter, benchmark_argument_dictionaries, """Used to analyze the results of the previous benchmarks""" data_base_name = benchmark_argument_dictionaries[0]["args"]["name"] - results_means_std, iter_results, flagged_failed = analyze_iterations( + results_means_std, iter_results, flagged_failed, label_names = analyze_iterations( results, benchmark_argument_dictionaries, stats_iter, metrics, example_ids, labels) if flagged_failed: @@ -21,7 +21,7 @@ def analyze(results, stats_iter, benchmark_argument_dictionaries, if stats_iter > 1: results_means_std = analyze_all( iter_results, stats_iter, directory, - data_base_name, example_ids) + data_base_name, example_ids, label_names) return results_means_std @@ -62,6 +62,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, """ logging.debug("Start:\t Analyzing all results") iter_results = {"metrics_scores": [i for i in range(stats_iter)], + "class_metrics_scores": [i for i in range(stats_iter)], "example_errors": [i for i in range(stats_iter)], "feature_importances": [i for i in range(stats_iter)], "durations":[i for i in range(stats_iter)]} @@ -69,22 +70,22 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, fig_errors = [] for iter_index, result, tracebacks in results: arguments = get_arguments(benchmark_argument_dictionaries, iter_index) + labels_names = list(arguments["labels_dictionary"].values()) - metrics_scores = get_metrics_scores(metrics, result) + metrics_scores, class_metric_scores = get_metrics_scores(metrics, result, labels_names) example_errors = get_example_errors(labels, result) feature_importances = get_feature_importances(result) durations = get_duration(result) directory = arguments["directory"] database_name = arguments["args"]["name"] - labels_names = 
[arguments["labels_dictionary"][0], - arguments["labels_dictionary"][1]] + flagged_tracebacks_list += publish_tracebacks(directory, database_name, labels_names, tracebacks, iter_index) res = publish_metrics_graphs(metrics_scores, directory, database_name, - labels_names) + labels_names, class_metric_scores) publish_example_errors(example_errors, directory, database_name, labels_names, example_ids, labels) publish_feature_importances(feature_importances, directory, @@ -92,6 +93,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, plot_durations(durations, directory, database_name) iter_results["metrics_scores"][iter_index] = metrics_scores + iter_results["class_metrics_scores"][iter_index] = class_metric_scores iter_results["example_errors"][iter_index] = example_errors iter_results["feature_importances"][iter_index] = feature_importances iter_results["labels"] = labels @@ -99,19 +101,20 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, logging.debug("Done:\t Analyzing all results") - return res, iter_results, flagged_tracebacks_list + return res, iter_results, flagged_tracebacks_list, labels_names + def analyze_all(iter_results, stats_iter, directory, data_base_name, - example_ids): + example_ids, label_names): """Used to format the results in order to plot the mean results on the iterations""" - metrics_analysis, error_analysis, feature_importances, \ + metrics_analysis, class_metrics_analysis, error_analysis, feature_importances, \ feature_importances_stds, labels, duration_means, \ duration_stds = format_previous_results(iter_results) - results = publish_all_metrics_scores(metrics_analysis, + results = publish_all_metrics_scores(metrics_analysis, class_metrics_analysis, directory, - data_base_name, stats_iter) + data_base_name, stats_iter, label_names) publish_all_example_errors(error_analysis, directory, stats_iter, example_ids, labels) publish_feature_importances(feature_importances, directory, @@ -166,6 +169,7 @@ def format_previous_results(iter_results_lists): """ metrics_analysis = {} + class_metrics_analysis = {} feature_importances_analysis = {} feature_importances_stds = {} @@ -186,6 +190,23 @@ def format_previous_results(iter_results_lists): metrics_analysis[metric_name][ "std"] = dataframe.groupby(dataframe.index).std(ddof=0) + class_metric_concat_dict = {} + for iter_index, class_metrics_score in enumerate( + iter_results_lists["class_metrics_scores"]): + for metric_name, dataframe in class_metrics_score.items(): + if metric_name not in class_metric_concat_dict: + class_metric_concat_dict[metric_name] = dataframe + else: + class_metric_concat_dict[metric_name] = pd.concat( + [class_metric_concat_dict[metric_name], dataframe]) + + for metric_name, dataframe in class_metric_concat_dict.items(): + class_metrics_analysis[metric_name] = {} + class_metrics_analysis[metric_name][ + "mean"] = dataframe.groupby(dataframe.index).mean() + class_metrics_analysis[metric_name][ + "std"] = dataframe.groupby(dataframe.index).std(ddof=0) + durations_df_concat = pd.DataFrame(dtype=float) for iter_index, durations_df in enumerate(iter_results_lists["durations"]): durations_df_concat = pd.concat((durations_df_concat, durations_df), @@ -220,6 +241,7 @@ def format_previous_results(iter_results_lists): else: added_example_errors[classifier_name] += errors error_analysis = added_example_errors - return metrics_analysis, error_analysis, feature_importances_analysis, \ + return metrics_analysis, class_metrics_analysis ,error_analysis, \ + 
feature_importances_analysis, \ feature_importances_stds, iter_results_lists["labels"], \ duration_means, duration_stds diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/metric_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/metric_analysis.py index 3ab3f8a156aac6632ec23e6dea08c448e9464156..9cb296f2dea29686416c36be6325e0a62102ec36 100644 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/metric_analysis.py +++ b/multiview_platform/mono_multi_view_classifiers/result_analysis/metric_analysis.py @@ -7,7 +7,7 @@ import logging from ..utils.organization import secure_file_path -def get_metrics_scores(metrics, results): +def get_metrics_scores(metrics, results, label_names): r"""Used to extract metrics scores in case of classification Parameters @@ -43,19 +43,32 @@ def get_metrics_scores(metrics, results): for metric in metrics) for metric in metrics: - for classifierResult in results: + for classifier_result in results: metrics_scores[metric[0]].loc[ - "train", classifierResult.get_classifier_name()] = \ - classifierResult.metrics_scores[metric[0]][0] + "train", classifier_result.get_classifier_name()] = \ + classifier_result.metrics_scores[metric[0]][0] metrics_scores[metric[0]].loc[ - "test", classifierResult.get_classifier_name()] = \ - classifierResult.metrics_scores[metric[0]][1] + "test", classifier_result.get_classifier_name()] = \ + classifier_result.metrics_scores[metric[0]][1] - return metrics_scores + class_metric_scores = dict((metric[0], pd.DataFrame(index=pd.MultiIndex.from_product([["train", "test"], label_names]), + columns=classifier_names, dtype=float)) + for metric in metrics) + for metric in metrics: + for classifier_result in results: + for label_index, label_name in enumerate(label_names): + class_metric_scores[metric[0]].loc[( + "train", label_name),classifier_result.get_classifier_name()] = \ + classifier_result.class_metric_scores[metric[0]][0][label_index] + class_metric_scores[metric[0]].loc[( + "test", label_name), classifier_result.get_classifier_name()] = \ + classifier_result.class_metric_scores[metric[0]][1][label_index] + + return metrics_scores, class_metric_scores def publish_metrics_graphs(metrics_scores, directory, database_name, - labels_names): + labels_names, class_metric_scores): r"""Used to sort the results (names and both scores) in descending test score order. 
@@ -76,24 +89,32 @@ def publish_metrics_graphs(metrics_scores, directory, database_name, results """ results = [] - for metric_name, metric_dataframe in metrics_scores.items(): + for metric_name in metrics_scores.keys(): logging.debug( - "Start:\t Biclass score graph generation for " + metric_name) + "Start:\t Score graph generation for " + metric_name) train_scores, test_scores, classifier_names, \ - file_name, nb_results, results = init_plot(results, metric_name, - metric_dataframe, directory, - database_name, labels_names) + file_name, nb_results, results,\ + class_test_scores = init_plot(results, metric_name, + metrics_scores[metric_name], + directory, + database_name, labels_names, + class_metric_scores[metric_name]) plot_metric_scores(train_scores, test_scores, classifier_names, nb_results, metric_name, file_name, tag=" " + " vs ".join(labels_names)) + + class_file_name = os.path.join(directory, database_name + "-" + + metric_name+"-class") + plot_class_metric_scores(class_test_scores, class_file_name, + labels_names, classifier_names, metric_name) logging.debug( - "Done:\t Biclass score graph generation for " + metric_name) + "Done:\t Score graph generation for " + metric_name) return results -def publish_all_metrics_scores(iter_results, directory, - data_base_name, stats_iter, +def publish_all_metrics_scores(iter_results, class_iter_results, directory, + data_base_name, stats_iter, label_names, min_size=10): results = [] secure_file_path(os.path.join(directory, "a")) @@ -101,27 +122,61 @@ def publish_all_metrics_scores(iter_results, directory, for metric_name, scores in iter_results.items(): train = np.array(scores["mean"].loc["train"]) test = np.array(scores["mean"].loc["test"]) - names = np.array(scores["mean"].columns) + classifier_names = np.array(scores["mean"].columns) train_std = np.array(scores["std"].loc["train"]) test_std = np.array(scores["std"].loc["test"]) - file_name = os.path.join(directory, data_base_name + "-Mean_on_" + str( + file_name = os.path.join(directory, data_base_name + "-mean_on_" + str( stats_iter) + "_iter-" + metric_name) - nbResults = names.shape[0] + nb_results = classifier_names.shape[0] - plot_metric_scores(train, test, names, nbResults, + plot_metric_scores(train, test, classifier_names, nb_results, metric_name, file_name, tag=" averaged", train_STDs=train_std, test_STDs=test_std) results += [[classifier_name, metric_name, test_mean, test_std] for classifier_name, test_mean, test_std - in zip(names, test, test_std)] + in zip(classifier_names, test, test_std)] + + for metric_name, scores in class_iter_results.items(): + test = np.array([np.array(scores["mean"].iloc[i, :]) for i in range(scores["mean"].shape[0]) if scores["mean"].iloc[i, :].name[0]=='test']) + classifier_names = np.array(scores["mean"].columns) + test_std = np.array([np.array(scores["std"].iloc[i, :]) for i in range(scores["std"].shape[0]) if scores["std"].iloc[i, :].name[0]=='test']) + + file_name = os.path.join(directory, data_base_name + "-mean_on_" + str( + stats_iter) + "_iter-" + metric_name+"-class") + + plot_class_metric_scores(test, file_name, label_names, classifier_names, metric_name, stds=test_std, tag="averaged") return results +# def publish_all_class_metrics_scores(iter_results, directory, +# data_base_name, stats_iter, +# min_size=10): +# results = [] +# secure_file_path(os.path.join(directory, "a")) +# +# for metric_name, scores in iter_results.items(): +# train = np.array(scores["mean"].loc["train"]) +# test = np.array(scores["mean"].loc["test"]) +# names = 
np.array(scores["mean"].columns) +# train_std = np.array(scores["std"].loc["train"]) +# test_std = np.array(scores["std"].loc["test"]) +# +# file_name = os.path.join(directory, data_base_name + "-mean_on_" + str( +# stats_iter) + "_iter-" + metric_name+"-class") +# +# plot_class_metric_scores(test, file_name, labels_names=names, file_name, tag=" averaged", +# train_STDs=train_std, test_STDs=test_std) +# results += [[classifier_name, metric_name, test_mean, test_std] +# for classifier_name, test_mean, test_std +# in zip(names, test, test_std)] +# return results + def init_plot(results, metric_name, metric_dataframe, - directory, database_name, labels_names): + directory, database_name, labels_names, class_metric_scores): train = np.array(metric_dataframe.loc["train"]) test = np.array(metric_dataframe.loc["test"]) + class_test = np.array(class_metric_scores.loc["test"]) classifier_names = np.array(metric_dataframe.columns) nb_results = metric_dataframe.shape[1] @@ -129,10 +184,12 @@ def init_plot(results, metric_name, metric_dataframe, file_name = os.path.join(directory, database_name + "-" + "_vs_".join( labels_names) + "-" + metric_name) - results += [[classifiers_name, metric_name, testMean, testSTD] - for classifiers_name, testMean, testSTD in - zip(classifier_names, test, np.zeros(len(test)))] - return train, test, classifier_names, file_name, nb_results, results + results += [[classifiers_name, metric_name, test_mean, test_std, class_mean] + for classifiers_name, test_mean, class_mean, test_std in + zip(classifier_names, test, np.transpose(class_test), + np.zeros(len(test)))] + return train, test, classifier_names, file_name, nb_results, results, \ + class_test def plot_metric_scores(train_scores, test_scores, names, nb_results, @@ -230,6 +287,28 @@ def plot_metric_scores(train_scores, test_scores, names, nb_results, del fig +def plot_class_metric_scores(class_test_scores, class_file_name, + labels_names, classifier_names, metric_name, + stds=None, tag=""): + fig = plotly.graph_objs.Figure() + for lab_index, scores in enumerate(class_test_scores): + if stds is None: + std = None + else: + std = stds[lab_index] + fig.add_trace(plotly.graph_objs.Bar( + name=labels_names[lab_index], + x=classifier_names, y=scores, + error_y=dict(type='data', array=std), + )) + fig.update_layout( + title=metric_name + "<br>" + tag + " scores for each classifier") + fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', + plot_bgcolor='rgba(0,0,0,0)') + plotly.offline.plot(fig, filename=class_file_name + ".html", auto_open=False) + del fig + + def get_fig_size(nb_results, min_size=15, multiplier=1.0, bar_width=0.35): r"""Used to get the image size to save the figure and the bar width, depending on the number of scores to plot. 
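Note on the metric_analysis.py additions above: each metric now gets a second DataFrame of per-class scores, indexed by a ("train"/"test", label) MultiIndex with one column per classifier, and plot_class_metric_scores renders its test rows as a grouped plotly bar chart. The following is a minimal, self-contained sketch of that data layout and rendering, not the platform's exact code; the label names, classifier names, and score values are illustrative only.

# Standalone sketch mirroring class_metric_scores / plot_class_metric_scores (illustrative names and values).
import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline

label_names = ["label_0", "label_1"]                        # hypothetical class names
classifier_names = ["decision_tree-ViewNumber0", "mumbo"]   # hypothetical classifiers

# One DataFrame per metric: rows indexed by (set, label), one column per classifier.
class_scores = pd.DataFrame(
    [[0.9, 0.8], [0.7, 0.85], [0.88, 0.79], [0.65, 0.81]],
    index=pd.MultiIndex.from_product([["train", "test"], label_names]),
    columns=classifier_names)

# Plot the test rows: one bar trace per class, one bar per classifier.
test_scores = class_scores.loc["test"]
fig = go.Figure()
for label_name in label_names:
    fig.add_trace(go.Bar(name=label_name,
                         x=classifier_names,
                         y=test_scores.loc[label_name]))
fig.update_layout(title="accuracy_score<br>per-class test scores")
plotly.offline.plot(fig, filename="accuracy_score-class.html", auto_open=False)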
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/base.py b/multiview_platform/mono_multi_view_classifiers/utils/base.py index 6530eeac96f599c3d55e9643ace5b07d56b2642b..013ca1b57da87dbdd5e0f900413a7bf680d7cb4b 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/base.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/base.py @@ -2,7 +2,8 @@ import numpy as np from sklearn.base import BaseEstimator from abc import abstractmethod from datetime import timedelta as hms - +from tabulate import tabulate +from sklearn.metrics import confusion_matrix as confusion from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier @@ -161,7 +162,7 @@ class ResultAnalyser(): def __init__(self, classifier, classification_indices, k_folds, hps_method, metrics_list, n_iter, class_label_names, - train_pred, test_pred, directory, base_file_name, labels, + pred, directory, base_file_name, labels, database_name, nb_cores, duration): """ @@ -202,8 +203,7 @@ class ResultAnalyser(): self.metrics_list = metrics_list self.n_iter = n_iter self.class_label_names = class_label_names - self.train_pred = train_pred - self.test_pred = test_pred + self.pred = pred self.directory = directory self.base_file_name = base_file_name self.labels = labels @@ -212,6 +212,7 @@ class ResultAnalyser(): self.nb_cores = nb_cores self.duration = duration self.metric_scores = {} + self.class_metric_scores = {} def get_all_metrics_scores(self, ): """ @@ -220,10 +221,13 @@ class ResultAnalyser(): ------- """ for metric, metric_args in self.metrics_list: - self.metric_scores[metric] = self.get_metric_scores(metric, - metric_args) + class_train_scores, class_test_scores, train_score, test_score\ + = self.get_metric_score(metric, metric_args) + self.class_metric_scores[metric] = (class_train_scores, + class_test_scores) + self.metric_scores[metric] = (train_score, test_score) - def get_metric_scores(self, metric, metric_kwargs): + def get_metric_score(self, metric, metric_kwargs): """ Get the train and test scores for a specific metric and its arguments @@ -239,13 +243,24 @@ class ResultAnalyser(): train_score, test_score """ metric_module = getattr(metrics, metric) + class_train_scores = [] + class_test_scores = [] + for label_value in np.unique(self.labels): + train_example_indices = self.train_indices[np.where(self.labels[self.train_indices]==label_value)[0]] + test_example_indices = self.test_indices[np.where(self.labels[self.test_indices] == label_value)[0]] + class_train_scores.append(metric_module.score(y_true=self.labels[train_example_indices], + y_pred=self.pred[train_example_indices], + **metric_kwargs)) + class_test_scores.append(metric_module.score(y_true=self.labels[test_example_indices], + y_pred=self.pred[test_example_indices], + **metric_kwargs)) train_score = metric_module.score(y_true=self.labels[self.train_indices], - y_pred=self.train_pred, - **metric_kwargs) + y_pred=self.pred[self.train_indices], + **metric_kwargs) test_score = metric_module.score(y_true=self.labels[self.test_indices], - y_pred=self.test_pred, - **metric_kwargs) - return train_score, test_score + y_pred=self.pred[self.test_indices], + **metric_kwargs) + return class_train_scores, class_test_scores, train_score, test_score def print_metric_score(self,): """ @@ -269,6 +284,11 @@ class ResultAnalyser(): metric_score_string += "\n\t\t- Score on train : {}".format(self.metric_scores[metric][0]) metric_score_string += "\n\t\t- Score on test : 
{}".format(self.metric_scores[metric][1]) metric_score_string += "\n\n" + metric_score_string += "Test set confusion matrix : \n\n" + confusion_matrix = confusion(y_true=self.labels[self.test_indices], y_pred=self.pred[self.test_indices]) + formatted_conf = [[label_name]+list(row) for label_name, row in zip(self.class_label_names, confusion_matrix)] + metric_score_string+=tabulate(formatted_conf, headers= ['']+self.class_label_names, tablefmt='fancy_grid') + metric_score_string += "\n\n" return metric_score_string @abstractmethod @@ -341,7 +361,7 @@ class ResultAnalyser(): self.directory, self.base_file_name, self.labels[self.test_indices]) image_analysis = {} - return string_analysis, image_analysis, self.metric_scores + return string_analysis, image_analysis, self.metric_scores, self.class_metric_scores base_boosting_estimators = [DecisionTreeClassifier(max_depth=1), diff --git a/multiview_platform/tests/test_utils/test_base.py b/multiview_platform/tests/test_utils/test_base.py index 41186fcd58c8b2cd9a20ed75ddce7c444637d098..3147da48fd545eb270afe1689f544854ac086a7b 100644 --- a/multiview_platform/tests/test_utils/test_base.py +++ b/multiview_platform/tests/test_utils/test_base.py @@ -74,7 +74,7 @@ class Test_ResultAnalyzer(unittest.TestCase): self.directory, self.base_file_name, self.labels, self.database_name, self.nb_cores, self.duration) - train_score, test_score = RA.get_metric_scores("accuracy_score", {}) + train_score, test_score = RA.get_metric_score("accuracy_score", {}) self.assertEqual(train_score, self.train_accuracy) self.assertEqual(test_score, self.test_accuracy) diff --git a/requirements.txt b/requirements.txt index a5c90c624487b2a1bd404248cacfc11002cbf035..769db82f00dcb80e1c33796f56bc30a7f21d5c02 100755 --- a/requirements.txt +++ b/requirements.txt @@ -16,4 +16,8 @@ pyyaml==3.12 cvxopt==1.2.0 plotly==4.2.1 matplotlib==3.1.1 +<<<<<<< HEAD -e git+https://gitlab.lis-lab.fr/dominique.benielli/multiviewmetriclearning +======= +tabulate==0.8.6 +>>>>>>> develop