From ac8ca802c9fa5093e8bd8b2eeff9b322fab7d78b Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Mon, 14 Nov 2022 10:20:34 -0500 Subject: [PATCH] Object done --- .../config_files/config_example_0.yml | 6 +- summit/multiview_platform/exec_classif.py | 555 ++++++++------- .../monoview/exec_classif_mono_view.py | 503 +++++++++----- .../monoview_classifiers/adaboost.py | 46 +- .../monoview_classifiers/bagged_spkm.py | 46 ++ .../monoview_classifiers/decision_tree.py | 1 + .../monoview_classifiers/ib_decision_tree.py | 44 ++ .../monoview_classifiers/ib_random_forest.py | 44 ++ .../monoview_classifiers/ib_random_scm.py | 50 ++ .../monoview_classifiers/ib_scm.py | 42 ++ .../monoview_classifiers/random_forest.py | 1 + .../monoview_classifiers/spkm.py | 54 ++ .../multiview/exec_multiview.py | 632 +++++++++--------- .../additions/late_fusion_utils.py | 5 + .../multiview_classifiers/bagged_spkm_pw.py | 2 +- .../early_fusion_ib_decision_tree.py | 32 + .../early_fusion_ib_random_forest.py | 31 + .../early_fusion_ib_random_scm.py | 34 + .../early_fusion_ib_scm.py | 29 + .../multiview_classifiers/spkm_pw.py | 9 +- .../result_analysis/error_analysis.py | 42 +- .../result_analysis/execution.py | 64 +- .../result_analysis/feature_importances.py | 25 +- .../result_analysis/metric_analysis.py | 20 +- summit/multiview_platform/utils/base.py | 2 + .../multiview_platform/utils/compression.py | 8 +- .../multiview_platform/utils/configuration.py | 2 +- .../utils/hyper_parameter_search.py | 2 +- summit/multiview_platform/utils/multiclass.py | 5 +- 29 files changed, 1551 insertions(+), 785 deletions(-) create mode 100644 summit/multiview_platform/monoview_classifiers/bagged_spkm.py create mode 100644 summit/multiview_platform/monoview_classifiers/ib_decision_tree.py create mode 100644 summit/multiview_platform/monoview_classifiers/ib_random_forest.py create mode 100644 summit/multiview_platform/monoview_classifiers/ib_random_scm.py create mode 100644 summit/multiview_platform/monoview_classifiers/ib_scm.py create mode 100644 summit/multiview_platform/monoview_classifiers/spkm.py create mode 100644 summit/multiview_platform/multiview_classifiers/early_fusion_ib_decision_tree.py create mode 100644 summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_forest.py create mode 100644 summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_scm.py create mode 100644 summit/multiview_platform/multiview_classifiers/early_fusion_ib_scm.py diff --git a/summit/examples/config_files/config_example_0.yml b/summit/examples/config_files/config_example_0.yml index 753e5c07..63ec77c1 100644 --- a/summit/examples/config_files/config_example_0.yml +++ b/summit/examples/config_files/config_example_0.yml @@ -27,7 +27,7 @@ res_dir: "examples/results/example_0/" # If an error occurs in a classifier, if track_tracebacks is set to True, the # benchmark saves the traceback and continues, if it is set to False, it will # stop the benchmark and raise the error -track_tracebacks: True +track_tracebacks: False # All the classification-realted configuration options @@ -40,14 +40,14 @@ nb_class: # The name of the classes to select in the dataset classes: # The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] +cl_type: ["monoview","multiview"] # The name of the monoview algorithms to run, ["all"] to run all the available classifiers algos_monoview: ["decision_tree", "adaboost"] # The names of the multiview algorithms to run, ["all"] 
to run all the available classifiers algos_multiview: ["early_fusion_decision_tree", "early_fusion_adaboost", "weighted_linear_late_fusion",] # The number of times the benchamrk is repeated with different train/test # split, to have more statistically significant results -stats_iter: 1 +stats_iter: 2 # The metrics that will be use din the result analysis metrics: accuracy_score: {} diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py index 1dd98420..e578633a 100644 --- a/summit/multiview_platform/exec_classif.py +++ b/summit/multiview_platform/exec_classif.py @@ -5,6 +5,7 @@ import time import traceback import argparse import pickle +from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold import matplotlib @@ -13,8 +14,8 @@ import numpy as np # Import own modules from . import monoview_classifiers from . import multiview_classifiers -from .monoview.exec_classif_mono_view import exec_monoview -from .multiview.exec_multiview import exec_multiview +from .monoview.exec_classif_mono_view import MonoViewExp +from .multiview.exec_multiview import MultiViewExp from .result_analysis.execution import analyze_iterations, analyze from .utils import execution, dataset, configuration from .utils.execution import BaseExec @@ -33,6 +34,161 @@ __status__ = "Prototype" # Production, Development, Prototype package_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +class BootstrapIteration: + + def __init__(self, + labels_dictionary={}, + directory="dir", + classification_indices=[], + args={}, + k_folds=StratifiedKFold(), + random_state=42, + hyper_param_search="Random", + metrics={}, + argument_dictionaries=[], + monoview_benchmark=[], + multiview_benchmark=[], + views=[], + views_indices=[], + flag=1,track_tracebacks=True, + database_name=""): + self.labels_dictionary=labels_dictionary + self.directory = directory + self.classification_indices = classification_indices + self.args = args + self.k_folds = k_folds + self.random_state = random_state + self.hyper_param_search = hyper_param_search + self.metrics = metrics + self.argument_dictionaries = argument_dictionaries + self.monoview_benchmark = monoview_benchmark + self.multiview_benchmark = multiview_benchmark + self.views = views + self.views_indices = views_indices + self.flag = flag + self.track_tracebacks = track_tracebacks + self.traceback_outputs = {} + self.results = [] + self.database_name=database_name + self.test_results = [] + + def benchmark_init(self, dataset_var): + """ + Initializes the benchmark, by saving the indices of the train + samples and the cross validation folds. 
+ + Parameters + ---------- + directory : str + The benchmark's result directory + + classification_indices : numpy array + The indices of the samples, splitted for the train/test split + + labels : numpy array + The labels of the dataset + + labels_dictionary : dict + The dictionary with labels as keys and their names as values + + k_folds : sklearn.model_selection.Folds object + The folds for the cross validation process + + Returns + ------- + + """ + logging.info("Start:\t Benchmark initialization") + secure_file_path(os.path.join(self.directory, "train_labels.csv")) + train_indices = self.classification_indices[0] + train_labels = dataset_var.get_labels(sample_indices=train_indices) + np.savetxt(os.path.join(self.directory, "train_labels.csv"), train_labels, + delimiter=",") + np.savetxt(os.path.join(self.directory, "train_indices.csv"), + self.classification_indices[0], + delimiter=",") + self.results_monoview = [] + folds = self.k_folds.split(np.arange(len(train_labels)), train_labels) + min_fold_len = int(len(train_labels) / self.k_folds.n_splits) + for fold_index, (train_cv_indices, test_cv_indices) in enumerate(folds): + file_name = os.path.join(self.directory, "folds", "test_labels_fold_" + str( + fold_index) + ".csv") + secure_file_path(file_name) + np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]], + delimiter=",") + self.labels_names = list(self.labels_dictionary.values()) + logging.info("Done:\t Benchmark initialization") + + def exec_one_benchmark_mono_core(self, dataset_var): # pragma: no cover + self.benchmark_init(dataset_var) + logging.getLogger('matplotlib.font_manager').disabled = True + logging.info("Start:\t Test Benchmark") + + for exp_ind, exp in enumerate(self.argument_dictionaries): + try: + exp.add_bootstrap_info(self.directory, self.k_folds, + self.classification_indices, + self.random_state) + self.results.append(exp.exec(dataset_var) ) + self.argument_dictionaries[exp_ind]=exp + except BaseException: + if self.track_tracebacks: + if isinstance(exp, MonoViewExp): + self.traceback_outputs[ + exp.classifier_name + "-" + exp.view_name] = traceback.format_exc() + else: + self.traceback_outputs[exp.classifier_name] = traceback.format_exc() + else: + raise + logging.info("Done:\tTest benchmark") + + def exec_one_test_benchmark(self, dataset_var, y=None, feature_ids=None, + n_splits=5, test_size=0.1): + logging.getLogger('matplotlib.font_manager').disabled = True + logging.info("Start:\t Benchmark") + self.results=[] + for exp in self.argument_dictionaries: + try: + exp.add_bootstrap_info(self.directory, self.k_folds, + self.classification_indices, + self.random_state) + self.results.append(exp.test(dataset_var, y, feature_ids, + n_splits=n_splits, + test_size=test_size)) + except BaseException: + if self.track_tracebacks: + if isinstance(exp, MonoViewExp): + self.traceback_outputs[ + exp.classifier_name + "-" + exp.view_name] = traceback.format_exc() + else: + self.traceback_outputs[ + exp.classifier_name] = traceback.format_exc() + else: + raise + logging.info("Done:\t monoview benchmark") + + # logging.info("Start:\t multiview benchmark") + # self.results_multiview = [] + # for arguments in self.argument_dictionaries["multiview"]: + # try: + # self.results_multiview += [ + # exec_multiview(self.directory, self.dataset_var, self.name, + # self.classification_indices, + # self.k_folds, self.nb_cores, self.args["file_type"], + # self.pathf, self.labels_dictionary, self.random_state, + # self.labels, + # hps_method=self.hyper_param_search, + # 
metrics=self.metrics, n_iter=self.hps_iter, + # **self.arguments)] + # except BaseException: + # if self.track_tracebacks: + # traceback_outputs[ + # arguments["classifier_name"]] = traceback.format_exc() + # else: + # raise + logging.info("Done:\t multiview benchmark") + + class Summit(BaseExec): def __init__(self, log=True, @@ -66,8 +222,9 @@ class Summit(BaseExec): config_path=None, **kwargs): if config_path is not None: - args = self.parse_the_args(config_path) - args = configuration.get_the_args(args.config_path) + if type(config_path)==list: + config_path = self.parse_the_args(config_path).config_path + args = configuration.get_the_args(config_path) Summit.__init__(self, **args) else: self.log = log @@ -83,7 +240,7 @@ class Summit(BaseExec): self.random_state = random_state self.nb_cores = nb_cores self.full = full - self.debug = debug + self.debug = True self.add_noise = add_noise self.noise_std = noise_std self.res_dir = res_dir @@ -101,6 +258,12 @@ class Summit(BaseExec): self.hps_type = hps_type self.hps_iter = hps_iter self.hps_kwargs = hps_kwargs + self.args = kwargs + self.monoview_benchmark = None + self.multiview_benchmark = None + self.monoview_kwargs = None + self.multiview_kwargs = None + self.exps = [] def exec_classif(self, ): # pragma: no cover """ @@ -137,7 +300,7 @@ class Summit(BaseExec): self.dataset_var, self.labels_dictionary, self.name = get_database( self.views, - self.pathf, dataset_name, + self.pathf, self.name, self.nb_class, self.classes, self.random_state, @@ -146,7 +309,6 @@ class Summit(BaseExec): self.gen_splits() self.gen_k_folds() - self.init_views() self.views_dictionary = self.dataset_var.get_view_dict() self.nb_views = len(self.views) @@ -170,10 +332,43 @@ class Summit(BaseExec): self.init_kwargs_func() data_base_time = time.time() - start self.init_argument_dictionaries() - directories = self.gen_direcorties_names() + self.gen_direcorties_names() self.gen_argument_dictionaries() self.exec_benchmark() + def test_clf(self, dataset_var, y=None, sample_ids=[], feature_ids=[]): + self.exec_test_benchmark(dataset_var, y=y, sample_ids=sample_ids, + feature_ids=feature_ids, + n_splits=self.stats_iter, test_size=self.split) + + def exec_test_benchmark(self, dataset_var, y=None, sample_ids=[], feature_ids=[], n_splits=1, test_size=0.8): # pragma: no cover + + self.results = [] + for bootstrap_iter in self.benchmark_argument_dictionaries: + bootstrap_iter.exec_one_test_benchmark(dataset_var, y=y, + feature_ids=feature_ids, + n_splits=n_splits, + test_size=test_size) + analyze_iterations([bootstrap_iter], self.stats_iter, + self.metrics, sample_ids=sample_ids, + labels=y, + feature_ids=[feature_ids], + view_names=["MST"], test=True) + self.results += [bootstrap_iter] + logging.info("Done:\t Executing all the needed benchmarks") + + # Do everything with flagging + logging.info("Start:\t Analyzing predictions") + results_mean_stds = analyze(self.results, self.stats_iter, + self.benchmark_argument_dictionaries, + self.metrics, + self.res_dir, + sample_ids, + y, [feature_ids], + ["MST"], test=True) + logging.info("Done:\t Analyzing predictions") + return results_mean_stds + def parse_the_args(self, arguments): """Used to parse the args entered by the user""" @@ -231,7 +426,6 @@ class Summit(BaseExec): with open(os.path.join(self.res_dir, "random_state.pickle"), "wb") as handle: pickle.dump(self.random_state, handle) - def init_stats_iter_random_states(self,): r""" Used to initialize multiple random states if needed because of multiple statistical iteration of the 
same benchmark @@ -256,7 +450,6 @@ class Summit(BaseExec): else: self.stats_iter_random_states = [self.random_state] - def get_database_function(self,): r"""Used to get the right database extraction function according to the type of database and it's name @@ -273,12 +466,11 @@ class Summit(BaseExec): The function that will be used to extract the database """ if self.name not in ["fake", "plausible"]: - get_database = getattr(DB, "get_classic_db_" + self.type[1:]) + get_database = getattr(DB, "get_classic_db_" + self.file_type[1:]) else: - get_database = getattr(DB, "get_" + self.name + "_db_" + self.type[1:]) + get_database = getattr(DB, "get_" + self.name + "_db_" + self.file_type[1:]) return get_database - def init_log_file(self): r"""Used to init the directory where the preds will be stored and the log file. @@ -337,7 +529,6 @@ class Summit(BaseExec): logging.getLogger().addHandler(logging.StreamHandler()) save_config(self.res_dir, self.__dict__) - def gen_splits(self,): r"""Used to _gen the train/test splits using one or multiple random states. @@ -359,7 +550,7 @@ class Summit(BaseExec): indices = np.arange(len(self.dataset_var.get_labels())) self.splits = [] for random_state in self.stats_iter_random_states: - folds_obj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, + folds_obj = StratifiedShuffleSplit(n_splits=1, random_state=random_state, test_size=self.split) folds = folds_obj.split(indices, self.dataset_var.get_labels()) @@ -369,7 +560,6 @@ class Summit(BaseExec): test_indices = indices[test_fold] self.splits.append([train_indices, test_indices]) - def gen_k_folds(self,): r"""Used to generate folds indices for cross validation for each statistical iteration. @@ -391,17 +581,16 @@ class Summit(BaseExec): self.folds_list = [] for random_state in self.stats_iter_random_states: self.folds_list.append( - sklearn.model_selection.StratifiedKFold(n_splits=self.nb_folds, + StratifiedKFold(n_splits=self.nb_folds, random_state=random_state, shuffle=True)) else: if isinstance(self.stats_iter_random_states, list): - self.stats_iter_random_states = self.stats_iter_random_states[0] - self.folds_list = [sklearn.model_selection.StratifiedKFold(n_splits=self.nb_folds, - random_state=self.stats_iter_random_states, + stats_iter_random_state = self.stats_iter_random_states[0] + self.folds_list = [StratifiedKFold(n_splits=self.nb_folds, + random_state=stats_iter_random_state, shuffle=True)] - def init_views(self,): r"""Used to return the views names that will be used by the benchmark, their indices and all the views names. @@ -423,8 +612,8 @@ class Summit(BaseExec): Names of all the available views in the dataset. """ nb_view = self.dataset_var.nb_view - if self.arg_views is not None: - allowed_views = self.arg_views + if self.views is not None: + allowed_views = self.views self.all_views = [str(self.dataset_var.get_view_name(view_index)) if not isinstance(self.dataset_var.get_view_name(view_index), bytes) else self.dataset_var.get_view_name(view_index).decode("utf-8") @@ -446,7 +635,6 @@ class Summit(BaseExec): self.views_indices = range(nb_view) self.all_views = self.views - def gen_direcorties_names(self,): r"""Used to generate the different directories of each iteration if needed. 
@@ -465,9 +653,9 @@ class Summit(BaseExec): if self.stats_iter > 1: self.directories = [] for i in range(self.stats_iter): - self.directories.append(os.path.join(self.directory, "iter_" + str(i + 1))) + self.directories.append(os.path.join(self.res_dir, "iter_" + str(i + 1))) else: - self.directories = [self.directory] + self.directories = [self.res_dir] def find_dataset_names(self, ): """This function goal is to browse the dataset directory and extrats all @@ -488,7 +676,6 @@ class Summit(BaseExec): for file_name in os.listdir(self.pathf) if file_name.endswith(self.file_type)] - print(self.name) self.dataset_list = [self.name] if self.dataset_list == ["all"]: self.dataset_list = available_file_names @@ -511,7 +698,6 @@ class Summit(BaseExec): "The asked dataset ({}) is not available in {}. \n The available ones are {}".format( self.dataset_list[0], self.pathf, available_file_names)) - def gen_argument_dictionaries(self,): # pragma: no cover r"""Used to generate a dictionary for each benchmark. @@ -561,20 +747,23 @@ class Summit(BaseExec): """ self.benchmark_argument_dictionaries = [] for iter_index, iterRandomState in enumerate(self.stats_iter_random_states): - benchmark_argument_dictionary = { - "labels_dictionary": self.labels_dictionary, - "directory": self.directories[iter_index], - "classification_indices": self.splits[iter_index], - "args": self.args, - "k_folds": self.k_folds[iter_index], - "random_state": iterRandomState, - "hyper_param_search": self.hyper_param_search, - "metrics": self.metrics, - "argument_dictionaries": self.argument_dictionaries, - "benchmark": self.benchmark, - "views": self.views, - "views_indices": self.views_indices, - "flag": iter_index} + benchmark_argument_dictionary = BootstrapIteration( + labels_dictionary= self.labels_dictionary, + directory=self.directories[iter_index], + classification_indices=self.splits[iter_index], + args=self.args, + k_folds=self.folds_list[iter_index], + random_state=iterRandomState, + hyper_param_search=self.hps_type, + metrics=self.metrics, + argument_dictionaries=self.exps[iter_index], + monoview_benchmark=self.monoview_benchmark, + multiview_benchmark=self.multiview_benchmark, + views=self.views, + views_indices=self.views_indices, + flag=iter_index, + track_tracebacks=self.track_tracebacks, + database_name=self.name) self.benchmark_argument_dictionaries.append(benchmark_argument_dictionary) @@ -603,42 +792,41 @@ class Summit(BaseExec): benchmark : Dictionary of dictionaries Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. 
""" - benchmark = {"monoview": {}, "multiview": {}} - if "monoview" in self.cl_type: - if self.monoview_algos == ['all']: # pragma: no cover - self.benchmark["monoview"] = [name for _, name, isPackage in + if self.algos_monoview == ['all']: # pragma: no cover + self.monoview_benchmark = [name for _, name, is_package in pkgutil.iter_modules( monoview_classifiers.__path__) - if not isPackage] + if not is_package] else: - self.benchmark["monoview"] = self.monoview_algos - + self.monoview_benchmark = self.algos_monoview if "multiview" in self.cl_type: - if self.multiview_algos == ["all"]: # pragma: no cover - self.benchmark["multiview"] = [name for _, name, isPackage in + if self.algos_multiview == ["all"]: # pragma: no cover + self.multiview_benchmark = [name for _, name, is_package in pkgutil.iter_modules( multiview_classifiers.__path__) - if not isPackage] + if not is_package] else: - self.benchmark["multiview"] = self.multiview_algos + self.multiview_benchmark = self.algos_multiview def init_argument_dictionaries(self, ): # pragma: no cover - self.argument_dictionaries = {"monoview": [], "multiview": []} - if self.benchmark["monoview"]: - self.argument_dictionaries["monoview"] = self.init_monoview_exps( - self.benchmark["monoview"], - self.views_dictionary, - self.nb_class, - self.init_kwargs["monoview"], self.hps_method, self.hps_kwargs) - if self.benchmark["multiview"]: - self.argument_dictionaries["multiview"] = self.init_multiview_exps( - self.benchmark["multiview"], - self.views_dictionary, - self.nb_class, - self.init_kwargs["multiview"], self.hps_method, self.hps_kwargs) + for iter_ind in range(self.stats_iter): + exps=[] + if self.monoview_benchmark is not None: + exps += self.init_monoview_exps( + self.monoview_benchmark, + self.views_dictionary, + self.nb_class, + self.monoview_kwargs, self.hps_type, self.hps_kwargs) + if self.multiview_benchmark is not None: + exps += self.init_multiview_exps( + self.multiview_benchmark, + self.views_dictionary, + self.nb_class, + self.multiview_kwargs, self.hps_type, self.hps_kwargs) + self.exps.append(exps) def init_multiview_exps(self, classifier_names, views_dictionary, nb_class, @@ -709,29 +897,26 @@ class Summit(BaseExec): for view_name, view_index in views_dictionary.items(): for classifier_name in classifier_names: if hps_method == "Grid": - arguments = self.gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - {"param_grid": + monoview_exp = self.gen_single_monoview_arg_dictionary(classifier_name=classifier_name, + arguments=kwargs_init, + view_index=view_index, + view_name=view_name, + hps_kwargs={"param_grid": hps_kwargs[ classifier_name]}) elif hps_method == "Random": hps_kwargs = self.get_random_hps_args(hps_kwargs, classifier_name) - arguments = self.gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - hps_kwargs) + monoview_exp = self.gen_single_monoview_arg_dictionary(classifier_name=classifier_name, + arguments=kwargs_init, + view_index=view_index, + view_name=view_name, + hps_kwargs=hps_kwargs) elif hps_method == "None": - arguments = self.gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - hps_kwargs) + monoview_exp = self.gen_single_monoview_arg_dictionary(classifier_name=classifier_name, + arguments=kwargs_init, + view_index=view_index, + view_name=view_name, + hps_kwargs=hps_kwargs) else: raise ValueError( @@ -739,45 +924,60 @@ class Summit(BaseExec): 'are 
available as hyper-parameter search ' 'methods, sadly "{}" is not'.format(hps_method) ) - monoview_arguments.append(arguments) + monoview_arguments.append(monoview_exp) return monoview_arguments - def get_random_hps_args(self, hps_args, classifier_name): hps_dict = {} for key, value in hps_args.items(): if key in ["n_iter", "equivalent_draws"]: hps_dict[key] = value - if key==classifier_name: + if key == classifier_name: hps_dict["param_distributions"] = value return hps_dict - - def gen_single_monoview_arg_dictionary(self, classifier_name, arguments, nb_class, - view_index, view_name, hps_kwargs): + def gen_single_monoview_arg_dictionary(self, classifier_name='', arguments={}, + view_index=0, view_name='',hps_kwargs={}): if classifier_name in arguments: classifier_config = dict((key, value) for key, value in arguments[ classifier_name].items()) else: classifier_config = {} - return {classifier_name: classifier_config, - "view_name": view_name, - "view_index": view_index, - "classifier_name": classifier_name, - "nb_class": nb_class, - "hps_kwargs": hps_kwargs} + plif = MonoViewExp(classifier_config=classifier_config, + view_name= view_name, view_index= view_index, + classifier_name=classifier_name, + nb_class=self.nb_class, + hps_kwargs=hps_kwargs, + train_size=self.split, + database_name=self.name, + hps_type=self.hps_type, nb_cores=self.nb_cores, + labels_dictionary=self.labels_dictionary, + metrics=self.metrics) + return plif def gen_single_multiview_arg_dictionary(self, classifier_name, arguments, nb_class, - hps_kwargs, views_dictionary=None): - return {"classifier_name": classifier_name, - "view_names": list(views_dictionary.keys()), - 'view_indices': list(views_dictionary.values()), - "nb_class": nb_class, - "labels_names": None, - "hps_kwargs": hps_kwargs, - classifier_name: self.extract_dict(arguments) - } + hps_kwargs, views_dictionary=None,): + if classifier_name in arguments: + classifier_config = dict((key, value) for key, value in arguments[ + classifier_name].items()) + else: + classifier_config = {} + return MultiViewExp(classifier_name=classifier_name, + view_names=list(views_dictionary.keys()), + view_indices=list(views_dictionary.values()), + nb_class=nb_class, + labels_dictionary=self.labels_dictionary, + hps_kwargs=hps_kwargs, + # classifier_name= self.extract_dict(arguments), + train_size=self.split, + classifier_config=classifier_config, + database_name=self.name, + hps_type=self.hps_type, + nb_cores=self.nb_cores, + metrics=self.metrics, + + ) def extract_dict(self, classifier_config): @@ -838,7 +1038,7 @@ class Summit(BaseExec): return paths - def init_kwargs(self, args, classifiers_names, framework="monoview"): + def init_kwargs(self, classifiers_names, framework="monoview"): r"""Used to init kwargs thanks to a function in each monoview classifier package. 
Parameters @@ -855,7 +1055,7 @@ class Summit(BaseExec): For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`""" - logging.info("Start:\t Initializing monoview classifiers arguments") + logging.info("Start:\t Initializing {} classifiers arguments".format(framework)) kwargs = {} for classifiers_name in classifiers_names: try: @@ -867,16 +1067,14 @@ class Summit(BaseExec): raise AttributeError( classifiers_name + " is not implemented in monoview_classifiers, " "please specify the name of the file in monoview_classifiers") - if classifiers_name in args: - kwargs[classifiers_name] = args[classifiers_name] + if classifiers_name in self.args: + kwargs[classifiers_name] = self.args[classifiers_name] else: kwargs[classifiers_name] = {} - logging.info("Done:\t Initializing monoview classifiers arguments") - + logging.info("Done:\t Initializing {} classifiers arguments".format(framework)) return kwargs - - def init_kwargs_func(self, args, benchmark): + def init_kwargs_func(self,): """ Dispached the kwargs initialization to monoview and multiview and creates the kwargs variable @@ -895,12 +1093,12 @@ class Summit(BaseExec): kwargs : dict The arguments for each mono- and multiview algorithms """ - monoview_kwargs = self.init_kwargs(args, benchmark["monoview"], + if self.monoview_benchmark is not None: + self.monoview_kwargs = self.init_kwargs(self.monoview_benchmark, framework="monoview") - multiview_kwargs = self.init_kwargs(args, benchmark["multiview"], + if self.multiview_benchmark is not None: + self.multiview_kwargs = self.init_kwargs(self.multiview_benchmark, framework="multiview") - kwargs = {"monoview": monoview_kwargs, "multiview": multiview_kwargs} - return kwargs def arange_metrics(self,): @@ -929,101 +1127,10 @@ class Summit(BaseExec): self.metrics)) - def benchmark_init(self, ): - """ - Initializes the benchmark, by saving the indices of the train - samples and the cross validation folds. 
- Parameters - ---------- - directory : str - The benchmark's result directory - classification_indices : numpy array - The indices of the samples, splitted for the train/test split - labels : numpy array - The labels of the dataset - - labels_dictionary : dict - The dictionary with labels as keys and their names as values - - k_folds : sklearn.model_selection.Folds object - The folds for the cross validation process - - Returns - ------- - - """ - logging.info("Start:\t Benchmark initialization") - secure_file_path(os.path.join(self.directory, "train_labels.csv")) - train_indices = self.classification_indices[0] - train_labels = self.dataset_var.get_labels(sample_indices=train_indices) - np.savetxt(os.path.join(self.directory, "train_labels.csv"), train_labels, - delimiter=",") - np.savetxt(os.path.join(self.directory, "train_indices.csv"), - self.classification_indices[0], - delimiter=",") - self.results_monoview = [] - folds = self.k_folds.split(np.arange(len(train_labels)), train_labels) - min_fold_len = int(len(train_labels) / self.k_folds.n_splits) - for fold_index, (train_cv_indices, test_cv_indices) in enumerate(folds): - file_name = os.path.join(self.directory, "folds", "test_labels_fold_" + str( - fold_index) + ".csv") - secure_file_path(file_name) - np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]], - delimiter=",") - self.labels_names = list(self.labels_dictionary.values()) - logging.info("Done:\t Benchmark initialization") - - - def exec_one_benchmark_mono_core(self, ): # pragma: no cover - self.benchmark_init() - logging.getLogger('matplotlib.font_manager').disabled = True - logging.info("Start:\t monoview benchmark") - traceback_outputs = {} - for arguments in self.argument_dictionaries["monoview"]: - try: - X = self.dataset_var.get_v(arguments["view_index"]) - Y = self.dataset_var.get_labels() - self.results_monoview += [ - exec_monoview(self.directory, X, Y, self.name, self.labels_names, - self.classification_indices, self.k_folds, - self.nb_cores, self.args["file_type"], self.args["pathf"], self.random_state, - hyper_param_search=self.hyper_param_search, - metrics=self.metrics, feature_ids=self.dataset_var.feature_ids[arguments["view_index"]], - **self.arguments)] - except BaseException: - if self.track_tracebacks: - traceback_outputs[ - arguments["classifier_name"] + "-" + arguments[ - "view_name"]] = traceback.format_exc() - else: - raise - logging.info("Done:\t monoview benchmark") - - logging.info("Start:\t multiview benchmark") - self.results_multiview = [] - for arguments in self.argument_dictionaries["multiview"]: - try: - self.results_multiview += [ - exec_multiview(self.directory, self.dataset_var, self.name, - self.classification_indices, - self.k_folds, self.nb_cores, self.args["file_type"], - self.pathf, self.labels_dictionary, self.random_state, - self.labels, - hps_method=self.hyper_param_search, - metrics=self.metrics, n_iter=self.hps_iter, - **self.arguments)] - except BaseException: - if self.track_tracebacks: - traceback_outputs[ - arguments["classifier_name"]] = traceback.format_exc() - else: - raise - logging.info("Done:\t multiview benchmark") - return [flag, results_monoview + results_multiview, traceback_outputs] def exec_benchmark(self, ): # pragma: no cover r"""Used to execute the needed benchmark(s) on multicore or mono-core functions. 
@@ -1068,26 +1175,26 @@ class Summit(BaseExec): """ logging.info("Start:\t Executing all the needed benchmarks") self.results = [] - for arguments in self.benchmark_arguments_dictionaries: - benchmark_results = self.exec_one_benchmark_mono_core() - analyze_iterations([benchmark_results], - benchmark_arguments_dictionaries, stats_iter, - metrics, sample_ids=dataset_var.sample_ids, - labels=dataset_var.get_labels(), - feature_ids=dataset_var.feature_ids, - view_names=dataset_var.view_names) - results += [benchmark_results] + for iter_ind, bootstrap_iter in enumerate(self.benchmark_argument_dictionaries): + bootstrap_iter.exec_one_benchmark_mono_core(self.dataset_var) + analyze_iterations([bootstrap_iter], self.stats_iter, + self.metrics, sample_ids=self.dataset_var.sample_ids, + labels=self.dataset_var.get_labels(), + feature_ids=self.dataset_var.feature_ids, + view_names=self.dataset_var.view_names) + self.results += [bootstrap_iter] + self.benchmark_argument_dictionaries[iter_ind] = bootstrap_iter logging.info("Done:\t Executing all the needed benchmarks") # Do everything with flagging logging.info("Start:\t Analyzing predictions") - results_mean_stds = analyze(results, stats_iter, - benchmark_arguments_dictionaries, - metrics, - directory, - dataset_var.sample_ids, - dataset_var.get_labels(),dataset_var.feature_ids, - dataset_var.view_names) + results_mean_stds = analyze(self.results, self.stats_iter, + self.benchmark_argument_dictionaries, + self.metrics, + self.res_dir, + self.dataset_var.sample_ids, + self.dataset_var.get_labels(),self.dataset_var.feature_ids, + self.dataset_var.view_names) logging.info("Done:\t Analyzing predictions") return results_mean_stds diff --git a/summit/multiview_platform/monoview/exec_classif_mono_view.py b/summit/multiview_platform/monoview/exec_classif_mono_view.py index 44ef75b2..ce5c4be3 100644 --- a/summit/multiview_platform/monoview/exec_classif_mono_view.py +++ b/summit/multiview_platform/monoview/exec_classif_mono_view.py @@ -6,6 +6,9 @@ import logging # To create Log-Files # Import built-in modules import os # to geth path of the running script import time # for time calculations +from sklearn.model_selection import StratifiedShuffleSplit +from sklearn.metrics import balanced_accuracy_score +from sklearn.base import clone import h5py # Import 3rd party modules @@ -49,162 +52,348 @@ def exec_monoview_multicore(directory, name, labels_names, **args) -def exec_monoview(directory, X, Y, database_name, labels_names, - classification_indices, - k_folds, nb_cores, databaseType, path, - random_state, hyper_param_search="Random", - metrics={"accuracy_score*": {}}, n_iter=30, view_name="", - hps_kwargs={}, feature_ids=[], **args): - logging.info("Start:\t Loading data") - kwargs, \ - t_start, \ - view_name, \ - classifier_name, \ - X, \ - learning_rate, \ - labels_string, \ - output_file_name, \ - directory, \ - base_file_name = init_constants(args, X, classification_indices, - labels_names, - database_name, directory, view_name, ) - logging.info("Done:\t Loading data") - - logging.info( - "Info:\t Classification - Database:" + str( - database_name) + " View:" + str( - view_name) + " train ratio:" - + str(learning_rate) + ", CrossValidation k-folds: " + str( - k_folds.n_splits) + ", cores:" - + str(nb_cores) + ", algorithm : " + classifier_name) - - logging.info("Start:\t Determine Train/Test split") - X_train, y_train, X_test, y_test = init_train_test(X, Y, - classification_indices) - - logging.info("Info:\t Shape X_train:" + str( - X_train.shape) + ", 
Length of y_train:" + str(len(y_train))) - logging.info("Info:\t Shape X_test:" + str( - X_test.shape) + ", Length of y_test:" + str(len(y_test))) - logging.info("Done:\t Determine Train/Test split") - - logging.info("Start:\t Generate classifier args") - classifier_module = getattr(monoview_classifiers, classifier_name) - classifier_class_name = classifier_module.classifier_class_name - hyper_param_beg = time.monotonic() - cl_kwargs = get_hyper_params(classifier_module, hyper_param_search, - classifier_name, - classifier_class_name, - X_train, y_train, - random_state, output_file_name, - k_folds, nb_cores, metrics, kwargs, - **hps_kwargs) - hyper_param_duration = time.monotonic() - hyper_param_beg - logging.info("Done:\t Generate classifier args") - - logging.info("Start:\t Training") - - classifier = get_mc_estim(getattr(classifier_module, - classifier_class_name) - (random_state=random_state, **cl_kwargs), - random_state, - y=Y) - fit_beg = time.monotonic() - classifier.fit(X_train, y_train) - fit_duration = time.monotonic() - fit_beg - logging.info("Done:\t Training") - - logging.info("Start:\t Predicting") - train_pred = classifier.predict(X_train) - pred_beg = time.monotonic() - test_pred = classifier.predict(X_test) - pred_duration = time.monotonic() - pred_beg - - #### ROC CURVE ADDITION ### - from sklearn.metrics import roc_curve - fpr, tpr, _ = roc_curve(y_test, classifier.predict_proba(X_test)[:, 1]) - np.savetxt(os.path.join(directory, classifier_class_name+"-fpr.npy"), fpr) - np.savetxt(os.path.join(directory, classifier_class_name + "-tpr.npy"), tpr) - ### END ROC ### - - - # Filling the full prediction in the right order - full_pred = np.zeros(Y.shape, dtype=int) - 100 - for train_index, index in enumerate(classification_indices[0]): - full_pred[index] = train_pred[train_index] - for test_index, index in enumerate(classification_indices[1]): - full_pred[index] = test_pred[test_index] - - logging.info("Done:\t Predicting") - - whole_duration = time.monotonic() - t_start - logging.info( - "Info:\t Duration for training and predicting: " + str( - whole_duration) + "[s]") - - logging.info("Start:\t Getting results") - result_analyzer = MonoviewResultAnalyzer(view_name=view_name, - classifier_name=classifier_name, - shape=X.shape, - classifier=classifier, - classification_indices=classification_indices, - k_folds=k_folds, - hps_method=hyper_param_search, - metrics_dict=metrics, - n_iter=n_iter, - class_label_names=labels_names, - pred=full_pred, - directory=directory, - base_file_name=base_file_name, - labels=Y, - database_name=database_name, - nb_cores=nb_cores, - duration=whole_duration, - feature_ids=feature_ids) - string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ +class MonoViewExp: + + def __init__(self, classifier_name="decision_tree", + classifier_config={"depth":3}, view_name="First view", + view_index=0, nb_class=2, hps_kwargs={}, train_size=0.8, + labels_dictionary={}, database_name="", + hps_type="Random", nb_cores=1, metrics={}): + self.classifier_name = classifier_name + self.classifier_config=classifier_config + self.view_name=view_name + self.view_index=view_index + self.nb_class=nb_class + self.hps_kwargs=hps_kwargs + self.train_size=train_size + self.labels_dictionary=labels_dictionary + self.directory=None + self.database_name=database_name + self.k_folds=None + self.classifier=None + self.split=None + self.hps_type = hps_type + self.nb_cores=nb_cores + self.metrics=metrics + + def add_bootstrap_info(self, directory="", k_folds=[], 
splits=[], + random_state=42): + self.directory = directory + self.k_folds=k_folds + self.splits=splits + self.random_state = random_state + + def exec(self, dataset_var): + t_start = time.monotonic() + X = dataset_var.get_v(self.view_index) + Y = dataset_var.get_labels() + logging.info("Start:\t Loading data") + self.init_constants() + logging.info("Done:\t Loading data") + + logging.info( + "Info:\t Classification - Database:" + str( + self.database_name) + " View:" + str( + self.view_name) + " train ratio:" + + str(self.train_size) + ", CrossValidation k-folds: " + str( + self.k_folds.n_splits) + ", algorithm : " + self.classifier_name) + + logging.info("Start:\t Determine Train/Test split") + X_train, y_train, X_test, y_test = self.init_train_test(X, Y) + self.X_train = X_train + self.y_train = y_train + + logging.info("Info:\t Shape X_train:" + str( + X_train.shape) + ", Length of y_train:" + str(len(y_train))) + logging.info("Info:\t Shape X_test:" + str( + X_test.shape) + ", Length of y_test:" + str(len(y_test))) + logging.info("Done:\t Determine Train/Test split") + + logging.info("Start:\t Generate classifier args") + classifier_module = getattr(monoview_classifiers, self.classifier_name) + classifier_class_name = classifier_module.classifier_class_name + hyper_param_beg = time.monotonic() + self.cl_kwargs = get_hyper_params(classifier_module, self.hps_type, + self.classifier_name, + classifier_class_name, + X_train, y_train, + self.random_state, self.output_file_name, + self.k_folds, self.nb_cores, self.metrics, self.classifier_config, + **self.hps_kwargs) + self.argi = {} + self.argi[self.directory] = self.cl_kwargs.copy() + self.hyper_param_duration = time.monotonic() - hyper_param_beg + logging.info("Done:\t Generate classifier args") + + logging.info("Start:\t Training") + + self.classifier = get_mc_estim(getattr(classifier_module, + classifier_class_name) + (random_state=self.random_state, **self.cl_kwargs), + self.random_state, + y=Y) + fit_beg = time.monotonic() + self.classifier.fit(X_train, y_train) + + self.fit_duration = time.monotonic() - fit_beg + logging.info("Done:\t Training") + + logging.info("Start:\t Predicting") + train_pred = self.classifier.predict(X_train) + pred_beg = time.monotonic() + test_pred = self.classifier.predict(X_test) + self.pred_duration = time.monotonic() - pred_beg + + # Filling the full prediction in the right order + full_pred = np.zeros(Y.shape, dtype=int) - 100 + for train_index, index in enumerate(self.splits[0]): + full_pred[index] = train_pred[train_index] + for test_index, index in enumerate(self.splits[1]): + full_pred[index] = test_pred[test_index] + + logging.info("Done:\t Predicting") + + self.whole_duration = time.monotonic() - t_start + logging.info( + "Info:\t Duration for training and predicting: " + str( + self.whole_duration) + "[s]") + + logging.info("Start:\t Getting results") + if "n_iter" in self.hps_kwargs: + self.n_iter_hps = self.hps_kwargs["n_iter"] + else: + self.n_iter_hps = 0 + result_analyzer = MonoviewResultAnalyzer(view_name=self.view_name, + classifier_name=self.classifier_name, + shape=X.shape, + classifier=self.classifier, + classification_indices=self.splits, + k_folds=self.k_folds, + hps_method=self.hps_type, + metrics_dict=self.metrics, + n_iter=self.n_iter_hps, + class_label_names=self.labels_names, + pred=full_pred, + directory=self.directory, + base_file_name=self.base_file_name, + labels=Y, + database_name=self.database_name, + nb_cores=self.nb_cores, + duration=self.whole_duration, + 
feature_ids=dataset_var.feature_ids[self.view_index]) + string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ + confusion_matrix = result_analyzer.analyze() + logging.info("Done:\t Getting results") + + logging.info("Start:\t Saving preds") + save_results(string_analysis, self.output_file_name, full_pred, train_pred, + y_train, images_analysis, y_test, confusion_matrix) + logging.info("Done:\t Saving results") + + return MonoviewResult(self.view_index, self.classifier_name, self.view_name, + metrics_scores, full_pred, self.cl_kwargs, + self.classifier, X_train.shape[1], + self.hyper_param_duration, self.fit_duration, self.pred_duration, + class_metrics_scores) + + def test(self, dataset_var, y=None, feature_ids=None, n_splits=4, + test_size=0.8): + if y is not None: + X = dataset_var + Y = y + else: + X = dataset_var.get_v(self.view_index) + Y = dataset_var.get_labels() + logging.info("Start:\t Loading data") + pred = self.classifier.predict(X) + sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, + random_state=self.random_state) + fit_clf = clone(self.classifier) + ba=[] + for train_index, test_index in sss.split(X, y): + fit_clf.fit(X[train_index], y[train_index]) + pred_labels = fit_clf.predict(X[test_index]) + ba.append(balanced_accuracy_score(y[test_index], pred_labels )) + + result_analyzer = MonoviewResultAnalyzer(view_name=self.view_name, + classifier_name=self.classifier_name, + shape=X.shape, + classifier=self.classifier, + classification_indices=[np.arange(X.shape[0]), + np.arange(X.shape[0]),], + k_folds=self.k_folds, + hps_method=self.hps_type, + metrics_dict=self.metrics, + n_iter=self.n_iter_hps, + class_label_names=self.labels_names, + pred=pred, + directory=self.directory, + base_file_name=self.base_file_name, + labels=Y, + database_name=self.database_name, + nb_cores=self.nb_cores, + duration=self.whole_duration, + feature_ids=feature_ids + ) + string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ confusion_matrix = result_analyzer.analyze() - logging.info("Done:\t Getting results") - - logging.info("Start:\t Saving preds") - save_results(string_analysis, output_file_name, full_pred, train_pred, - y_train, images_analysis, y_test, confusion_matrix) - logging.info("Done:\t Saving results") - - view_index = args["view_index"] - return MonoviewResult(view_index, classifier_name, view_name, - metrics_scores, full_pred, cl_kwargs, - classifier, X_train.shape[1], - hyper_param_duration, fit_duration, pred_duration, - class_metrics_scores) - - -def init_constants(args, X, classification_indices, labels_names, - name, directory, view_name): - try: - kwargs = args["args"] - except KeyError: - kwargs = args - t_start = time.monotonic() - cl_type = kwargs["classifier_name"] - learning_rate = float(len(classification_indices[0])) / ( - len(classification_indices[0]) + len(classification_indices[1])) - labels_string = "-".join(labels_names) - cl_type_string = cl_type - directory = os.path.join(directory, cl_type_string, view_name, ) - base_file_name = cl_type_string + '-' + name + "-" + view_name + "-" - output_file_name = os.path.join(directory, base_file_name) - secure_file_path(output_file_name) - return kwargs, t_start, view_name, cl_type, X, learning_rate, labels_string,\ - output_file_name, directory, base_file_name - - -def init_train_test(X, Y, classification_indices): - train_indices, test_indices = classification_indices - X_train = extract_subset(X, train_indices) - X_test = extract_subset(X, test_indices) - y_train 
= Y[train_indices] - y_test = Y[test_indices] - return X_train, y_train, X_test, y_test + tr, te = metrics_scores["balanced_accuracy*"] + metrics_scores["balanced_accuracy*"] = (np.mean(ba), te) + logging.info("Done:\t Getting results") + return MonoviewResult(0, self.classifier_name, self.view_name, + metrics_scores, pred, None, + self.classifier, X.shape[1], + self.hyper_param_duration, self.fit_duration, self.pred_duration, + class_metrics_scores) + + def init_constants(self, ): + self.labels_names= [self.labels_dictionary[ind] for ind in range(self.nb_class)] + self.directory = os.path.join(self.directory, self.classifier_name, self.view_name, ) + self.base_file_name = self.classifier_name + '-' + self.database_name + "-" + self.view_name + "-" + self.output_file_name = os.path.join(self.directory, self.base_file_name) + secure_file_path(self.output_file_name) + + def init_train_test(self, X, Y): + train_indices, test_indices = self.splits + X_train = extract_subset(X, train_indices) + X_test = extract_subset(X, test_indices) + y_train = Y[train_indices] + y_test = Y[test_indices] + return X_train, y_train, X_test, y_test + +# def exec_monoview(directory, X, Y, database_name, labels_names, +# classification_indices, +# k_folds, nb_cores, databaseType, path, +# random_state, hyper_param_search="Random", +# metrics={"accuracy_score*": {}}, n_iter=30, view_name="", +# hps_kwargs={}, feature_ids=[], **args): +# logging.info("Start:\t Loading data") +# kwargs, \ +# t_start, \ +# view_name, \ +# classifier_name, \ +# X, \ +# learning_rate, \ +# labels_string, \ +# output_file_name, \ +# directory, \ +# base_file_name = init_constants(args, X, classification_indices, +# labels_names, +# database_name, directory, view_name, ) +# logging.info("Done:\t Loading data") +# +# logging.info( +# "Info:\t Classification - Database:" + str( +# database_name) + " View:" + str( +# view_name) + " train ratio:" +# + str(learning_rate) + ", CrossValidation k-folds: " + str( +# k_folds.n_splits) + ", cores:" +# + str(nb_cores) + ", algorithm : " + classifier_name) +# +# logging.info("Start:\t Determine Train/Test split") +# X_train, y_train, X_test, y_test = init_train_test(X, Y, +# classification_indices) +# +# logging.info("Info:\t Shape X_train:" + str( +# X_train.shape) + ", Length of y_train:" + str(len(y_train))) +# logging.info("Info:\t Shape X_test:" + str( +# X_test.shape) + ", Length of y_test:" + str(len(y_test))) +# logging.info("Done:\t Determine Train/Test split") +# +# logging.info("Start:\t Generate classifier args") +# classifier_module = getattr(monoview_classifiers, classifier_name) +# classifier_class_name = classifier_module.classifier_class_name +# hyper_param_beg = time.monotonic() +# cl_kwargs = get_hyper_params(classifier_module, hyper_param_search, +# classifier_name, +# classifier_class_name, +# X_train, y_train, +# random_state, output_file_name, +# k_folds, nb_cores, metrics, kwargs, +# **hps_kwargs) +# hyper_param_duration = time.monotonic() - hyper_param_beg +# logging.info("Done:\t Generate classifier args") +# +# logging.info("Start:\t Training") +# +# classifier = get_mc_estim(getattr(classifier_module, +# classifier_class_name) +# (random_state=random_state, **cl_kwargs), +# random_state, +# y=Y) +# fit_beg = time.monotonic() +# classifier.fit(X_train, y_train) +# fit_duration = time.monotonic() - fit_beg +# logging.info("Done:\t Training") +# +# logging.info("Start:\t Predicting") +# train_pred = classifier.predict(X_train) +# pred_beg = time.monotonic() +# test_pred = 
classifier.predict(X_test) +# pred_duration = time.monotonic() - pred_beg +# +# #### ROC CURVE ADDITION ### +# from sklearn.metrics import roc_curve +# fpr, tpr, _ = roc_curve(y_test, classifier.predict_proba(X_test)[:, 1]) +# np.savetxt(os.path.join(directory, classifier_class_name+"-fpr.npy"), fpr) +# np.savetxt(os.path.join(directory, classifier_class_name + "-tpr.npy"), tpr) +# ### END ROC ### +# +# +# # Filling the full prediction in the right order +# full_pred = np.zeros(Y.shape, dtype=int) - 100 +# for train_index, index in enumerate(classification_indices[0]): +# full_pred[index] = train_pred[train_index] +# for test_index, index in enumerate(classification_indices[1]): +# full_pred[index] = test_pred[test_index] +# +# logging.info("Done:\t Predicting") +# +# whole_duration = time.monotonic() - t_start +# logging.info( +# "Info:\t Duration for training and predicting: " + str( +# whole_duration) + "[s]") +# +# logging.info("Start:\t Getting results") +# result_analyzer = MonoviewResultAnalyzer(view_name=view_name, +# classifier_name=classifier_name, +# shape=X.shape, +# classifier=classifier, +# classification_indices=classification_indices, +# k_folds=k_folds, +# hps_method=hyper_param_search, +# metrics_dict=metrics, +# n_iter=n_iter, +# class_label_names=labels_names, +# pred=full_pred, +# directory=directory, +# base_file_name=base_file_name, +# labels=Y, +# database_name=database_name, +# nb_cores=nb_cores, +# duration=whole_duration, +# feature_ids=feature_ids) +# string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ +# confusion_matrix = result_analyzer.analyze() +# logging.info("Done:\t Getting results") +# +# logging.info("Start:\t Saving preds") +# save_results(string_analysis, output_file_name, full_pred, train_pred, +# y_train, images_analysis, y_test, confusion_matrix) +# logging.info("Done:\t Saving results") +# +# view_index = args["view_index"] +# return MonoviewResult(view_index, classifier_name, view_name, +# metrics_scores, full_pred, cl_kwargs, +# classifier, X_train.shape[1], +# hyper_param_duration, fit_duration, pred_duration, +# class_metrics_scores) +# +# + + + + def get_hyper_params(classifier_module, search_method, classifier_module_name, @@ -218,7 +407,7 @@ def get_hyper_params(classifier_module, search_method, classifier_module_name, classifier_hp_search = getattr(hyper_parameter_search, search_method) estimator = getattr(classifier_module, classifier_class_name)( random_state=random_state, - **kwargs[classifier_module_name]) + **kwargs) estimator = get_mc_estim(estimator, random_state, multiview=False, y=y_train) hps = classifier_hp_search(estimator, scoring=metrics, cv=k_folds, @@ -230,7 +419,7 @@ def get_hyper_params(classifier_module, search_method, classifier_module_name, hps.gen_report(output_file_name) logging.info("Done:\t " + search_method + " best settings") else: - cl_kwargs = kwargs[classifier_module_name] + cl_kwargs = kwargs return cl_kwargs diff --git a/summit/multiview_platform/monoview_classifiers/adaboost.py b/summit/multiview_platform/monoview_classifiers/adaboost.py index 0e06e245..b6f1a65d 100644 --- a/summit/multiview_platform/monoview_classifiers/adaboost.py +++ b/summit/multiview_platform/monoview_classifiers/adaboost.py @@ -67,27 +67,27 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier): def get_interpretation(self, directory, base_file_name, y_test, feature_ids, multi_class=False): # pragma: no cover interpretString = "" - interpretString += self.get_feature_importance(directory, - 
base_file_name, - feature_ids) - interpretString += "\n\n Estimator error | Estimator weight\n" - interpretString += "\n".join( - [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for - error, weight in - zip(self.estimator_errors_, self.estimator_weights_)]) - step_test_metrics = np.array( - [self.plotted_metric.score(y_test, step_pred) for step_pred in - self.step_predictions]) - get_accuracy_graph(step_test_metrics, "Adaboost", - os.path.join(directory, - base_file_name + "test_metrics.png"), - self.plotted_metric_name, set="test") - np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"), - step_test_metrics, - delimiter=',') - np.savetxt( - os.path.join(directory, base_file_name + "train_metrics.csv"), - self.metrics, delimiter=',') - np.savetxt(os.path.join(directory, base_file_name + "times.csv"), - np.array([self.train_time, self.pred_time]), delimiter=',') + # interpretString += self.get_feature_importance(directory, + # base_file_name, + # feature_ids) + # interpretString += "\n\n Estimator error | Estimator weight\n" + # interpretString += "\n".join( + # [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for + # error, weight in + # zip(self.estimator_errors_, self.estimator_weights_)]) + # step_test_metrics = np.array( + # [self.plotted_metric.score(y_test, step_pred) for step_pred in + # self.step_predictions]) + # get_accuracy_graph(step_test_metrics, "Adaboost", + # os.path.join(directory, + # base_file_name + "test_metrics.png"), + # self.plotted_metric_name, set="test") + # np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"), + # step_test_metrics, + # delimiter=',') + # np.savetxt( + # os.path.join(directory, base_file_name + "train_metrics.csv"), + # self.metrics, delimiter=',') + # np.savetxt(os.path.join(directory, base_file_name + "times.csv"), + # np.array([self.train_time, self.pred_time]), delimiter=',') return interpretString diff --git a/summit/multiview_platform/monoview_classifiers/bagged_spkm.py b/summit/multiview_platform/monoview_classifiers/bagged_spkm.py new file mode 100644 index 00000000..1a74f064 --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/bagged_spkm.py @@ -0,0 +1,46 @@ +import numpy as np +from sklearn.preprocessing import LabelBinarizer + +from imblearn.under_sampling import RandomUnderSampler + +from spkm.spkm_wrapper import SPKMlikeSklearn +from spkm.kernels_and_gradients import RBFKernel, PolyKernel + +from ..monoview.monoview_utils import BaseMonoviewClassifier +from ..utils.hyper_parameter_search import CustomRandint +from ..monoview_classifiers.spkm import SPKM + +classifier_class_name = "BaggedSPKM" + +class BaggedSPKM(BaseMonoviewClassifier, SPKMlikeSklearn): + + def __init__(self, random_state=42, n_u=2, kernel=RBFKernel(0.5), + spkmregP=1, spkminit="randn", + nspkminits=10, preprocessinglist=[0,1,2], **kwargs): + + SPKM.__init__(self, random_state=random_state, + n_u=n_u, + kernel=kernel, + spkmregP=spkmregP, + spkminit=spkminit, + nspkminits=nspkminits, + preprocessinglist=preprocessinglist) + self.rus = RandomUnderSampler(random_state=random_state) + + def fit(self, X, y): + self.lb = LabelBinarizer(pos_label=1, neg_label=-1) + y = self.lb.fit_transform(y) + return SPKMlikeSklearn.fit(self, X, y[:,0],) + + def predict(self, X, preprocess=True): + return self.lb.inverse_transform(np.sign(SPKMlikeSklearn.predict(self, X))) + + def get_interpretation(self, directory, base_file_name, labels, multiclass=False): + u = self.feature_interpretability() + 
importances_sum = np.sum(u) + self.feature_importances_ = u/importances_sum + return "" + + def accepts_multi_class(self, random_state, n_samples=10, dim=2, + n_classes=3, n_views=2): + return False \ No newline at end of file diff --git a/summit/multiview_platform/monoview_classifiers/decision_tree.py b/summit/multiview_platform/monoview_classifiers/decision_tree.py index 33b99090..f8392df4 100644 --- a/summit/multiview_platform/monoview_classifiers/decision_tree.py +++ b/summit/multiview_platform/monoview_classifiers/decision_tree.py @@ -23,6 +23,7 @@ class DecisionTree(DecisionTreeClassifier, BaseMonoviewClassifier): DecisionTreeClassifier.__init__(self, max_depth=max_depth, criterion=criterion, + class_weight="balanced", splitter=splitter, random_state=random_state ) diff --git a/summit/multiview_platform/monoview_classifiers/ib_decision_tree.py b/summit/multiview_platform/monoview_classifiers/ib_decision_tree.py new file mode 100644 index 00000000..f4613fd3 --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/ib_decision_tree.py @@ -0,0 +1,44 @@ +from imblearn.ensemble import BalancedBaggingClassifier +import numpy as np +from sklearn.tree import DecisionTreeClassifier + + +from ..monoview.monoview_utils import BaseMonoviewClassifier +from ..utils.base import base_boosting_estimators +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + +classifier_class_name = "IBDT" + +class IBDT(BaseMonoviewClassifier, BalancedBaggingClassifier): + + def __init__(self, random_state=None, n_estimators=10, + sampling_strategy="auto", base_estimator=DecisionTreeClassifier(), + replacement=False, **kwargs): + super(IBDT, self).__init__(random_state=random_state, + base_estimator=base_estimator, + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement) + + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__max_depth", + "base_estimator__criterion", + "base_estimator__splitter",] + self.classed_params = ["base_estimator"] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],CustomRandint(low=1, high=300), + ["gini", "entropy"], + ["best", "random"],] + self.weird_strings=[] + + def fit(self, X, y): + BalancedBaggingClassifier.fit(self, X, y) + self.feature_importances_ = np.zeros(X.shape[1]) + for estim in self.estimators_: + if hasattr(estim['classifier'], 'feature_importances_'): + self.feature_importances_ += estim['classifier'].feature_importances_ + self.feature_importances_ /= np.sum(self.feature_importances_) + return self + + + diff --git a/summit/multiview_platform/monoview_classifiers/ib_random_forest.py b/summit/multiview_platform/monoview_classifiers/ib_random_forest.py new file mode 100644 index 00000000..2a96fa95 --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/ib_random_forest.py @@ -0,0 +1,44 @@ +from imblearn.ensemble import BalancedBaggingClassifier +import numpy as np +from sklearn.ensemble import RandomForestClassifier + + +from ..monoview.monoview_utils import BaseMonoviewClassifier +from ..utils.base import base_boosting_estimators +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + +classifier_class_name = "IBRF" + +class IBRF(BaseMonoviewClassifier, BalancedBaggingClassifier): + + def __init__(self, random_state=None, n_estimators=10, + sampling_strategy="auto", replacement=False, + base_estimator=RandomForestClassifier(), **kwargs): + super(IBRF, self).__init__(random_state=random_state, + base_estimator=base_estimator, + 
n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement) + + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__n_estimators", + "base_estimator__max_depth", + "base_estimator__criterion"] + self.classed_params = ["base_estimator"] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],CustomRandint(low=1, high=300), + CustomRandint(low=1, high=10), + ["gini", "entropy"],] + self.weird_strings=[] + + def fit(self, X, y): + BalancedBaggingClassifier.fit(self, X, y) + self.feature_importances_ = np.zeros(X.shape[1]) + for estim in self.estimators_: + if hasattr(estim['classifier'], 'feature_importances_'): + self.feature_importances_ += estim['classifier'].feature_importances_ + self.feature_importances_ /= np.sum(self.feature_importances_) + return self + + + diff --git a/summit/multiview_platform/monoview_classifiers/ib_random_scm.py b/summit/multiview_platform/monoview_classifiers/ib_random_scm.py new file mode 100644 index 00000000..f6a2e6e9 --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/ib_random_scm.py @@ -0,0 +1,50 @@ +from imblearn.ensemble import BalancedBaggingClassifier +import numpy as np + + +from ..monoview_classifiers.random_scm import ScmBagging +from ..monoview.monoview_utils import BaseMonoviewClassifier +from ..utils.base import base_boosting_estimators +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + +classifier_class_name = "IBRSCM" + +class IBRSCM(BaseMonoviewClassifier, BalancedBaggingClassifier): + + def __init__(self, random_state=None, n_estimators=10, + sampling_strategy="auto", replacement=False, + base_estimator=ScmBagging(), **kwargs): + super(IBRSCM, self).__init__(random_state=random_state, + base_estimator=base_estimator, + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement) + + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__n_estimators", + "base_estimator__max_rules", + "base_estimator__max_samples", + "base_estimator__max_features", + "base_estimator__model_type", + "base_estimator__p_options",] + self.classed_params = ["base_estimator"] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],CustomRandint(low=1, high=300), + CustomRandint(low=1, high=20), + CustomUniform(), CustomUniform(), + ["conjunction", "disjunction"], CustomUniform(), ] + self.weird_strings=[] + + def fit(self, X, y): + print(self.base_estimator.n_estimators) + BalancedBaggingClassifier.fit(self, X, y) + self.feature_importances_ = np.zeros(X.shape[1]) + for estim in self.estimators_: + if hasattr(estim['classifier'], 'feature_importances_'): + self.feature_importances_ += estim['classifier'].feature_importances_ + self.feature_importances_ /= np.sum(self.feature_importances_) + print('Fitted') + return self + + + diff --git a/summit/multiview_platform/monoview_classifiers/ib_scm.py b/summit/multiview_platform/monoview_classifiers/ib_scm.py new file mode 100644 index 00000000..aadbae57 --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/ib_scm.py @@ -0,0 +1,42 @@ +from imblearn.ensemble import BalancedBaggingClassifier +import numpy as np +from ..monoview_classifiers.scm import SCM + +from ..monoview.monoview_utils import BaseMonoviewClassifier +from ..utils.base import base_boosting_estimators +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + +classifier_class_name = "IBSCM" + +class IBSCM(BaseMonoviewClassifier, BalancedBaggingClassifier): + + def 
__init__(self, random_state=None, n_estimators=10, + sampling_strategy="auto", replacement=False, + base_estimator=SCM(), **kwargs): + super(IBSCM, self).__init__(random_state=random_state, + base_estimator=base_estimator, + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement) + + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__model_type", + "base_estimator__max_rules", "base_estimator__p",] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],["conjunction", "disjunction"], + CustomRandint(low=1, high=15), + CustomUniform(loc=0, state=1),] + self.weird_strings=[] + + def fit(self, X, y): + BalancedBaggingClassifier.fit(self, X, y) + self.feature_importances_ = np.zeros(X.shape[1]) + for estim in self.estimators_: + if hasattr(estim['classifier'], 'feature_importances_'): + self.feature_importances_ += estim['classifier'].feature_importances_ + self.feature_importances_ /= np.sum(self.feature_importances_) + return self + + + diff --git a/summit/multiview_platform/monoview_classifiers/random_forest.py b/summit/multiview_platform/monoview_classifiers/random_forest.py index f0d3578c..202ac61b 100644 --- a/summit/multiview_platform/monoview_classifiers/random_forest.py +++ b/summit/multiview_platform/monoview_classifiers/random_forest.py @@ -24,6 +24,7 @@ class RandomForest(RandomForestClassifier, BaseMonoviewClassifier): n_estimators=n_estimators, max_depth=max_depth, criterion=criterion, + class_weight="balanced", random_state=random_state ) self.param_names = ["n_estimators", "max_depth", "criterion", diff --git a/summit/multiview_platform/monoview_classifiers/spkm.py b/summit/multiview_platform/monoview_classifiers/spkm.py new file mode 100644 index 00000000..4c9aeebd --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/spkm.py @@ -0,0 +1,54 @@ +import numpy as np +from sklearn.preprocessing import LabelBinarizer + +from spkm.spkm_wrapper import SPKMlikeSklearn +from spkm.kernels_and_gradients import RBFKernel, PolyKernel + +from ..monoview.monoview_utils import BaseMonoviewClassifier +from ..utils.hyper_parameter_search import CustomRandint +from ..utils.dataset import get_samples_views_indices + +classifier_class_name = "SPKM" + +class SPKM(BaseMonoviewClassifier, SPKMlikeSklearn): + + def __init__(self, random_state=42, n_u=2, kernel=RBFKernel(0.5), + spkmregP=1, spkminit="randn", + nspkminits=10, preprocessinglist=[0,1,2], **kwargs): + + SPKMlikeSklearn.__init__(self, random_state=random_state, + n_u=n_u, + kernel=kernel, + spkmregP=spkmregP, + spkminit=spkminit, + nspkminits=nspkminits, + preprocessinglist=preprocessinglist) + self.param_names = ["n_u", "kernel", "spkmregP", + "spkminit", "nspkminits", "preprocessinglist", + "random_state"] + self.distribs = [[2], [PolyKernel({"d":3, "r":1}), RBFKernel(0.5)], + CustomRandint(-4,4, multiplier='e'), + ["data"], [10], + [[0,1],], + [random_state],] + self.more_than_two_views = False + self.weird_strings = [] + self.random_state = random_state + + def fit(self, X, y): + self.lb = LabelBinarizer(pos_label=1, neg_label=-1) + y = self.lb.fit_transform(y) + return SPKMlikeSklearn.fit(self, X, y[:,0],) + + def predict(self, X, preprocess=True): + return self.lb.inverse_transform(np.sign(SPKMlikeSklearn.predict(self, X))) + + def get_interpretation(self, directory, base_file_name, labels, multiclass=False): + u = self.feature_interpretability() + importances_sum = np.sum(u) + self.feature_importances_ = u/importances_sum + 
return "" + + def accepts_multi_class(self, random_state, n_samples=10, dim=2, + n_classes=3, n_views=2): + return False diff --git a/summit/multiview_platform/multiview/exec_multiview.py b/summit/multiview_platform/multiview/exec_multiview.py index ca2e7aed..c2abe337 100644 --- a/summit/multiview_platform/multiview/exec_multiview.py +++ b/summit/multiview_platform/multiview/exec_multiview.py @@ -17,335 +17,355 @@ __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype -def init_constants(kwargs, classification_indices, metrics, - name, nb_cores, k_folds, - dataset_var, directory): - """ - Used to init the constants - Parameters - ---------- - kwargs : - - classification_indices : - - metrics : - - name : - - nb_cores : nint number of cares to execute - - k_folds : - - dataset_var : {array-like} shape (n_samples, n_features) - dataset variable - - Returns - ------- - tuple of (classifier_name, t_start, views_indices, - classifier_config, views, learning_rate) - """ - views = kwargs["view_names"] - views_indices = kwargs["view_indices"] - if metrics is None: - metrics = {"f1_score*": {}} - classifier_name = kwargs["classifier_name"] - classifier_config = kwargs[classifier_name] - learning_rate = len(classification_indices[0]) / float( - (len(classification_indices[0]) + len(classification_indices[1]))) - t_start = time.time() - logging.info("Info\t: Classification - Database : " + str( - name) + " ; Views : " + ", ".join(views) + - " ; Algorithm : " + classifier_name + " ; Cores : " + str( - nb_cores) + ", Train ratio : " + str(learning_rate) + - ", CV on " + str(k_folds.n_splits) + " folds") +class MultiViewExp: + + def __init__(self, classifier_name="decision_tree", + classifier_config={"depth":3}, view_names=[], + view_indices=[0], nb_class=2, hps_kwargs={}, train_size=0.8, + labels_dictionary={}, database_name="", + hps_type="Random", nb_cores=1, metrics={}, + equivalent_draws=False): + self.classifier_name = classifier_name + self.classifier_config=classifier_config + self.view_names=view_names + self.view_indices=view_indices + self.nb_class=nb_class + self.hps_kwargs=hps_kwargs + self.train_size=train_size + self.labels_dictionary=labels_dictionary + self.directory=None + self.database_name=database_name + self.k_folds=None + self.split=None + self.hps_type = hps_type + self.nb_cores=nb_cores + self.metrics=metrics + self.equivalent_draws=equivalent_draws + + def add_bootstrap_info(self, directory="", k_folds=[], splits=[], + random_state=42): + self.directory = directory + self.k_folds=k_folds + self.splits=splits + self.random_state = random_state + + def init_constants(self, dataset_var ): + """ + Used to init the constants + Parameters + ---------- + kwargs : + + classification_indices : + + metrics : + + name : + + nb_cores : nint number of cares to execute + + k_folds : + + dataset_var : {array-like} shape (n_samples, n_features) + dataset variable - for view_index, view_name in zip(views_indices, views): - logging.info("Info:\t Shape of " + str(view_name) + " :" + str( - dataset_var.get_shape(view_index))) - labels = dataset_var.get_labels() - directory = os.path.join(directory, classifier_name) - base_file_name = classifier_name + "-" + dataset_var.get_name() + "-" - output_file_name = os.path.join(directory, base_file_name) - return classifier_name, t_start, views_indices, \ - classifier_config, views, learning_rate, labels, output_file_name, \ - directory, base_file_name, metrics + Returns + ------- + tuple of (classifier_name, t_start, 
views_indices, + classifier_config, views, learning_rate) + """ + # learning_rate = len(self.split[0]) / float( + # (len(classification_indices[0]) + len(classification_indices[1]))) + t_start = time.time() + logging.info("Info\t: Classification - Database : " + str( + self.database_name) + " ; Views : " + ", ".join(self.view_names) + + " ; Algorithm : " + self.classifier_name + " ; Cores : " + str( + self.nb_cores) + ", Train ratio : " + str(self.train_size) + + ", CV on " + str(self.k_folds.n_splits) + " folds") + for view_index, view_name in zip(self.view_indices, self.view_names): + logging.info("Info:\t Shape of " + str(view_name) + " :" + str( + dataset_var.get_shape(view_index))) + # labels = dataset_var.get_labels() + self.directory = os.path.join(self.directory, self.classifier_name) + self.base_file_name = self.classifier_name + "-" + dataset_var.get_name() + "-" + self.output_file_name = os.path.join(self.directory, self.base_file_name) + # return classifier_name, t_start, views_indices, \ + # classifier_config, views, learning_rate, labels, output_file_name, \ + # directory, base_file_name, metrics -def save_results(string_analysis, images_analysis, output_file_name, - confusion_matrix): # pragma: no cover - """ - Save results in derectory + def save_results(self, string_analysis, images_analysis, + confusion_matrix): # pragma: no cover + """ + Save results in derectory - Parameters - ---------- + Parameters + ---------- - classifier : classifier class + classifier : classifier class - labels_dictionary : dict dictionary of labels + labels_dictionary : dict dictionary of labels - string_analysis : str + string_analysis : str - views : + views : - classifier_module : module of the classifier + classifier_module : module of the classifier - classification_kargs : + classification_kargs : - directory : str directory + directory : str directory - learning_rate : + learning_rate : - name : + name : - images_analysis : + images_analysis : - """ - logging.info(string_analysis) - secure_file_path(output_file_name) - output_text_file = open(output_file_name + 'summary.txt', 'w', - encoding="utf-8") - output_text_file.write(string_analysis) - output_text_file.close() - np.savetxt(output_file_name + "confusion_matrix.csv", confusion_matrix, - delimiter=',') + """ + logging.info(string_analysis) + secure_file_path(self.output_file_name) + output_text_file = open(self.output_file_name + 'summary.txt', 'w', + encoding="utf-8") + output_text_file.write(string_analysis) + output_text_file.close() + np.savetxt(self.output_file_name + "confusion_matrix.csv", confusion_matrix, + delimiter=',') - if images_analysis is not None: - for image_name in images_analysis.keys(): - if os.path.isfile(output_file_name + image_name + ".png"): - for i in range(1, 20): - test_file_name = output_file_name + image_name + "-" + str( - i) + ".png" - if not os.path.isfile(test_file_name): - images_analysis[image_name].savefig(test_file_name, - transparent=True) - break + if images_analysis is not None: + for image_name in images_analysis.keys(): + if os.path.isfile(self.output_file_name + image_name + ".png"): + for i in range(1, 20): + test_file_name = self.output_file_name + image_name + "-" + str( + i) + ".png" + if not os.path.isfile(test_file_name): + images_analysis[image_name].savefig(test_file_name, + transparent=True) + break - images_analysis[image_name].savefig( - output_file_name + image_name + '.png', transparent=True) + images_analysis[image_name].savefig( + self.output_file_name + image_name + '.png', 
transparent=True) + def exec(self, dataset_var, ): + """Used to execute multiview classification and result analysis -def exec_multiview_multicore(directory, core_index, name, learning_rate, - nb_folds, - database_type, path, labels_dictionary, - random_state, labels, - hyper_param_search=False, nb_cores=1, metrics=None, - n_iter=30, **arguments): # pragma: no cover - """ - execute multiview process on + Parameters + ---------- - Parameters - ---------- + directory : indicate the directory - directory : indicate the directory - core_index : + dataset_var : - name : name of the data file to perform - - learning_rate : - - nb_folds : - - database_type : - - path : path to the data name - - labels_dictionary - - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. - - labels : - - hyper_param_search : - - nb_cores : in number of cores - - metrics : metric to use - - n_iter : int number of iterations - - arguments : others arguments - - Returns - ------- - exec_multiview on directory, dataset_var, name, learning_rate, nb_folds, 1, - database_type, path, labels_dictionary, - random_state, labels, - hyper_param_search=hyper_param_search, metrics=metrics, - n_iter=n_iter, **arguments - """ - """Used to load an HDF5 dataset_var for each parallel job and execute multiview classification""" - dataset_var = h5py.File(path + name + str(core_index) + ".hdf5", "r") - return exec_multiview(directory, dataset_var, name, learning_rate, nb_folds, - 1, - database_type, path, labels_dictionary, - random_state, labels, - hps_method=hyper_param_search, - metrics=metrics, - n_iter=n_iter, **arguments) - - -def exec_multiview(directory, dataset_var, name, classification_indices, - k_folds, - nb_cores, database_type, path, - labels_dictionary, random_state, labels, - hps_method="None", hps_kwargs={}, metrics=None, - n_iter=30, **kwargs): - """Used to execute multiview classification and result analysis - - Parameters - ---------- - - directory : indicate the directory - - - dataset_var : - - name - - classification_indices - - k_folds - - nb_cores - - database_type - - path - - labels_dictionary : dict dictionary of labels - - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. 
- - labels - - hps_method - - metrics - - n_iter : int number of iterations - - kwargs - - Returns - ------- - - ``MultiviewResult`` - """ - - logging.info("Start:\t Initialize constants") - cl_type, \ - t_start, \ - views_indices, \ - classifier_config, \ - views, \ - learning_rate, \ - labels, \ - output_file_name, \ - directory, \ - base_file_name, \ - metrics = init_constants(kwargs, classification_indices, metrics, name, - nb_cores, k_folds, dataset_var, directory) - logging.info("Done:\t Initialize constants") - - extraction_time = time.time() - t_start - logging.info("Info:\t Extraction duration " + str(extraction_time) + "s") - - logging.info("Start:\t Getting train/test split") - learning_indices, validation_indices = classification_indices - logging.info("Done:\t Getting train/test split") - - logging.info("Start:\t Getting classifiers modules") - classifier_module = getattr(multiview_classifiers, cl_type) - classifier_name = classifier_module.classifier_class_name - logging.info("Done:\t Getting classifiers modules") - - logging.info("Start:\t Optimizing hyperparameters") - hps_beg = time.monotonic() - - if hps_method != "None": - hps_method_class = getattr(hyper_parameter_search, hps_method) - estimator = getattr(classifier_module, classifier_name)( - random_state=random_state, - **classifier_config) - estimator = get_mc_estim(estimator, random_state, - multiview=True, - y=dataset_var.get_labels()[learning_indices]) - hps = hps_method_class(estimator, scoring=metrics, cv=k_folds, - random_state=random_state, framework="multiview", - n_jobs=nb_cores, - learning_indices=learning_indices, - view_indices=views_indices, **hps_kwargs) - hps.fit(dataset_var, dataset_var.get_labels(), ) - classifier_config = hps.get_best_params() - hps.gen_report(output_file_name) - hps_duration = time.monotonic() - hps_beg - classifier = get_mc_estim( - getattr(classifier_module, classifier_name)(random_state=random_state, - **classifier_config), - random_state, multiview=True, - y=dataset_var.get_labels()) - logging.info("Done:\t Optimizing hyperparameters") - logging.info("Start:\t Fitting classifier") - fit_beg = time.monotonic() - - classifier.fit(dataset_var, dataset_var.get_labels(), - train_indices=learning_indices, - view_indices=views_indices) - - fit_duration = time.monotonic() - fit_beg - logging.info("Done:\t Fitting classifier") - - logging.info("Start:\t Predicting") - train_pred = classifier.predict(dataset_var, - sample_indices=learning_indices, - view_indices=views_indices) - pred_beg = time.monotonic() - test_pred = classifier.predict(dataset_var, - sample_indices=validation_indices, - view_indices=views_indices) - pred_duration = time.monotonic() - pred_beg - full_pred = np.zeros(dataset_var.get_labels().shape, dtype=int) - 100 - full_pred[learning_indices] = train_pred - full_pred[validation_indices] = test_pred - logging.info("Done:\t Pertidcting") - - whole_duration = time.time() - t_start - logging.info( - "Info:\t Classification duration " + str(extraction_time) + "s") - - logging.info("Start:\t Result Analysis for " + cl_type) - times = (extraction_time, whole_duration) - result_analyzer = MultiviewResultAnalyzer(view_names=views, - classifier=classifier, - classification_indices=classification_indices, - k_folds=k_folds, - hps_method=hps_method, - metrics_dict=metrics, - n_iter=n_iter, - class_label_names=list( - labels_dictionary.values()), - pred=full_pred, - directory=directory, - base_file_name=base_file_name, - labels=labels, - database_name=dataset_var.get_name(), - 
nb_cores=nb_cores, - duration=whole_duration, - feature_ids=dataset_var.feature_ids) - string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ + name + + classification_indices + + k_folds + + nb_cores + + database_type + + path + + labels_dictionary : dict dictionary of labels + + random_state : int seed, RandomState instance, or None (default=None) + The seed of the pseudo random number multiview_generator to use when + shuffling the data. + + labels + + hps_method + + metrics + + n_iter : int number of iterations + + kwargs + + Returns + ------- + + ``MultiviewResult`` + """ + t_start = time.monotonic() + logging.info("Start:\t Initialize constants") + self.init_constants(dataset_var) + logging.info("Done:\t Initialize constants") + + extraction_time = time.time() - t_start + logging.info( + "Info:\t Extraction duration " + str(extraction_time) + "s") + + logging.info("Start:\t Getting train/test split") + learning_indices, validation_indices = self.splits + logging.info("Done:\t Getting train/test split") + + logging.info("Start:\t Getting classifiers modules") + classifier_module = getattr(multiview_classifiers, self.classifier_name) + classifier_name = classifier_module.classifier_class_name + logging.info("Done:\t Getting classifiers modules") + + logging.info("Start:\t Optimizing hyperparameters") + hps_beg = time.monotonic() + + if self.hps_type != "None": + hps_method_class = getattr(hyper_parameter_search, self.hps_type) + estimator = getattr(classifier_module, classifier_name)( + random_state=self.random_state, + **self.classifier_config) + estimator = get_mc_estim(estimator, self.random_state, + multiview=True, + y=dataset_var.get_labels()[ + learning_indices]) + hps = hps_method_class(estimator, scoring=self.metrics, cv=self.k_folds, + random_state=self.random_state, + framework="multiview", + n_jobs=self.nb_cores, + learning_indices=learning_indices, + view_indices=self.view_indices, + **self.hps_kwargs) + hps.fit(dataset_var, dataset_var.get_labels(), ) + classifier_config = hps.get_best_params() + hps.gen_report(self.output_file_name) + hps_duration = time.monotonic() - hps_beg + self.classifier = get_mc_estim( + getattr(classifier_module, classifier_name)( + random_state=self.random_state, + **self.classifier_config), + self.random_state, multiview=True, + y=dataset_var.get_labels()) + logging.info("Done:\t Optimizing hyperparameters") + logging.info("Start:\t Fitting classifier") + fit_beg = time.monotonic() + + self.classifier.fit(dataset_var, dataset_var.get_labels(), + train_indices=learning_indices, + view_indices=self.view_indices) + + fit_duration = time.monotonic() - fit_beg + logging.info("Done:\t Fitting classifier") + + logging.info("Start:\t Predicting") + train_pred = self.classifier.predict(dataset_var, + sample_indices=learning_indices, + view_indices=self.view_indices) + pred_beg = time.monotonic() + test_pred = self.classifier.predict(dataset_var, + sample_indices=validation_indices, + view_indices=self.view_indices) + pred_duration = time.monotonic() - pred_beg + full_pred = np.zeros(dataset_var.get_labels().shape, dtype=int) - 100 + full_pred[learning_indices] = train_pred + full_pred[validation_indices] = test_pred + logging.info("Done:\t Pertidcting") + + whole_duration = time.time() - t_start + logging.info( + "Info:\t Classification duration " + str(extraction_time) + "s") + + logging.info("Start:\t Result Analysis for " + self.classifier_name) + times = (extraction_time, whole_duration) + if "n_iter" in self.hps_kwargs: + 
self.n_iter_hps = self.hps_kwargs["n_iter"] + else: + self.n_iter_hps = 0 + result_analyzer = MultiviewResultAnalyzer(view_names=self.view_names, + classifier=self.classifier, + classification_indices=self.splits, + k_folds=self.k_folds, + hps_method=self.hps_type, + metrics_dict=self.metrics, + n_iter=self.n_iter_hps, + class_label_names=[ self.labels_dictionary[ind] + for ind in range(len(self.labels_dictionary))], + pred=full_pred, + directory=self.directory, + base_file_name=self.base_file_name, + labels=dataset_var.get_labels(), + database_name=dataset_var.get_name(), + nb_cores=self.nb_cores, + duration=whole_duration, + feature_ids=dataset_var.feature_ids) + string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ confusion_matrix = result_analyzer.analyze() - logging.info("Done:\t Result Analysis for " + cl_type) + logging.info("Done:\t Result Analysis for " + self.classifier_name) + + logging.info("Start:\t Saving preds") + self.save_results(string_analysis, images_analysis, confusion_matrix) + logging.info("Start:\t Saving preds") + + return MultiviewResult(self.classifier_name, self.classifier_config, metrics_scores, + full_pred, hps_duration, fit_duration, + pred_duration, class_metrics_scores, self.classifier) + + + + + +# def exec_multiview_multicore(directory, core_index, name, learning_rate, +# nb_folds, +# database_type, path, labels_dictionary, +# random_state, labels, +# hyper_param_search=False, nb_cores=1, metrics=None, +# n_iter=30, **arguments): # pragma: no cover +# """ +# execute multiview process on +# +# Parameters +# ---------- +# +# directory : indicate the directory +# +# core_index : +# +# name : name of the data file to perform +# +# learning_rate : +# +# nb_folds : +# +# database_type : +# +# path : path to the data name +# +# labels_dictionary +# +# random_state : int seed, RandomState instance, or None (default=None) +# The seed of the pseudo random number multiview_generator to use when +# shuffling the data. 
+# +# labels : +# +# hyper_param_search : +# +# nb_cores : in number of cores +# +# metrics : metric to use +# +# n_iter : int number of iterations +# +# arguments : others arguments +# +# Returns +# ------- +# exec_multiview on directory, dataset_var, name, learning_rate, nb_folds, 1, +# database_type, path, labels_dictionary, +# random_state, labels, +# hyper_param_search=hyper_param_search, metrics=metrics, +# n_iter=n_iter, **arguments +# """ +# """Used to load an HDF5 dataset_var for each parallel job and execute multiview classification""" +# dataset_var = h5py.File(path + name + str(core_index) + ".hdf5", "r") +# return exec_multiview(directory, dataset_var, name, learning_rate, nb_folds, +# 1, +# database_type, path, labels_dictionary, +# random_state, labels, +# hps_method=hyper_param_search, +# metrics=metrics, +# n_iter=n_iter, **arguments) + - logging.info("Start:\t Saving preds") - save_results(string_analysis, images_analysis, output_file_name, - confusion_matrix) - logging.info("Start:\t Saving preds") - return MultiviewResult(cl_type, classifier_config, metrics_scores, - full_pred, hps_duration, fit_duration, - pred_duration, class_metrics_scores, classifier) diff --git a/summit/multiview_platform/multiview_classifiers/additions/late_fusion_utils.py b/summit/multiview_platform/multiview_classifiers/additions/late_fusion_utils.py index f6c33b05..7ae5d9a2 100644 --- a/summit/multiview_platform/multiview_classifiers/additions/late_fusion_utils.py +++ b/summit/multiview_platform/multiview_classifiers/additions/late_fusion_utils.py @@ -111,8 +111,13 @@ class LateFusionClassifier(BaseMultiviewClassifier, BaseFusionClassifier): for view_index, monoview_estimator in zip(view_indices, self.monoview_estimators)] + self.get_feature_importance() return self + def get_feature_importance(self): + self.feature_importances_ = np.concatenate([clf.feature_importances_ for clf in self.monoview_estimators]) + self.feature_importances_/=np.sum(self.feature_importances_) + def init_params(self, nb_view, mutliclass=False): if self.weights is None: self.weights = np.ones(nb_view) / nb_view diff --git a/summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py b/summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py index c0c3c045..50798413 100644 --- a/summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py +++ b/summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py @@ -29,7 +29,7 @@ class SampledPWSPKM(PWSPKM,): self.rus = RandomUnderSampler(random_state=random_state) def fit(self, X, y, train_indices=None, view_indices=None): - + self.spkmregP=10 self.lb = LabelBinarizer(pos_label=1, neg_label=-1) y = self.lb.fit_transform(y) train_indices, view_indices = get_samples_views_indices(X, diff --git a/summit/multiview_platform/multiview_classifiers/early_fusion_ib_decision_tree.py b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_decision_tree.py new file mode 100644 index 00000000..7e452b21 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_decision_tree.py @@ -0,0 +1,32 @@ +from .additions.early_fusion_from_monoview import BaseEarlyFusion +from ..utils.hyper_parameter_search import CustomRandint + +# from ..utils.dataset import get_v + +classifier_class_name = "EarlyFusionIBDT" + + +class EarlyFusionIBDT(BaseEarlyFusion): + + def __init__(self, random_state=None, max_depth=None, n_estimators=10, + sampling_strategy="auto", replacement=False, + criterion='gini', splitter='best', **kwargs): + 
BaseEarlyFusion.__init__(self, random_state=random_state, + monoview_classifier="ib_decision_tree", + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement, + base_estimator__max_depth=max_depth, + base_estimator__criterion=criterion, + base_estimator__splitter=splitter, **kwargs) + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__max_depth", + "base_estimator__criterion", + "base_estimator__splitter", + 'random_state'] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],CustomRandint(low=1, high=300), + ["gini", "entropy"], + ["best", "random"], [random_state]] + self.weird_strings = {} \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_forest.py b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_forest.py new file mode 100644 index 00000000..d6863d81 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_forest.py @@ -0,0 +1,31 @@ +from .additions.early_fusion_from_monoview import BaseEarlyFusion +from ..utils.hyper_parameter_search import CustomRandint + +# from ..utils.dataset import get_v + +classifier_class_name = "EarlyFusionIBRF" + + +class EarlyFusionIBRF(BaseEarlyFusion): + + def __init__(self, random_state=None, max_depth=None, n_estimators=10, + sampling_strategy="auto", replacement=False, **kwargs): + BaseEarlyFusion.__init__(self, random_state=random_state, + monoview_classifier="ib_random_forest", + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement, + base_estimator__max_depth=max_depth, + base_estimator__criterion="gini", + base_estimator__splitter='best', **kwargs) + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__n_estimators", + "base_estimator__max_depth", + "base_estimator__criterion", + 'random_state'] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],CustomRandint(low=1, high=300), + CustomRandint(low=1, high=10), + ["gini", "entropy"], [random_state]] + self.weird_strings = {} \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_scm.py b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_scm.py new file mode 100644 index 00000000..31257914 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_random_scm.py @@ -0,0 +1,34 @@ +from .additions.early_fusion_from_monoview import BaseEarlyFusion +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + +# from ..utils.dataset import get_v + +classifier_class_name = "EarlyFusionIBRSCM" + + +class EarlyFusionIBRSCM(BaseEarlyFusion): + + def __init__(self, random_state=None, n_estimators=10, + sampling_strategy="auto", replacement=False, **kwargs): + BaseEarlyFusion.__init__(self, random_state=random_state, + monoview_classifier="ib_random_scm", + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement, + **kwargs) + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__n_estimators", + "base_estimator__max_rules", + "base_estimator__max_samples", + "base_estimator__max_features", + "base_estimator__model_type", + "base_estimator__p_options", + 'random_state'] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],CustomRandint(low=1, high=300), + CustomRandint(low=1, high=20), + 
CustomUniform(), CustomUniform(), + ["conjunction", "disjunction"], CustomUniform(), + [random_state]] + self.weird_strings = {} \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/early_fusion_ib_scm.py b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_scm.py new file mode 100644 index 00000000..d7789432 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/early_fusion_ib_scm.py @@ -0,0 +1,29 @@ +from .additions.early_fusion_from_monoview import BaseEarlyFusion +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + +# from ..utils.dataset import get_v + +classifier_class_name = "EarlyFusionIBSCM" + + +class EarlyFusionIBSCM(BaseEarlyFusion): + + def __init__(self, random_state=None, n_estimators=10, + sampling_strategy="auto", replacement=False, **kwargs): + BaseEarlyFusion.__init__(self, random_state=random_state, + monoview_classifier="ib_scm", + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement, + **kwargs) + self.param_names = ["n_estimators", "sampling_strategy", + "base_estimator__model_type", + "base_estimator__max_rules", "base_estimator__p", + 'random_state'] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=50), + ["auto"],["conjunction", "disjunction"], + CustomRandint(low=1, high=15), + CustomUniform(loc=0, state=1), + [random_state]] + self.weird_strings = {} \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/spkm_pw.py b/summit/multiview_platform/multiview_classifiers/spkm_pw.py index f0c80553..10ddd635 100644 --- a/summit/multiview_platform/multiview_classifiers/spkm_pw.py +++ b/summit/multiview_platform/multiview_classifiers/spkm_pw.py @@ -28,16 +28,19 @@ class PWSPKM(BaseMultiviewClassifier, pairwiseSPKMlikeSklearn): self.param_names = ["n_u", "kernel1", "kernel2", "spkmregP", "spkminit", "nspkminits", "preprocessinglist", "random_state"] - self.distribs = [[2], [PolyKernel({"d":3, "r":1}), RBFKernel(0.5)], [PolyKernel({"d":3, "r":1}), RBFKernel(0.5)], CustomRandint(-2,2, multiplier='e'), + self.distribs = [[2], [PolyKernel({"d":3, "r":1}), RBFKernel(0.5)], + [PolyKernel({"d":3, "r":1}), RBFKernel(0.5)], + CustomRandint(-4,4, multiplier='e'), ["data"], [10], - [[], [0], [1], [2], [0,1], [0,1,2], [0,2], [1,2]], [random_state],] + [[0,1],], + [random_state],] self.more_than_two_views = False self.random_state = random_state def fit(self, X, y, train_indices=None, view_indices=None): - self.lb = LabelBinarizer(pos_label=1, neg_label=-1) y = self.lb.fit_transform(y) + print(np.unique(y)) train_indices, view_indices = get_samples_views_indices(X, train_indices, view_indices) diff --git a/summit/multiview_platform/result_analysis/error_analysis.py b/summit/multiview_platform/result_analysis/error_analysis.py index aeec9e1e..67240446 100644 --- a/summit/multiview_platform/result_analysis/error_analysis.py +++ b/summit/multiview_platform/result_analysis/error_analysis.py @@ -7,6 +7,7 @@ import matplotlib as mpl import matplotlib.pyplot as plt import numpy as np import plotly +import pandas as pd # Import own Modules @@ -46,7 +47,7 @@ def get_sample_errors(groud_truth, results): def publish_sample_errors(sample_errors, directory, database_name, - label_names, sample_ids, labels): # pragma: no cover + label_names, sample_ids, labels, test=False): # pragma: no cover logging.info("Start:\t Label analysis figure generation") base_file_name = os.path.join(directory, database_name + "-") @@ -59,33 
+60,38 @@ def publish_sample_errors(sample_errors, directory, database_name, delimiter=",") plot_2d(data_2d, classifiers_names, nb_classifiers, base_file_name, database_name, - sample_ids=sample_ids, labels=labels, label_names=label_names) + sample_ids=sample_ids, labels=labels, label_names=label_names, test=test) plot_errors_bar(error_on_samples, nb_samples, - base_file_name, database_name, sample_ids=sample_ids) + base_file_name, database_name, sample_ids=sample_ids, test=test) logging.info("Done:\t Label analysis figures generation") def publish_all_sample_errors(iter_results, directory, stats_iter, - sample_ids, labels, data_base_name, label_names): # pragma: no cover + sample_ids, labels, data_base_name, label_names, + test=False): # pragma: no cover logging.info( "Start:\t Global label analysis figure generation") nb_samples, nb_classifiers, data, \ error_on_samples, classifier_names = gen_error_data_glob(iter_results, stats_iter) - - np.savetxt(os.path.join(directory, "clf_errors.csv"), data, delimiter=",") - np.savetxt(os.path.join(directory, "sample_errors.csv"), error_on_samples, + if test: + add='t' + else: + add = "" + np.savetxt(os.path.join(directory, "clf_errors{}.csv".format(add)), data, delimiter=",") + np.savetxt(os.path.join(directory, "sample_errors{}.csv".format(add)), error_on_samples, delimiter=",") - + df = pd.DataFrame(index = sample_ids, columns=["err"], data=1-error_on_samples) + df.to_csv(os.path.join(directory, "sample_err_df{}.csv".format(add))) plot_2d(data, classifier_names, nb_classifiers, os.path.join(directory, ""), data_base_name, stats_iter=stats_iter, - sample_ids=sample_ids, labels=labels, label_names=label_names) + sample_ids=sample_ids, labels=labels, label_names=label_names, test=test) plot_errors_bar(error_on_samples, nb_samples, os.path.join(directory, ""), data_base_name, - sample_ids=sample_ids) + sample_ids=sample_ids, test=test) logging.info( "Done:\t Global label analysis figures generation") @@ -152,7 +158,7 @@ def gen_error_data_glob(iter_results, stats_iter): def plot_2d(data, classifiers_names, nb_classifiers, file_name, dataset_name, labels=None, - stats_iter=1, use_plotly=True, sample_ids=None, label_names=None): # pragma: no cover + stats_iter=1, use_plotly=True, sample_ids=None, label_names=None, test=False): # pragma: no cover r"""Used to generate a 2D plot of the errors. 
Parameters @@ -178,6 +184,10 @@ def plot_2d(data, classifiers_names, nb_classifiers, file_name, dataset_name, la Returns ------- """ + if test: + add='t' + else: + add = "" if label_names is None: label_names = [str(lab) for lab in np.sort(np.unique(labels))] fig, ax = plt.subplots(nrows=1, ncols=1, ) @@ -225,13 +235,13 @@ def plot_2d(data, classifiers_names, nb_classifiers, file_name, dataset_name, la fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)') fig.update_xaxes(showticklabels=True, ) - plotly.offline.plot(fig, filename=file_name + "error_analysis_2D.html", + plotly.offline.plot(fig, filename=file_name + "error_analysis_2D{}.html".format(add), auto_open=False) del fig def plot_errors_bar(error_on_samples, nb_samples, file_name, dataset_name, - use_plotly=True, sample_ids=None): # pragma: no cover + use_plotly=True, sample_ids=None, test=False): # pragma: no cover r"""Used to generate a barplot of the muber of classifiers that failed to classify each samples Parameters @@ -250,6 +260,10 @@ def plot_errors_bar(error_on_samples, nb_samples, file_name, dataset_name, Returns ------- """ + if test: + add='t' + else: + add = "" fig, ax = plt.subplots() x = np.arange(nb_samples) plt.bar(x, 1 - error_on_samples) @@ -265,7 +279,7 @@ def plot_errors_bar(error_on_samples, nb_samples, file_name, dataset_name, dataset_name) ) - plotly.offline.plot(fig, filename=file_name + "error_analysis_bar.html", + plotly.offline.plot(fig, filename=file_name + "error_analysis_bar{}.html".format(add), auto_open=False) diff --git a/summit/multiview_platform/result_analysis/execution.py b/summit/multiview_platform/result_analysis/execution.py index 931d6186..68943f69 100644 --- a/summit/multiview_platform/result_analysis/execution.py +++ b/summit/multiview_platform/result_analysis/execution.py @@ -14,25 +14,25 @@ from .tracebacks_analysis import save_failed, publish_tracebacks def analyze(results, stats_iter, benchmark_argument_dictionaries, metrics, directory, sample_ids, labels, feature_ids, - view_names): # pragma: no cover + view_names, test=False): # pragma: no cover """Used to analyze the results of the previous benchmarks""" - data_base_name = benchmark_argument_dictionaries[0]["args"]["name"] + data_base_name = benchmark_argument_dictionaries[0].database_name results_means_std, iter_results, flagged_failed, label_names = analyze_iterations( - results, benchmark_argument_dictionaries, - stats_iter, metrics, sample_ids, labels, feature_ids, view_names) + results, stats_iter, metrics, sample_ids, labels, feature_ids, view_names, test=test) if flagged_failed: save_failed(flagged_failed, directory) if stats_iter > 1: results_means_std = analyze_all( iter_results, stats_iter, directory, - data_base_name, sample_ids, label_names) + data_base_name, sample_ids, label_names, test=test) return results_means_std -def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, - metrics, sample_ids, labels, feature_ids, view_names): +def analyze_iterations(bootstrap, stats_iter, + metrics, sample_ids, labels, feature_ids, view_names, + test=False): r"""Used to extract and format the results of the different experimentations performed. 
@@ -74,39 +74,40 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, "durations": [i for i in range(stats_iter)]} flagged_tracebacks_list = [] fig_errors = [] - for iter_index, result, tracebacks in results: - arguments = get_arguments(benchmark_argument_dictionaries, iter_index) - labels_names = list(arguments["labels_dictionary"].values()) + for bootstrap_iter in bootstrap: + labels_names = [bootstrap_iter.labels_dictionary[ind] + for ind in range(len(bootstrap_iter.labels_dictionary))] metrics_scores, class_metric_scores = get_metrics_scores(metrics, - result, + bootstrap_iter.results, labels_names) - sample_errors = get_sample_errors(labels, result) - feature_importances = get_feature_importances(result, + sample_errors = get_sample_errors(labels, bootstrap_iter.results) + feature_importances = get_feature_importances(bootstrap_iter.results, feature_ids=feature_ids, view_names=view_names,) - durations = get_duration(result) - directory = arguments["directory"] + durations = get_duration(bootstrap_iter.results) + directory = bootstrap_iter.directory - database_name = arguments["args"]["name"] + database_name = bootstrap_iter.database_name flagged_tracebacks_list += publish_tracebacks(directory, database_name, - labels_names, tracebacks, - iter_index) + labels_names, + bootstrap_iter.traceback_outputs, + bootstrap_iter.flag) res = publish_metrics_graphs(metrics_scores, directory, database_name, - labels_names, class_metric_scores) + labels_names, class_metric_scores, test=test) publish_sample_errors(sample_errors, directory, database_name, - labels_names, sample_ids, labels) + labels_names, sample_ids, labels, test=test) publish_feature_importances(feature_importances, directory, - database_name, metric_scores=metrics_scores) + database_name, metric_scores=metrics_scores, test=test) plot_durations(durations, directory, database_name) - iter_results["metrics_scores"][iter_index] = metrics_scores - iter_results["class_metrics_scores"][iter_index] = class_metric_scores - iter_results["sample_errors"][iter_index] = sample_errors - iter_results["feature_importances"][iter_index] = feature_importances + iter_results["metrics_scores"][bootstrap_iter.flag] = metrics_scores + iter_results["class_metrics_scores"][bootstrap_iter.flag] = class_metric_scores + iter_results["sample_errors"][bootstrap_iter.flag] = sample_errors + iter_results["feature_importances"][bootstrap_iter.flag] = feature_importances iter_results["labels"] = labels - iter_results["durations"][iter_index] = durations + iter_results["durations"][bootstrap_iter.flag] = durations logging.info("Done:\t Analyzing all results") @@ -114,7 +115,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, def analyze_all(iter_results, stats_iter, directory, data_base_name, - sample_ids, label_names): # pragma: no cover + sample_ids, label_names, test=False): # pragma: no cover """Used to format the results in order to plot the mean results on the iterations""" metrics_analysis, class_metrics_analysis, error_analysis, feature_importances, \ @@ -125,12 +126,13 @@ def analyze_all(iter_results, stats_iter, directory, data_base_name, class_metrics_analysis, directory, data_base_name, stats_iter, - label_names) + label_names, test_b=test) publish_all_sample_errors(error_analysis, directory, stats_iter, - sample_ids, labels, data_base_name, label_names) + sample_ids, labels, data_base_name, label_names, + test=test) publish_feature_importances(feature_importances, directory, data_base_name, 
feature_importances_stds, - metric_scores=metrics_analysis) + metric_scores=metrics_analysis, test=test) plot_durations(duration_means, directory, data_base_name, duration_stds) return results @@ -155,7 +157,7 @@ def get_arguments(benchmark_argument_dictionaries, iter_index): needed experimentation. """ for benchmark_argument_dictionary in benchmark_argument_dictionaries: - if benchmark_argument_dictionary["flag"] == iter_index: + if benchmark_argument_dictionary.flag == iter_index: return benchmark_argument_dictionary diff --git a/summit/multiview_platform/result_analysis/feature_importances.py b/summit/multiview_platform/result_analysis/feature_importances.py index b40161da..49de95ed 100644 --- a/summit/multiview_platform/result_analysis/feature_importances.py +++ b/summit/multiview_platform/result_analysis/feature_importances.py @@ -30,7 +30,7 @@ def get_feature_importances(result, feature_ids=None, view_names=None,): feature_importances[classifier_result.view_name] = pd.DataFrame( index=feature_ids[classifier_result.view_index]) if hasattr(classifier_result.clf, 'feature_importances_'): - print(classifier_result.classifier_name, classifier_result.view_name) + # print(classifier_result.classifier_name, classifier_result.view_name) feature_importances[classifier_result.view_name][ classifier_result.classifier_name] = classifier_result.clf.feature_importances_ else: @@ -50,11 +50,13 @@ def get_feature_importances(result, feature_ids=None, view_names=None,): feature_importances["mv"][classifier_result.classifier_name] = concat/np.sum(concat) else: feature_importances["mv"][classifier_result.classifier_name] = classifier_result.clf.feature_importances_ + else: + feature_importances["mv"][classifier_result.classifier_name] = 0 return feature_importances def publish_feature_importances(feature_importances, directory, database_name, - feature_stds=None, metric_scores=None): # pragma: no cover + feature_stds=None, metric_scores=None, test=False): # pragma: no cover importance_dfs = [] std_dfs = [] if not os.path.exists(os.path.join(directory, "feature_importances")): @@ -98,18 +100,22 @@ def publish_feature_importances(feature_importances, directory, database_name, columns=feature_importances["mv"].columns).fillna(0) feature_std_df = pd.concat([feature_std_df, fake], axis=1,).fillna(0) plot_feature_importances(os.path.join(directory, "feature_importances", - database_name), feature_importances_df, feature_std_df) + database_name), feature_importances_df, feature_std_df, test=test) if metric_scores is not None: plot_feature_relevance(os.path.join(directory, "feature_importances", - database_name), feature_importances_df, feature_std_df, metric_scores) + database_name), feature_importances_df, feature_std_df, metric_scores, test=test) def plot_feature_importances(file_name, feature_importance, - feature_std): # pragma: no cover + feature_std, test=False): # pragma: no cover s = feature_importance.sum(axis=1) s = s[s!=0] + if test: + add='t' + else: + add = "" feature_importance = feature_importance.loc[s.sort_values(ascending=False).index] - feature_importance.to_csv(file_name + "_dataframe.csv") + feature_importance.to_csv(file_name + "_dataframe{}.csv".format(add)) hover_text = [["-Feature :" + str(feature_name) + "<br>-Classifier : " + classifier_name + "<br>-Importance : " + str( @@ -131,18 +137,19 @@ def plot_feature_importances(file_name, feature_importance, # yaxis={"showgrid": False, "showticklabels": False, "ticks": ''}) fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', 
plot_bgcolor='rgba(0,0,0,0)') - plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False) + plotly.offline.plot(fig, filename=file_name + "{}.html".format(add), auto_open=False) del fig def plot_feature_relevance(file_name, feature_importance, - feature_std, metric_scores): # pragma: no cover + feature_std, metric_scores, test=False): # pragma: no cover for metric, score_df in metric_scores.items(): if metric.endswith("*"): if isinstance(score_df, dict): score_df = score_df["mean"] for score in score_df.columns: + print(score) if len(score.split("-"))>1: algo, view = score.split("-") list_ind = [ind for ind in feature_importance.index if ind.startswith(view)] @@ -151,4 +158,4 @@ def plot_feature_relevance(file_name, feature_importance, feature_importance[score] *= score_df[score]['test'] file_name+="_relevance" plot_feature_importances(file_name, feature_importance, - feature_std) + feature_std, test=test) diff --git a/summit/multiview_platform/result_analysis/metric_analysis.py b/summit/multiview_platform/result_analysis/metric_analysis.py index 560976d5..ba265a4c 100644 --- a/summit/multiview_platform/result_analysis/metric_analysis.py +++ b/summit/multiview_platform/result_analysis/metric_analysis.py @@ -50,6 +50,7 @@ def get_metrics_scores(metrics, results, label_names): for metric in metrics.keys(): for classifier_result in results: + # print(classifier_result) metrics_scores[metric].loc[ "train", classifier_result.get_classifier_name()] = \ classifier_result.metrics_scores[metric][0] @@ -73,7 +74,7 @@ def get_metrics_scores(metrics, results, label_names): def publish_metrics_graphs(metrics_scores, directory, database_name, labels_names, - class_metric_scores): # pragma: no cover + class_metric_scores, test=False): # pragma: no cover r"""Used to sort the results (names and both scores) in descending test score order. @@ -107,7 +108,7 @@ def publish_metrics_graphs(metrics_scores, directory, database_name, plot_metric_scores(train_scores, test_scores, classifier_names, nb_results, metric_name, file_name, database_name, - tag=" vs ".join(labels_names)) + tag=" vs ".join(labels_names), test=test) class_file_name = file_name+"-class" plot_class_metric_scores(class_test_scores, class_file_name, @@ -119,7 +120,7 @@ def publish_metrics_graphs(metrics_scores, directory, database_name, def publish_all_metrics_scores(iter_results, class_iter_results, directory, data_base_name, stats_iter, label_names, - min_size=10): # pragma: no cover + min_size=10, test_b=False): # pragma: no cover results = [] secure_file_path(os.path.join(directory, "a")) @@ -138,7 +139,7 @@ def publish_all_metrics_scores(iter_results, class_iter_results, directory, plot_metric_scores(train, test, classifier_names, nb_results, metric_name, file_name, data_base_name, tag="Averaged", - train_STDs=train_std, test_STDs=test_std) + train_STDs=train_std, test_STDs=test_std, test=test_b) results += [[classifier_name, metric_name, test_mean, test_std] for classifier_name, test_mean, test_std in zip(classifier_names, test, test_std)] @@ -188,7 +189,7 @@ def plot_metric_scores(train_scores, test_scores, names, nb_results, metric_name, file_name, dataset_name, tag="", train_STDs=None, test_STDs=None, - use_plotly=True): # pragma: no cover + use_plotly=True, test=False): # pragma: no cover r"""Used to plot and save the score barplot for a specific metric. 
Parameters @@ -215,7 +216,10 @@ def plot_metric_scores(train_scores, test_scores, names, nb_results, Returns ------- """ - + if test: + add='t' + else: + add = "" figKW, barWidth = get_fig_size(nb_results) names, train_scores, test_scores, train_STDs, test_STDs = sort_by_test_score( @@ -255,7 +259,7 @@ def plot_metric_scores(train_scores, test_scores, names, nb_results, test_scores.reshape((train_scores.shape[0], 1)), test_STDs.reshape((train_scores.shape[0], 1))), axis=1)), columns=names, index=["Train", "Train STD", "Test", "Test STD"]) - dataframe.to_csv(file_name + ".csv") + dataframe.to_csv(file_name + "{}.csv".format(add)) if use_plotly: fig = plotly.graph_objs.Figure() fig.add_trace(plotly.graph_objs.Bar( @@ -275,7 +279,7 @@ def plot_metric_scores(train_scores, test_scores, names, nb_results, title="Dataset : {}, metric : {}, task : {} <br> Scores for each classifier <br> Generated on <a href='https://baptiste.bauvin.pages.lis-lab.fr/summit'>SuMMIT</a>.".format(dataset_name, metric_name, tag)) fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)') - plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False) + plotly.offline.plot(fig, filename=file_name + "{}.html".format(add), auto_open=False) del fig diff --git a/summit/multiview_platform/utils/base.py b/summit/multiview_platform/utils/base.py index 3e578e11..754b00f7 100644 --- a/summit/multiview_platform/utils/base.py +++ b/summit/multiview_platform/utils/base.py @@ -285,9 +285,11 @@ class ResultAnalyser(): y_true=self.labels[self.train_indices], y_pred=self.pred[self.train_indices], **metric_kwargs) + test_score = metric_module.score(y_true=self.labels[self.test_indices], y_pred=self.pred[self.test_indices], **metric_kwargs) + return class_train_scores, class_test_scores, train_score, test_score def print_metric_score(self, ): diff --git a/summit/multiview_platform/utils/compression.py b/summit/multiview_platform/utils/compression.py index f27c1462..4af07e0a 100644 --- a/summit/multiview_platform/utils/compression.py +++ b/summit/multiview_platform/utils/compression.py @@ -43,5 +43,9 @@ def remove_compressed(exp_path): if __name__=="__main__": - - simplify_plotly("/home/baptiste/Documents/Gitwork/summit/results/hepatitis/debug_started_2022_03_16-15_06_55__/hepatitis-mean_on_10_iter-balanced_accuracy_p.html") + for dir in os.listdir("/home/baptiste/Documents/Gitwork/summit/results/"): + print(dir) + for exp in os.listdir((os.path.join("/home/baptiste/Documents/Gitwork/summit/results/", dir))): + print("\t", exp) + explore_files(os.path.join("/home/baptiste/Documents/Gitwork/summit/results/", dir, exp)) + # simplify_plotly("/home//baptiste/Documents/Gitwork/summit/results/hepatitis/debug_started_2022_03_16-15_06_55__/hepatitis-mean_on_10_iter-balanced_accuracy_p.html") diff --git a/summit/multiview_platform/utils/configuration.py b/summit/multiview_platform/utils/configuration.py index 9c79b83b..07504b6c 100644 --- a/summit/multiview_platform/utils/configuration.py +++ b/summit/multiview_platform/utils/configuration.py @@ -20,7 +20,7 @@ def get_the_args(path_to_config_file=os.path.join(os.path.dirname(package_path), """ with open(path_to_config_file, 'r') as stream: yaml_config = yaml.safe_load(stream) - return pass_default_config(**yaml_config) + return yaml_config def pass_default_config(log=True, diff --git a/summit/multiview_platform/utils/hyper_parameter_search.py b/summit/multiview_platform/utils/hyper_parameter_search.py index f55a90ef..266f0443 100644 --- 
a/summit/multiview_platform/utils/hyper_parameter_search.py +++ b/summit/multiview_platform/utils/hyper_parameter_search.py @@ -157,7 +157,7 @@ class Random(RandomizedSearchCV, HPSearch): refit=False, n_jobs=1, scoring=None, cv=None, random_state=None, learning_indices=None, view_indices=None, framework="monoview", - equivalent_draws=True, track_tracebacks=True): + equivalent_draws=False, track_tracebacks=True): param_distributions = self.get_param_distribs(estimator, param_distributions) diff --git a/summit/multiview_platform/utils/multiclass.py b/summit/multiview_platform/utils/multiclass.py index 98c6c1d6..a8fd4960 100644 --- a/summit/multiview_platform/utils/multiclass.py +++ b/summit/multiview_platform/utils/multiclass.py @@ -81,7 +81,7 @@ class MultiClassWrapper: params.pop("estimator") return params - def get_interpretation(self, directory, base_file_name, y_test=None): + def get_interpretation(self, directory, base_file_name, feature_ids, y_test=None): # TODO : Multiclass interpretation return "Multiclass wrapper is not interpretable yet" @@ -264,8 +264,9 @@ class MultiviewOVOWrapper(MultiviewWrapper, OneVsOneClassifier): ]))) self.estimators_ = estimators_indices[0] + pairwise = self._get_tags()["pairwise"] self.pairwise_indices_ = ( - estimators_indices[1] if self._pairwise else None) + estimators_indices[1] if pairwise else None) return self -- GitLab
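
For reference, below is a minimal, standalone sketch (not part of the patch) of the pattern shared by the new ib_* monoview classifiers (IBDT, IBRF, IBSCM, IBRSCM): an imblearn BalancedBaggingClassifier wrapped around a base learner, with the per-bag feature importances summed and renormalised into a single feature_importances_ vector. The synthetic dataset and the estimator= keyword are assumptions for illustration only; recent imblearn releases name the parameter estimator, while the patch targets an older API that still accepts base_estimator.

import numpy as np
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

# Synthetic, imbalanced two-class problem (stand-in for a SuMMIT view).
X, y = make_classification(n_samples=200, n_features=10, weights=[0.9, 0.1],
                           random_state=42)

# Each bag is rebalanced by random under-sampling before the base learner is fitted,
# mirroring IBDT with a DecisionTreeClassifier base estimator.
clf = BalancedBaggingClassifier(estimator=DecisionTreeClassifier(max_depth=3),
                                n_estimators=10, sampling_strategy="auto",
                                replacement=False, random_state=42)
clf.fit(X, y)

# Every fitted member is a (sampler, classifier) pipeline; aggregate the tree
# importances and renormalise, as the ib_* fit() methods do.
feature_importances = np.zeros(X.shape[1])
for estim in clf.estimators_:
    if hasattr(estim["classifier"], "feature_importances_"):
        feature_importances += estim["classifier"].feature_importances_
feature_importances /= feature_importances.sum()
print(feature_importances)

In the patch, this aggregated feature_importances_ attribute is what the result_analysis feature-importance plots pick up for the new classifiers.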
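
Similarly, the SPKM, BaggedSPKM and SampledPWSPKM wrappers share one ±1 label shim: fit() binarises the labels with LabelBinarizer(pos_label=1, neg_label=-1) and trains on the first binarised column, while predict() takes the sign of the raw scores and maps it back through inverse_transform. A minimal sketch of that shim follows; since the spkm package API is not reproduced here, sklearn's Ridge regressor stands in for the margin-based SPKM learner (an assumption, not the actual SPKMlikeSklearn class).

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import Ridge
from sklearn.preprocessing import LabelBinarizer

X, y = make_classification(n_samples=100, n_features=5, random_state=0)

# fit(): binarise the {0, 1} labels to {-1, +1} and train on the first column.
lb = LabelBinarizer(pos_label=1, neg_label=-1)
y_pm = lb.fit_transform(y)[:, 0]
reg = Ridge().fit(X, y_pm)  # stand-in for the SPKM learner's fit

# predict(): sign of the raw scores, mapped back to the original label space.
y_pred = lb.inverse_transform(np.sign(reg.predict(X)))
print((y_pred == y).mean())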