From aa680623109b23a30e4aa8124d3d51158ae8aee9 Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Tue, 11 Oct 2022 09:36:11 -0400 Subject: [PATCH] Testing the class --- summit/execute.py | 5 +- summit/multiview_platform/exec_classif.py | 1696 ++++++++++------- ...agging_mincq.py => __scm_bagging_mincq.py} | 2 +- .../multiview/exec_multiview.py | 3 +- .../multiview_classifiers/bagged_spkm_pw.py | 49 + .../early_fusion_random_scm.py | 26 + .../multiview_classifiers/spkm_pw.py | 12 +- summit/multiview_platform/utils/execution.py | 419 +--- 8 files changed, 1139 insertions(+), 1073 deletions(-) rename summit/multiview_platform/monoview_classifiers/{scm_bagging_mincq.py => __scm_bagging_mincq.py} (98%) create mode 100644 summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py create mode 100644 summit/multiview_platform/multiview_classifiers/early_fusion_random_scm.py diff --git a/summit/execute.py b/summit/execute.py index 30196e99..ec09ba2a 100644 --- a/summit/execute.py +++ b/summit/execute.py @@ -8,7 +8,7 @@ def execute(config_path=None): # pragma: no cover from summit.multiview_platform import exec_classif if config_path is None: - exec_classif.exec_classif(sys.argv[1:]) + sum = exec_classif.Summit(config_path=sys.argv[1:]) else: if config_path == "example 0": config_path = os.path.join( @@ -59,7 +59,8 @@ def execute(config_path=None): # pragma: no cover "examples", "config_files", "config_example_3.yml") - exec_classif.exec_classif(["--config_path", config_path]) + sum = exec_classif.Summit(["--config_path", config_path]) + sum.exec_classif() if __name__ == "__main__": diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py index 11697f4b..1dd98420 100644 --- a/summit/multiview_platform/exec_classif.py +++ b/summit/multiview_platform/exec_classif.py @@ -3,6 +3,9 @@ import os import pkgutil import time import traceback +import argparse +import pickle + import matplotlib import numpy as np @@ -14,8 +17,12 @@ from .monoview.exec_classif_mono_view import exec_monoview from .multiview.exec_multiview import exec_multiview from .result_analysis.execution import analyze_iterations, analyze from .utils import execution, dataset, configuration +from .utils.execution import BaseExec +from .utils.configuration import save_config from .utils.dataset import delete_HDF5 from .utils.organization import secure_file_path +from .utils import get_multiview_db as DB + matplotlib.use( 'Agg') # Anti-Grain Geometry C++ library to make a raster (pixel) image of the figure @@ -24,663 +31,1064 @@ matplotlib.use( __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype +package_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +class Summit(BaseExec): + + def __init__(self, log=True, + name=["plausible", ], + label="_", + file_type=".hdf5", + views=None, + pathf=os.path.join(os.path.dirname(package_path), "data", ""), + nice=0, + random_state=42, + nb_cores=1, + full=True, + debug=False, + add_noise=False, + noise_std=0.0, + res_dir=os.path.join(os.path.dirname(package_path),"results", ""), + track_tracebacks=True, + split=0.49, + nb_folds=5, + nb_class=None, + classes=None, + cl_type=["multiview", ], + algos_monoview=["all"], + algos_multiview=["svm_jumbo_fusion", ], + stats_iter=2, + metrics={"accuracy_score": {}, "f1_score": {}}, + metric_princ="accuracy_score", + hps_type="Random", + hps_iter=1, + hps_kwargs={'n_iter': 10, "equivalent_draws": True}, + 
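+                 # NOTE: when `config_path` (next argument) is given, the
+                 # keyword defaults above are ignored and every setting is
+                 # re-read from the YAML configuration file instead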
config_path=None, + **kwargs): + if config_path is not None: + args = self.parse_the_args(config_path) + args = configuration.get_the_args(args.config_path) + Summit.__init__(self, **args) + else: + self.log = log + if type(name)==list: + self.name = name[0] + else: + self.name = name + self.label = label + self.file_type = file_type + self.views = views + self.pathf = pathf + self.nice = nice + self.random_state = random_state + self.nb_cores = nb_cores + self.full = full + self.debug = debug + self.add_noise = add_noise + self.noise_std = noise_std + self.res_dir = res_dir + self.track_tracebacks = track_tracebacks + self.split = split + self.nb_folds = nb_folds + self.nb_class = nb_class + self.classes = classes + self.cl_type = cl_type + self.algos_monoview = algos_monoview + self.algos_multiview = algos_multiview + self.stats_iter = stats_iter + self.metrics = metrics + self.metric_princ = metric_princ + self.hps_type = hps_type + self.hps_iter = hps_iter + self.hps_kwargs = hps_kwargs + + def exec_classif(self, ): # pragma: no cover + """ + Runs the benchmark with the given arguments + + Parameters + ---------- + arguments : + + Returns + ------- + + + >>> exec_classif([--config_path, /path/to/config/files/]) + >>> + """ + start = time.time() + + import sys + if not sys.platform in ["win32", "cygwin"]: + os.nice(self.nice) + if self.nb_cores == 1: + os.environ['OPENBLAS_NUM_THREADS'] = '1' + self.find_dataset_names() + # noise_results = [] + # for noise_std in args["noise_std"]: + + self.init_log_file() + self.init_random_state() + + self.init_stats_iter_random_states() -def init_benchmark(cl_type, monoview_algos, multiview_algos): - r"""Used to create a list of all the algorithm packages names used for the benchmark. - - First this function will check if the benchmark need mono- or/and multiview - algorithms and adds to the right - dictionary the asked algorithms. If none is asked by the user, all will be added. - - If the keyword `"Benchmark"` is used, all mono- and multiview algorithms will be added. - - Parameters - ---------- - cl_type : List of string - List of types of needed benchmark - multiview_algos : List of strings - List of multiview algorithms needed for the benchmark - monoview_algos : Listof strings - List of monoview algorithms needed for the benchmark - args : ParsedArgumentParser args - All the input args (used to tune the algorithms) - - Returns - ------- - benchmark : Dictionary of dictionaries - Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. 
- """ - benchmark = {"monoview": {}, "multiview": {}} - - if "monoview" in cl_type: - if monoview_algos == ['all']: # pragma: no cover - benchmark["monoview"] = [name for _, name, isPackage in - pkgutil.iter_modules( - monoview_classifiers.__path__) - if not isPackage] + get_database = self.get_database_function() + self.dataset_var, self.labels_dictionary, self.name = get_database( + self.views, + self.pathf, dataset_name, + self.nb_class, + self.classes, + self.random_state, + self.full, + ) + self.gen_splits() + + self.gen_k_folds() + + self.init_views() + self.views_dictionary = self.dataset_var.get_view_dict() + self.nb_views = len(self.views) + self.nb_class = self.dataset_var.get_nb_class() + + if self.metrics == "all": + metrics_names = [name for _, name, isPackage + in pkgutil.iter_modules( + [os.path.join(os.path.dirname( + os.path.dirname(os.path.realpath(__file__))), + 'metrics')]) if + not isPackage and name not in ["framework", + "log_loss", + "matthews_corrcoef", + "roc_auc_score"]] + self.metrics = dict((metric_name, {}) + for metric_name in metrics_names) + self.arange_metrics() + + self.init_benchmark() + self.init_kwargs_func() + data_base_time = time.time() - start + self.init_argument_dictionaries() + directories = self.gen_direcorties_names() + self.gen_argument_dictionaries() + self.exec_benchmark() + + def parse_the_args(self, arguments): + """Used to parse the args entered by the user""" + + parser = argparse.ArgumentParser( + description='This file is used to benchmark the scores fo multiple ' + 'classification algorithm on multiview data.', + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars='@') + + groupStandard = parser.add_argument_group('Standard arguments') + groupStandard.add_argument('--config_path', metavar='STRING', + action='store', + help='Path to the hdf5 dataset or database ' + 'folder (default: %(default)s)', + default='../config_files/config.yml') + args = parser.parse_args(arguments) + return args + + + def init_random_state(self,): + r""" + Used to init a random state. + If no random state is specified, it will generate a 'random' seed. + If the `randomSateArg` is a string containing only numbers, it will be converted in + an int to generate a seed. + If the `randomSateArg` is a string with letters, it must be a path to a pickled random + state file that will be loaded. + The function will also pickle the new random state in a file tobe able to retrieve it later. + Tested + + + Parameters + ---------- + random_state_arg : None or string + See function description. + directory : string + Path to the results directory. + + Returns + ------- + random_state : numpy.random.RandomState object + This random state will be used all along the benchmark . + """ + + if self.random_state is None: + self.random_state = np.random.RandomState(self.random_state) + else: + try: + seed = int(self.random_state) + self.random_state = np.random.RandomState(seed) + except ValueError: + file_name = self.random_state + with open(file_name, 'rb') as handle: + self.random_state = pickle.load(handle) + with open(os.path.join(self.res_dir, "random_state.pickle"), "wb") as handle: + pickle.dump(self.random_state, handle) + + + def init_stats_iter_random_states(self,): + r""" + Used to initialize multiple random states if needed because of multiple statistical iteration of the same benchmark + + Parameters + ---------- + stats_iter : int + Number of statistical iterations of the same benchmark done (with a different random state). 
+ random_state : numpy.random.RandomState object + The random state of the whole experimentation, that will be used to generate the ones for each + statistical iteration. + + Returns + ------- + stats_iter_random_states : list of numpy.random.RandomState objects + Multiple random states, one for each sattistical iteration of the same benchmark. + """ + if self.stats_iter > 1: + self.stats_iter_random_states = [ + np.random.RandomState(self.random_state.randint(5000)) for _ in + range(self.stats_iter)] else: - benchmark["monoview"] = monoview_algos - - if "multiview" in cl_type: - if multiview_algos == ["all"]: # pragma: no cover - benchmark["multiview"] = [name for _, name, isPackage in - pkgutil.iter_modules( - multiview_classifiers.__path__) - if not isPackage] + self.stats_iter_random_states = [self.random_state] + + + def get_database_function(self,): + r"""Used to get the right database extraction function according to the type of database and it's name + + Parameters + ---------- + name : string + Name of the database. + type_var : string + type of dataset hdf5 or csv + + Returns + ------- + getDatabase : function + The function that will be used to extract the database + """ + if self.name not in ["fake", "plausible"]: + get_database = getattr(DB, "get_classic_db_" + self.type[1:]) else: - benchmark["multiview"] = multiview_algos - return benchmark - - -def init_argument_dictionaries(benchmark, views_dictionary, - nb_class, init_kwargs, hps_method, - hps_kwargs): # pragma: no cover - argument_dictionaries = {"monoview": [], "multiview": []} - if benchmark["monoview"]: - argument_dictionaries["monoview"] = init_monoview_exps( - benchmark["monoview"], - views_dictionary, - nb_class, - init_kwargs["monoview"], hps_method, hps_kwargs) - if benchmark["multiview"]: - argument_dictionaries["multiview"] = init_multiview_exps( - benchmark["multiview"], - views_dictionary, - nb_class, - init_kwargs["multiview"], hps_method, hps_kwargs) - return argument_dictionaries - - -def init_multiview_exps(classifier_names, views_dictionary, nb_class, - kwargs_init, hps_method, - hps_kwargs): # pragma: no cover - multiview_arguments = [] - for classifier_name in classifier_names: - arguments = get_path_dict(kwargs_init[classifier_name]) - if hps_method == "Grid": - multiview_arguments += [ - gen_single_multiview_arg_dictionary(classifier_name, - arguments, - nb_class, - {"param_grid": hps_kwargs[ - classifier_name]}, - views_dictionary=views_dictionary)] - elif hps_method == "Random": - hps_kwargs = get_random_hps_args(hps_kwargs, classifier_name) - multiview_arguments += [ - gen_single_multiview_arg_dictionary(classifier_name, - arguments, - nb_class, - hps_kwargs, - views_dictionary=views_dictionary)] - elif hps_method == "None": - multiview_arguments += [ - gen_single_multiview_arg_dictionary(classifier_name, - arguments, - nb_class, - hps_kwargs, - views_dictionary=views_dictionary)] + get_database = getattr(DB, "get_" + self.name + "_db_" + self.type[1:]) + return get_database + + + def init_log_file(self): + r"""Used to init the directory where the preds will be stored and the log file. + + First this function will check if the result directory already exists (only one per minute is allowed). + + If the the result directory name is available, it is created, and the logfile is initiated. + + Parameters + ---------- + name : string + Name of the database. + views : list of strings + List of the view names that will be used in the benchmark. 
+ cl_type : list of strings + Type of benchmark that will be made . + log : bool + Whether to show the log file in console or hide it. + debug : bool + for debug option + label : str for label + + result_directory : str name of the result directory + + add_noise : bool for add noise + + noise_std : level of std noise + + Returns + ------- + results_directory : string + Reference to the main results directory for the benchmark. + """ + if self.views is None: + self.views = [] + # result_directory = os.path.join(os.path.dirname( + # os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), + # result_directory) + if self.debug: + self.res_dir = os.path.join(self.res_dir, self.name, + "debug_started_" + time.strftime( + "%Y_%m_%d-%H_%M_%S") + "_" + self.label) else: - raise ValueError('At the moment only "None", "Random" or "Grid" ' - 'are available as hyper-parameter search ' - 'methods, sadly "{}" is not'.format(hps_method) - ) - - return multiview_arguments - - -def init_monoview_exps(classifier_names, - views_dictionary, nb_class, kwargs_init, hps_method, - hps_kwargs): # pragma: no cover - r"""Used to add each monoview exeperience args to the list of monoview experiences args. - - First this function will check if the benchmark need mono- or/and multiview algorithms and adds to the right - dictionary the asked algorithms. If none is asked by the user, all will be added. - - If the keyword `"Benchmark"` is used, all mono- and multiview algorithms will be added. - - Parameters - ---------- - classifier_names : dictionary - All types of monoview and multiview experiments that have to be benchmarked - argument_dictionaries : dictionary - Maps monoview and multiview experiments arguments. - views_dictionary : dictionary - Maps the view names to their index in the HDF5 dataset - nb_class : integer - Number of different labels in the classification - - Returns - ------- - benchmark : Dictionary of dictionaries - Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. - """ - monoview_arguments = [] - for view_name, view_index in views_dictionary.items(): + self.res_dir = os.path.join(self.res_dir, self.name, + "started_" + time.strftime( + "%Y_%m_%d-%H_%M") + "_" + self.label) + log_file_name = time.strftime("%Y_%m_%d-%H_%M") + "-" + ''.join( + self.cl_type) + "-" + "_".join(self.views) + "-" + self.name + "-LOG.log" + if os.path.exists(self.res_dir): # pragma: no cover + raise NameError("The result dir already exists, wait 1 min and retry") + log_file_path = os.path.join(self.res_dir, log_file_name) + os.makedirs(os.path.dirname(log_file_path)) + logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', + filename=log_file_path, level=logging.INFO, + filemode='w') + if self.log: + logging.getLogger().addHandler(logging.StreamHandler()) + save_config(self.res_dir, self.__dict__) + + + def gen_splits(self,): + r"""Used to _gen the train/test splits using one or multiple random states. + + Parameters + ---------- + labels : numpy.ndarray + Name of the database. + split_ratio : float + The ratio of samples between train and test set. + stats_iter_random_states : list of numpy.random.RandomState + The random states for each statistical iteration. + + Returns + ------- + splits : list of lists of numpy.ndarray + For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and + the ones of the testing set. 
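+
+        Example
+        -------
+        With ``stats_iter=2``, ``self.splits`` ends up holding two
+        ``[train_indices, test_indices]`` couples, one per statistical
+        iteration, each drawn by a stratified shuffle split seeded with the
+        matching entry of ``self.stats_iter_random_states``.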
+ """ + indices = np.arange(len(self.dataset_var.get_labels())) + self.splits = [] + for random_state in self.stats_iter_random_states: + folds_obj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, + random_state=random_state, + test_size=self.split) + folds = folds_obj.split(indices, self.dataset_var.get_labels()) + for fold in folds: + train_fold, test_fold = fold + train_indices = indices[train_fold] + test_indices = indices[test_fold] + self.splits.append([train_indices, test_indices]) + + + def gen_k_folds(self,): + r"""Used to generate folds indices for cross validation for each statistical iteration. + + Parameters + ---------- + stats_iter : integer + Number of statistical iterations of the benchmark. + nb_folds : integer + The number of cross-validation folds for the benchmark. + stats_iter_random_states : list of numpy.random.RandomState + The random states for each statistical iteration. + + Returns + ------- + folds_list : list of list of sklearn.model_selection.StratifiedKFold + For each statistical iteration a Kfold stratified (keeping the ratio between classes in each fold). + """ + if self.stats_iter > 1: + self.folds_list = [] + for random_state in self.stats_iter_random_states: + self.folds_list.append( + sklearn.model_selection.StratifiedKFold(n_splits=self.nb_folds, + random_state=random_state, + shuffle=True)) + else: + if isinstance(self.stats_iter_random_states, list): + self.stats_iter_random_states = self.stats_iter_random_states[0] + self.folds_list = [sklearn.model_selection.StratifiedKFold(n_splits=self.nb_folds, + random_state=self.stats_iter_random_states, + shuffle=True)] + + + def init_views(self,): + r"""Used to return the views names that will be used by the + benchmark, their indices and all the views names. + + Parameters + ---------- + dataset_var : HDF5 dataset file + The full dataset that wil be used by the benchmark. + arg_views : list of strings + The views that will be used by the benchmark (arg). + + Returns + ------- + views : list of strings + Names of the views that will be used by the benchmark. + view_indices : list of ints + The list of the indices of the view that will be used in the benchmark (according to the dataset). + all_views : list of strings + Names of all the available views in the dataset. + """ + nb_view = self.dataset_var.nb_view + if self.arg_views is not None: + allowed_views = self.arg_views + self.all_views = [str(self.dataset_var.get_view_name(view_index)) + if not isinstance(self.dataset_var.get_view_name(view_index), bytes) + else self.dataset_var.get_view_name(view_index).decode("utf-8") + for view_index in range(nb_view)] + self.views = [] + self.views_indices = [] + for view_index in range(nb_view): + view_name = self.dataset_var.get_view_name(view_index) + if isinstance(view_name, bytes): + view_name = view_name.decode("utf-8") + if view_name in allowed_views: + self.views.append(view_name) + self.views_indices.append(view_index) + else: + self.views = [str(self.dataset_var.get_view_name(view_index)) + if not isinstance(self.dataset_var.get_view_name(view_index), bytes) + else self.dataset_var.get_view_name(view_index).decode("utf-8") + for view_index in range(nb_view)] + self.views_indices = range(nb_view) + self.all_views = self.views + + + def gen_direcorties_names(self,): + r"""Used to generate the different directories of each iteration if needed. + + Parameters + ---------- + directory : string + Path to the results directory. + statsIter : int + The number of statistical iterations. 
+ + Returns + ------- + directories : list of strings + Paths to each statistical iterations result directory. + """ + if self.stats_iter > 1: + self.directories = [] + for i in range(self.stats_iter): + self.directories.append(os.path.join(self.directory, "iter_" + str(i + 1))) + else: + self.directories = [self.directory] + + def find_dataset_names(self, ): + """This function goal is to browse the dataset directory and extrats all + the needed dataset names.""" + package_path = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + if os.path.isdir(self.pathf): + pass + elif os.path.isdir(os.path.join(package_path, self.pathf)): + self.pathf = os.path.join(package_path, self.pathf) + else: + raise ValueError("The provided pathf does not exist ({}) SuMMIT checks " + "the prefix from where you are running your script ({}) " + "and the summit package prefix ({}). " + "You may want to try with an absolute path in the " + "config file".format(self.pathf, os.getcwd(), package_path)) + available_file_names = [file_name.strip().split(".")[0] + for file_name in + os.listdir(self.pathf) + if file_name.endswith(self.file_type)] + print(self.name) + self.dataset_list = [self.name] + if self.dataset_list == ["all"]: + self.dataset_list = available_file_names + elif isinstance(self.dataset_list, str): + self.dataset_list = [used_name for used_name in available_file_names if + self.dataset_list == used_name] + elif len(self.dataset_list) > 1: + selected_names = [used_name for used_name in available_file_names if + used_name in self.dataset_list] + if not selected_names: + raise ValueError( + "None of the provided dataset names are available. Available datasets are {}".format( + available_file_names)) + self.dataset_list = [used_name for used_name in available_file_names if + used_name in self.dataset_list] + elif self.dataset_list[0] in available_file_names: + pass + else: + raise ValueError( + "The asked dataset ({}) is not available in {}. \n The available ones are {}".format( + self.dataset_list[0], self.pathf, available_file_names)) + + + def gen_argument_dictionaries(self,): # pragma: no cover + r"""Used to generate a dictionary for each benchmark. + + One for each label combination (if multiclass), for each statistical iteration, generates an dictionary with + all necessary information to perform the benchmark + + Parameters + ---------- + labels_dictionary : dictionary + Dictionary mapping labels indices to labels names. + directories : list of strings + List of the paths to the result directories for each statistical iteration. + multiclass_labels : list of lists of numpy.ndarray + For each label couple, for each statistical iteration a triplet of numpy.ndarrays is stored with the + indices for the biclass training set, the ones for the biclass testing set and the ones for the + multiclass testing set. + labels_combinations : list of lists of numpy.ndarray + Each original couple of different labels. + indices_multiclass : list of lists of numpy.ndarray + For each combination, contains a biclass labels numpy.ndarray with the 0/1 labels of combination. + hyper_param_search : string + Type of hyper parameter optimization method + args : parsed args objects + All the args passed by the user. + k_folds : list of list of sklearn.model_selection.StratifiedKFold + For each statistical iteration a Kfold stratified (keeping the ratio between classes in each fold). 
+ stats_iter_random_states : list of numpy.random.RandomState objects + Multiple random states, one for each sattistical iteration of the same benchmark. + metrics : list of lists + metrics that will be used to evaluate the algorithms performance. + argument_dictionaries : dictionary + Dictionary resuming all the specific arguments for the benchmark, oe dictionary for each classifier. + benchmark : dictionary + Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. + nb_views : int + THe number of views used by the benchmark. + views : list of strings + List of the names of the used views. + views_indices : list of ints + List of indices (according to the dataset) of the used views. + + Returns + ------- + benchmarkArgumentDictionaries : list of dicts + All the needed arguments for the benchmarks. + + """ + self.benchmark_argument_dictionaries = [] + for iter_index, iterRandomState in enumerate(self.stats_iter_random_states): + benchmark_argument_dictionary = { + "labels_dictionary": self.labels_dictionary, + "directory": self.directories[iter_index], + "classification_indices": self.splits[iter_index], + "args": self.args, + "k_folds": self.k_folds[iter_index], + "random_state": iterRandomState, + "hyper_param_search": self.hyper_param_search, + "metrics": self.metrics, + "argument_dictionaries": self.argument_dictionaries, + "benchmark": self.benchmark, + "views": self.views, + "views_indices": self.views_indices, + "flag": iter_index} + self.benchmark_argument_dictionaries.append(benchmark_argument_dictionary) + + + def init_benchmark(self, ): + r"""Used to create a list of all the algorithm packages names used for the benchmark. + + First this function will check if the benchmark need mono- or/and multiview + algorithms and adds to the right + dictionary the asked algorithms. If none is asked by the user, all will be added. + + If the keyword `"Benchmark"` is used, all mono- and multiview algorithms will be added. + + Parameters + ---------- + cl_type : List of string + List of types of needed benchmark + multiview_algos : List of strings + List of multiview algorithms needed for the benchmark + monoview_algos : Listof strings + List of monoview algorithms needed for the benchmark + args : ParsedArgumentParser args + All the input args (used to tune the algorithms) + + Returns + ------- + benchmark : Dictionary of dictionaries + Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. 
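+
+        Example
+        -------
+        With ``cl_type=["monoview", "multiview"]`` and the (purely
+        illustrative) algorithm lists ``["decision_tree"]`` and
+        ``["weighted_linear_late_fusion"]``, the resulting benchmark
+        dictionary is ``{"monoview": ["decision_tree"],
+        "multiview": ["weighted_linear_late_fusion"]}``.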
+ """ + benchmark = {"monoview": {}, "multiview": {}} + + if "monoview" in self.cl_type: + if self.monoview_algos == ['all']: # pragma: no cover + self.benchmark["monoview"] = [name for _, name, isPackage in + pkgutil.iter_modules( + monoview_classifiers.__path__) + if not isPackage] + + else: + self.benchmark["monoview"] = self.monoview_algos + + if "multiview" in self.cl_type: + if self.multiview_algos == ["all"]: # pragma: no cover + self.benchmark["multiview"] = [name for _, name, isPackage in + pkgutil.iter_modules( + multiview_classifiers.__path__) + if not isPackage] + else: + self.benchmark["multiview"] = self.multiview_algos + + + def init_argument_dictionaries(self, ): # pragma: no cover + self.argument_dictionaries = {"monoview": [], "multiview": []} + if self.benchmark["monoview"]: + self.argument_dictionaries["monoview"] = self.init_monoview_exps( + self.benchmark["monoview"], + self.views_dictionary, + self.nb_class, + self.init_kwargs["monoview"], self.hps_method, self.hps_kwargs) + if self.benchmark["multiview"]: + self.argument_dictionaries["multiview"] = self.init_multiview_exps( + self.benchmark["multiview"], + self.views_dictionary, + self.nb_class, + self.init_kwargs["multiview"], self.hps_method, self.hps_kwargs) + + + def init_multiview_exps(self, classifier_names, views_dictionary, nb_class, + kwargs_init, hps_method, + hps_kwargs): # pragma: no cover + multiview_arguments = [] for classifier_name in classifier_names: + arguments = self.get_path_dict(kwargs_init[classifier_name]) if hps_method == "Grid": - arguments = gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - {"param_grid": - hps_kwargs[ - classifier_name]}) + multiview_arguments += [ + self.gen_single_multiview_arg_dictionary(classifier_name, + arguments, + nb_class, + {"param_grid": hps_kwargs[ + classifier_name]}, + views_dictionary=views_dictionary)] elif hps_method == "Random": - hps_kwargs = get_random_hps_args(hps_kwargs, classifier_name) - arguments = gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - hps_kwargs) + hps_kwargs = self.get_random_hps_args(hps_kwargs, classifier_name) + multiview_arguments += [ + self.gen_single_multiview_arg_dictionary(classifier_name, + arguments, + nb_class, + hps_kwargs, + views_dictionary=views_dictionary)] elif hps_method == "None": - arguments = gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - hps_kwargs) - + multiview_arguments += [ + self.gen_single_multiview_arg_dictionary(classifier_name, + arguments, + nb_class, + hps_kwargs, + views_dictionary=views_dictionary)] else: - raise ValueError( - 'At the moment only "None", "Random" or "Grid" ' - 'are available as hyper-parameter search ' - 'methods, sadly "{}" is not'.format(hps_method) - ) - monoview_arguments.append(arguments) - return monoview_arguments - - -def get_random_hps_args(hps_args, classifier_name): - hps_dict = {} - for key, value in hps_args.items(): - if key in ["n_iter", "equivalent_draws"]: - hps_dict[key] = value - if key==classifier_name: - hps_dict["param_distributions"] = value - return hps_dict - - -def gen_single_monoview_arg_dictionary(classifier_name, arguments, nb_class, - view_index, view_name, hps_kwargs): - if classifier_name in arguments: - classifier_config = dict((key, value) for key, value in arguments[ - classifier_name].items()) - else: - classifier_config = {} - return {classifier_name: classifier_config, - 
"view_name": view_name, - "view_index": view_index, - "classifier_name": classifier_name, - "nb_class": nb_class, - "hps_kwargs": hps_kwargs} - - -def gen_single_multiview_arg_dictionary(classifier_name, arguments, nb_class, - hps_kwargs, views_dictionary=None): - return {"classifier_name": classifier_name, - "view_names": list(views_dictionary.keys()), - 'view_indices': list(views_dictionary.values()), - "nb_class": nb_class, - "labels_names": None, - "hps_kwargs": hps_kwargs, - classifier_name: extract_dict(arguments) - } - - -def extract_dict(classifier_config): - """Reverse function of get_path_dict""" - extracted_dict = {} - for key, value in classifier_config.items(): - extracted_dict = set_element(extracted_dict, key, value) - return extracted_dict - - -def set_element(dictionary, path, value): - """Set value in dictionary at the location indicated by path""" - existing_keys = path.split(".")[:-1] - dict_state = dictionary - for existing_key in existing_keys: - if existing_key in dict_state: - dict_state = dict_state[existing_key] + raise ValueError('At the moment only "None", "Random" or "Grid" ' + 'are available as hyper-parameter search ' + 'methods, sadly "{}" is not'.format(hps_method) + ) + + return multiview_arguments + + + def init_monoview_exps(self, classifier_names, + views_dictionary, nb_class, kwargs_init, hps_method, + hps_kwargs): # pragma: no cover + r"""Used to add each monoview exeperience args to the list of monoview experiences args. + + First this function will check if the benchmark need mono- or/and multiview algorithms and adds to the right + dictionary the asked algorithms. If none is asked by the user, all will be added. + + If the keyword `"Benchmark"` is used, all mono- and multiview algorithms will be added. + + Parameters + ---------- + classifier_names : dictionary + All types of monoview and multiview experiments that have to be benchmarked + argument_dictionaries : dictionary + Maps monoview and multiview experiments arguments. + views_dictionary : dictionary + Maps the view names to their index in the HDF5 dataset + nb_class : integer + Number of different labels in the classification + + Returns + ------- + benchmark : Dictionary of dictionaries + Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. 
+ """ + monoview_arguments = [] + for view_name, view_index in views_dictionary.items(): + for classifier_name in classifier_names: + if hps_method == "Grid": + arguments = self.gen_single_monoview_arg_dictionary(classifier_name, + kwargs_init, + nb_class, + view_index, + view_name, + {"param_grid": + hps_kwargs[ + classifier_name]}) + elif hps_method == "Random": + hps_kwargs = self.get_random_hps_args(hps_kwargs, classifier_name) + arguments = self.gen_single_monoview_arg_dictionary(classifier_name, + kwargs_init, + nb_class, + view_index, + view_name, + hps_kwargs) + elif hps_method == "None": + arguments = self.gen_single_monoview_arg_dictionary(classifier_name, + kwargs_init, + nb_class, + view_index, + view_name, + hps_kwargs) + + else: + raise ValueError( + 'At the moment only "None", "Random" or "Grid" ' + 'are available as hyper-parameter search ' + 'methods, sadly "{}" is not'.format(hps_method) + ) + monoview_arguments.append(arguments) + return monoview_arguments + + + def get_random_hps_args(self, hps_args, classifier_name): + hps_dict = {} + for key, value in hps_args.items(): + if key in ["n_iter", "equivalent_draws"]: + hps_dict[key] = value + if key==classifier_name: + hps_dict["param_distributions"] = value + return hps_dict + + + def gen_single_monoview_arg_dictionary(self, classifier_name, arguments, nb_class, + view_index, view_name, hps_kwargs): + if classifier_name in arguments: + classifier_config = dict((key, value) for key, value in arguments[ + classifier_name].items()) else: - dict_state[existing_key] = {} - dict_state = dict_state[existing_key] - dict_state[path.split(".")[-1]] = value - return dictionary - - -def get_path_dict(multiview_classifier_args): - """This function is used to generate a dictionary with each key being - the path to the value. - If given {"key1":{"key1_1":value1}, "key2":value2}, it will return - {"key1.key1_1":value1, "key2":value2}""" - path_dict = dict( - (key, value) for key, value in multiview_classifier_args.items()) - paths = is_dict_in(path_dict) - while paths: - for path in paths: - for key, value in path_dict[path].items(): - path_dict[".".join([path, key])] = value - path_dict.pop(path) - paths = is_dict_in(path_dict) - return path_dict - - -def is_dict_in(dictionary): - """ - Returns True if any of the dictionary value is a dictionary itself. - - Parameters - ---------- - dictionary - - Returns - ------- - - """ - paths = [] - for key, value in dictionary.items(): - if isinstance(value, dict): - paths.append(key) - return paths - - -def init_kwargs(args, classifiers_names, framework="monoview"): - r"""Used to init kwargs thanks to a function in each monoview classifier package. - - Parameters - ---------- - args : parsed args objects - All the args passed by the user. - classifiers_names : list of strings - List of the benchmarks's monoview classifiers names. - - Returns - ------- - kwargs : Dictionary - Dictionary resuming all the specific arguments for the benchmark, one dictionary for each classifier. 
- - For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`""" - - logging.info("Start:\t Initializing monoview classifiers arguments") - kwargs = {} - for classifiers_name in classifiers_names: - try: - if framework == "monoview": - getattr(monoview_classifiers, classifiers_name) + classifier_config = {} + return {classifier_name: classifier_config, + "view_name": view_name, + "view_index": view_index, + "classifier_name": classifier_name, + "nb_class": nb_class, + "hps_kwargs": hps_kwargs} + + + def gen_single_multiview_arg_dictionary(self, classifier_name, arguments, nb_class, + hps_kwargs, views_dictionary=None): + return {"classifier_name": classifier_name, + "view_names": list(views_dictionary.keys()), + 'view_indices': list(views_dictionary.values()), + "nb_class": nb_class, + "labels_names": None, + "hps_kwargs": hps_kwargs, + classifier_name: self.extract_dict(arguments) + } + + + def extract_dict(self, classifier_config): + """Reverse function of get_path_dict""" + extracted_dict = {} + for key, value in classifier_config.items(): + extracted_dict = self.set_element(extracted_dict, key, value) + return extracted_dict + + + def set_element(self, dictionary, path, value): + """Set value in dictionary at the location indicated by path""" + existing_keys = path.split(".")[:-1] + dict_state = dictionary + for existing_key in existing_keys: + if existing_key in dict_state: + dict_state = dict_state[existing_key] else: - getattr(multiview_classifiers, classifiers_name) - except AttributeError: - raise AttributeError( - classifiers_name + " is not implemented in monoview_classifiers, " - "please specify the name of the file in monoview_classifiers") - if classifiers_name in args: - kwargs[classifiers_name] = args[classifiers_name] + dict_state[existing_key] = {} + dict_state = dict_state[existing_key] + dict_state[path.split(".")[-1]] = value + return dictionary + + + def get_path_dict(self, multiview_classifier_args): + """This function is used to generate a dictionary with each key being + the path to the value. + If given {"key1":{"key1_1":value1}, "key2":value2}, it will return + {"key1.key1_1":value1, "key2":value2}""" + path_dict = dict( + (key, value) for key, value in multiview_classifier_args.items()) + paths = self.is_dict_in(path_dict) + while paths: + for path in paths: + for key, value in path_dict[path].items(): + path_dict[".".join([path, key])] = value + path_dict.pop(path) + paths = self.is_dict_in(path_dict) + return path_dict + + + def is_dict_in(self, dictionary): + """ + Returns True if any of the dictionary value is a dictionary itself. + + Parameters + ---------- + dictionary + + Returns + ------- + + """ + paths = [] + for key, value in dictionary.items(): + if isinstance(value, dict): + paths.append(key) + return paths + + + def init_kwargs(self, args, classifiers_names, framework="monoview"): + r"""Used to init kwargs thanks to a function in each monoview classifier package. + + Parameters + ---------- + args : parsed args objects + All the args passed by the user. + classifiers_names : list of strings + List of the benchmarks's monoview classifiers names. + + Returns + ------- + kwargs : Dictionary + Dictionary resuming all the specific arguments for the benchmark, one dictionary for each classifier. 
+ + For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`""" + + logging.info("Start:\t Initializing monoview classifiers arguments") + kwargs = {} + for classifiers_name in classifiers_names: + try: + if framework == "monoview": + getattr(monoview_classifiers, classifiers_name) + else: + getattr(multiview_classifiers, classifiers_name) + except AttributeError: + raise AttributeError( + classifiers_name + " is not implemented in monoview_classifiers, " + "please specify the name of the file in monoview_classifiers") + if classifiers_name in args: + kwargs[classifiers_name] = args[classifiers_name] + else: + kwargs[classifiers_name] = {} + logging.info("Done:\t Initializing monoview classifiers arguments") + + return kwargs + + + def init_kwargs_func(self, args, benchmark): + """ + Dispached the kwargs initialization to monoview and multiview and creates + the kwargs variable + + Parameters + ---------- + args : parsed args objects + All the args passed by the user. + + benchmark : dict + The name of the mono- and mutli-view classifiers to run in the benchmark + + Returns + ------- + + kwargs : dict + The arguments for each mono- and multiview algorithms + """ + monoview_kwargs = self.init_kwargs(args, benchmark["monoview"], + framework="monoview") + multiview_kwargs = self.init_kwargs(args, benchmark["multiview"], + framework="multiview") + kwargs = {"monoview": monoview_kwargs, "multiview": multiview_kwargs} + return kwargs + + + def arange_metrics(self,): + """Used to get the metrics list in the right order so that + the first one is the principal metric specified in args + + Parameters + ---------- + metrics : dict + The metrics that will be used in the benchmark + + metric_princ : str + The name of the metric that need to be used for the hyper-parameter + optimization process + + Returns + ------- + metrics : list of lists + The metrics list, but arranged so the first one is the principal one.""" + if self.metric_princ in self.metrics: + self.metrics = dict( + (key, value) if not key == self.metric_princ else (key + "*", value) for + key, value in self.metrics.items()) else: - kwargs[classifiers_name] = {} - logging.info("Done:\t Initializing monoview classifiers arguments") - - return kwargs - - -def init_kwargs_func(args, benchmark): - """ - Dispached the kwargs initialization to monoview and multiview and creates - the kwargs variable - - Parameters - ---------- - args : parsed args objects - All the args passed by the user. 
- - benchmark : dict - The name of the mono- and mutli-view classifiers to run in the benchmark - - Returns - ------- - - kwargs : dict - The arguments for each mono- and multiview algorithms - """ - monoview_kwargs = init_kwargs(args, benchmark["monoview"], - framework="monoview") - multiview_kwargs = init_kwargs(args, benchmark["multiview"], - framework="multiview") - kwargs = {"monoview": monoview_kwargs, "multiview": multiview_kwargs} - return kwargs - - -def arange_metrics(metrics, metric_princ): - """Used to get the metrics list in the right order so that - the first one is the principal metric specified in args - - Parameters - ---------- - metrics : dict - The metrics that will be used in the benchmark - - metric_princ : str - The name of the metric that need to be used for the hyper-parameter - optimization process - - Returns - ------- - metrics : list of lists - The metrics list, but arranged so the first one is the principal one.""" - if metric_princ in metrics: - metrics = dict( - (key, value) if not key == metric_princ else (key + "*", value) for - key, value in metrics.items()) - else: - raise ValueError("{} not in metric pool ({})".format(metric_princ, - metrics)) - return metrics - - -def benchmark_init(directory, classification_indices, labels, labels_dictionary, - k_folds, dataset_var): - """ - Initializes the benchmark, by saving the indices of the train - samples and the cross validation folds. - - Parameters - ---------- - directory : str - The benchmark's result directory - - classification_indices : numpy array - The indices of the samples, splitted for the train/test split - - labels : numpy array - The labels of the dataset - - labels_dictionary : dict - The dictionary with labels as keys and their names as values - - k_folds : sklearn.model_selection.Folds object - The folds for the cross validation process - - Returns - ------- - - """ - logging.info("Start:\t Benchmark initialization") - secure_file_path(os.path.join(directory, "train_labels.csv")) - train_indices = classification_indices[0] - train_labels = dataset_var.get_labels(sample_indices=train_indices) - np.savetxt(os.path.join(directory, "train_labels.csv"), train_labels, - delimiter=",") - np.savetxt(os.path.join(directory, "train_indices.csv"), - classification_indices[0], - delimiter=",") - results_monoview = [] - folds = k_folds.split(np.arange(len(train_labels)), train_labels) - min_fold_len = int(len(train_labels) / k_folds.n_splits) - for fold_index, (train_cv_indices, test_cv_indices) in enumerate(folds): - file_name = os.path.join(directory, "folds", "test_labels_fold_" + str( - fold_index) + ".csv") - secure_file_path(file_name) - np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]], + raise ValueError("{} not in metric pool ({})".format(self.metric_princ, + self.metrics)) + + + def benchmark_init(self, ): + """ + Initializes the benchmark, by saving the indices of the train + samples and the cross validation folds. 
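+        One csv file per cross-validation fold is also written in the
+        ``folds/`` sub-directory, so the folds can be inspected afterwards.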
+ + Parameters + ---------- + directory : str + The benchmark's result directory + + classification_indices : numpy array + The indices of the samples, splitted for the train/test split + + labels : numpy array + The labels of the dataset + + labels_dictionary : dict + The dictionary with labels as keys and their names as values + + k_folds : sklearn.model_selection.Folds object + The folds for the cross validation process + + Returns + ------- + + """ + logging.info("Start:\t Benchmark initialization") + secure_file_path(os.path.join(self.directory, "train_labels.csv")) + train_indices = self.classification_indices[0] + train_labels = self.dataset_var.get_labels(sample_indices=train_indices) + np.savetxt(os.path.join(self.directory, "train_labels.csv"), train_labels, delimiter=",") - labels_names = list(labels_dictionary.values()) - logging.info("Done:\t Benchmark initialization") - return results_monoview, labels_names - - -def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, - directory=None, classification_indices=None, - args=None, - k_folds=None, random_state=None, - hyper_param_search=None, metrics=None, - argument_dictionaries=None, - benchmark=None, views=None, views_indices=None, - flag=None, labels=None, - track_tracebacks=False, nb_cores=1): # pragma: no cover - results_monoview, labels_names = benchmark_init(directory, - classification_indices, - labels, - labels_dictionary, k_folds, - dataset_var) - logging.getLogger('matplotlib.font_manager').disabled = True - logging.info("Start:\t monoview benchmark") - traceback_outputs = {} - for arguments in argument_dictionaries["monoview"]: - try: - X = dataset_var.get_v(arguments["view_index"]) - Y = dataset_var.get_labels() - results_monoview += [ - exec_monoview(directory, X, Y, args["name"], labels_names, - classification_indices, k_folds, - nb_cores, args["file_type"], args["pathf"], random_state, - hyper_param_search=hyper_param_search, - metrics=metrics, feature_ids=dataset_var.feature_ids[arguments["view_index"]], - **arguments)] - except BaseException: - if track_tracebacks: - traceback_outputs[ - arguments["classifier_name"] + "-" + arguments[ - "view_name"]] = traceback.format_exc() - else: - raise - logging.info("Done:\t monoview benchmark") - - logging.info("Start:\t multiview benchmark") - results_multiview = [] - for arguments in argument_dictionaries["multiview"]: - try: - results_multiview += [ - exec_multiview(directory, dataset_var, args["name"], - classification_indices, - k_folds, nb_cores, args["file_type"], - args["pathf"], labels_dictionary, random_state, - labels, - hps_method=hyper_param_search, - metrics=metrics, n_iter=args["hps_iter"], - **arguments)] - except BaseException: - if track_tracebacks: - traceback_outputs[ - arguments["classifier_name"]] = traceback.format_exc() - else: - raise - logging.info("Done:\t multiview benchmark") - - return [flag, results_monoview + results_multiview, traceback_outputs] - - -def exec_benchmark(nb_cores, stats_iter, - benchmark_arguments_dictionaries, - directory, metrics, dataset_var, track_tracebacks, - exec_one_benchmark_mono_core=exec_one_benchmark_mono_core, - analyze=analyze, delete=delete_HDF5, - analyze_iterations=analyze_iterations): # pragma: no cover - r"""Used to execute the needed benchmark(s) on multicore or mono-core functions. - - Parameters - ---------- - nb_cores : int - Number of threads that the benchmarks can use. - stats_iter : int - Number of statistical iterations that have to be done. 
- benchmark_arguments_dictionaries : list of dictionaries - All the needed arguments for the benchmarks. - classification_indices : list of lists of numpy.ndarray - For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and - the ones of the testing set. - directories : list of strings - List of the paths to the result directories for each statistical iteration. - directory : string - Path to the main results directory. - multi_class_labels : ist of lists of numpy.ndarray - For each label couple, for each statistical iteration a triplet of numpy.ndarrays is stored with the - indices for the biclass training set, the ones for the biclass testing set and the ones for the - multiclass testing set. - metrics : list of lists - metrics that will be used to evaluate the algorithms performance. - labels_dictionary : dictionary - Dictionary mapping labels indices to labels names. - nb_labels : int - Total number of different labels in the dataset. - dataset_var : HDF5 dataset file - The full dataset that wil be used by the benchmark. - classifiers_names : list of strings - List of the benchmarks's monoview classifiers names. - rest_of_the_args : - Just used for testing purposes - - - Returns - ------- - results : list of lists - The results of the benchmark. - """ - logging.info("Start:\t Executing all the needed benchmarks") - results = [] - for arguments in benchmark_arguments_dictionaries: - benchmark_results = exec_one_benchmark_mono_core( - dataset_var=dataset_var, - track_tracebacks=track_tracebacks, nb_cores=nb_cores, - **arguments) - analyze_iterations([benchmark_results], - benchmark_arguments_dictionaries, stats_iter, - metrics, sample_ids=dataset_var.sample_ids, - labels=dataset_var.get_labels(), - feature_ids=dataset_var.feature_ids, - view_names=dataset_var.view_names) - results += [benchmark_results] - logging.info("Done:\t Executing all the needed benchmarks") - - # Do everything with flagging - logging.info("Start:\t Analyzing predictions") - results_mean_stds = analyze(results, stats_iter, - benchmark_arguments_dictionaries, - metrics, - directory, - dataset_var.sample_ids, - dataset_var.get_labels(),dataset_var.feature_ids, - dataset_var.view_names) - logging.info("Done:\t Analyzing predictions") - return results_mean_stds - - -def exec_classif(arguments): # pragma: no cover - """ - Runs the benchmark with the given arguments - - Parameters - ---------- - arguments : - - Returns - ------- - - - >>> exec_classif([--config_path, /path/to/config/files/]) - >>> - """ - start = time.time() - args = execution.parse_the_args(arguments) - args = configuration.get_the_args(args.config_path) - import sys - if not sys.platform in ["win32", "cygwin"]: - os.nice(args["nice"]) - nb_cores = args["nb_cores"] - if nb_cores == 1: - os.environ['OPENBLAS_NUM_THREADS'] = '1' - stats_iter = args["stats_iter"] - hps_method = args["hps_type"] - hps_kwargs = args["hps_args"] - cl_type = args["type"] - monoview_algos = args["algos_monoview"] - multiview_algos = args["algos_multiview"] - path, dataset_list = execution.find_dataset_names(args["pathf"], - args["file_type"], - args["name"]) - args["pathf"] = path - for dataset_name in dataset_list: - # noise_results = [] - # for noise_std in args["noise_std"]: + np.savetxt(os.path.join(self.directory, "train_indices.csv"), + self.classification_indices[0], + delimiter=",") + self.results_monoview = [] + folds = self.k_folds.split(np.arange(len(train_labels)), train_labels) + min_fold_len = 
int(len(train_labels) / self.k_folds.n_splits) + for fold_index, (train_cv_indices, test_cv_indices) in enumerate(folds): + file_name = os.path.join(self.directory, "folds", "test_labels_fold_" + str( + fold_index) + ".csv") + secure_file_path(file_name) + np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]], + delimiter=",") + self.labels_names = list(self.labels_dictionary.values()) + logging.info("Done:\t Benchmark initialization") + + + def exec_one_benchmark_mono_core(self, ): # pragma: no cover + self.benchmark_init() + logging.getLogger('matplotlib.font_manager').disabled = True + logging.info("Start:\t monoview benchmark") + traceback_outputs = {} + for arguments in self.argument_dictionaries["monoview"]: + try: + X = self.dataset_var.get_v(arguments["view_index"]) + Y = self.dataset_var.get_labels() + self.results_monoview += [ + exec_monoview(self.directory, X, Y, self.name, self.labels_names, + self.classification_indices, self.k_folds, + self.nb_cores, self.args["file_type"], self.args["pathf"], self.random_state, + hyper_param_search=self.hyper_param_search, + metrics=self.metrics, feature_ids=self.dataset_var.feature_ids[arguments["view_index"]], + **self.arguments)] + except BaseException: + if self.track_tracebacks: + traceback_outputs[ + arguments["classifier_name"] + "-" + arguments[ + "view_name"]] = traceback.format_exc() + else: + raise + logging.info("Done:\t monoview benchmark") + + logging.info("Start:\t multiview benchmark") + self.results_multiview = [] + for arguments in self.argument_dictionaries["multiview"]: + try: + self.results_multiview += [ + exec_multiview(self.directory, self.dataset_var, self.name, + self.classification_indices, + self.k_folds, self.nb_cores, self.args["file_type"], + self.pathf, self.labels_dictionary, self.random_state, + self.labels, + hps_method=self.hyper_param_search, + metrics=self.metrics, n_iter=self.hps_iter, + **self.arguments)] + except BaseException: + if self.track_tracebacks: + traceback_outputs[ + arguments["classifier_name"]] = traceback.format_exc() + else: + raise + logging.info("Done:\t multiview benchmark") + + return [flag, results_monoview + results_multiview, traceback_outputs] + + def exec_benchmark(self, ): # pragma: no cover + r"""Used to execute the needed benchmark(s) on multicore or mono-core functions. + + Parameters + ---------- + nb_cores : int + Number of threads that the benchmarks can use. + stats_iter : int + Number of statistical iterations that have to be done. + benchmark_arguments_dictionaries : list of dictionaries + All the needed arguments for the benchmarks. + classification_indices : list of lists of numpy.ndarray + For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and + the ones of the testing set. + directories : list of strings + List of the paths to the result directories for each statistical iteration. + directory : string + Path to the main results directory. + multi_class_labels : ist of lists of numpy.ndarray + For each label couple, for each statistical iteration a triplet of numpy.ndarrays is stored with the + indices for the biclass training set, the ones for the biclass testing set and the ones for the + multiclass testing set. + metrics : list of lists + metrics that will be used to evaluate the algorithms performance. + labels_dictionary : dictionary + Dictionary mapping labels indices to labels names. + nb_labels : int + Total number of different labels in the dataset. 
+ dataset_var : HDF5 dataset file + The full dataset that wil be used by the benchmark. + classifiers_names : list of strings + List of the benchmarks's monoview classifiers names. + rest_of_the_args : + Just used for testing purposes + + + Returns + ------- + results : list of lists + The results of the benchmark. + """ + logging.info("Start:\t Executing all the needed benchmarks") + self.results = [] + for arguments in self.benchmark_arguments_dictionaries: + benchmark_results = self.exec_one_benchmark_mono_core() + analyze_iterations([benchmark_results], + benchmark_arguments_dictionaries, stats_iter, + metrics, sample_ids=dataset_var.sample_ids, + labels=dataset_var.get_labels(), + feature_ids=dataset_var.feature_ids, + view_names=dataset_var.view_names) + results += [benchmark_results] + logging.info("Done:\t Executing all the needed benchmarks") + + # Do everything with flagging + logging.info("Start:\t Analyzing predictions") + results_mean_stds = analyze(results, stats_iter, + benchmark_arguments_dictionaries, + metrics, + directory, + dataset_var.sample_ids, + dataset_var.get_labels(),dataset_var.feature_ids, + dataset_var.view_names) + logging.info("Done:\t Analyzing predictions") + return results_mean_stds - directory = execution.init_log_file(dataset_name, args["views"], - args["file_type"], - args["log"], args["debug"], - args["label"], - args["res_dir"], - args) - - random_state = execution.init_random_state(args["random_state"], - directory) - stats_iter_random_states = execution.init_stats_iter_random_states( - stats_iter, - random_state) - - get_database = execution.get_database_function(dataset_name, - args["file_type"]) - - dataset_var, labels_dictionary, datasetname = get_database( - args["views"], - args["pathf"], dataset_name, - args["nb_class"], - args["classes"], - random_state, - args["full"], - ) - args["name"] = datasetname - splits = execution.gen_splits(dataset_var.get_labels(), - args["split"], - stats_iter_random_states) - - # multiclass_labels, labels_combinations, indices_multiclass = multiclass.gen_multiclass_labels( - # dataset_var.get_labels(), multiclass_method, splits) - - k_folds = execution.gen_k_folds(stats_iter, args["nb_folds"], - stats_iter_random_states) - - # dataset_files = dataset.init_multiple_datasets(args["pathf"], - # args["name"], - # nb_cores) - - views, views_indices, all_views = execution.init_views(dataset_var, - args[ - "views"]) - views_dictionary = dataset_var.get_view_dict() - nb_views = len(views) - nb_class = dataset_var.get_nb_class() - - metrics = args["metrics"] - if metrics == "all": - metrics_names = [name for _, name, isPackage - in pkgutil.iter_modules( - [os.path.join(os.path.dirname( - os.path.dirname(os.path.realpath(__file__))), - 'metrics')]) if - not isPackage and name not in ["framework", - "log_loss", - "matthews_corrcoef", - "roc_auc_score"]] - metrics = dict((metric_name, {}) - for metric_name in metrics_names) - metrics = arange_metrics(metrics, args["metric_princ"]) - benchmark = init_benchmark(cl_type, monoview_algos, multiview_algos, ) - init_kwargs = init_kwargs_func(args, benchmark) - data_base_time = time.time() - start - argument_dictionaries = init_argument_dictionaries( - benchmark, views_dictionary, - nb_class, init_kwargs, hps_method, hps_kwargs) - # argument_dictionaries = initMonoviewExps(benchmark, viewsDictionary, - # NB_CLASS, initKWARGS) - directories = execution.gen_direcorties_names(directory, stats_iter) - benchmark_argument_dictionaries = execution.gen_argument_dictionaries( - 
labels_dictionary, directories, - splits, - hps_method, args, k_folds, - stats_iter_random_states, metrics, - argument_dictionaries, benchmark, - views, views_indices) - exec_benchmark(nb_cores, stats_iter, benchmark_argument_dictionaries, - directory, metrics, dataset_var, args["track_tracebacks"]) diff --git a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py b/summit/multiview_platform/monoview_classifiers/__scm_bagging_mincq.py similarity index 98% rename from summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py rename to summit/multiview_platform/monoview_classifiers/__scm_bagging_mincq.py index 32892dd1..92c7d950 100644 --- a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py +++ b/summit/multiview_platform/monoview_classifiers/__scm_bagging_mincq.py @@ -82,7 +82,7 @@ class ScmBaggingMinCq(RandomScmClassifier, BaseMonoviewClassifier): random_state=None): if isinstance(p_options, float): p_options = [p_options] - ScmBaggingClassifier.__init__(self, n_estimators=n_estimators, + RandomScmClassifier.__init__(self, n_estimators=n_estimators, max_samples=max_samples, max_features=max_features, max_rules=max_rules, diff --git a/summit/multiview_platform/multiview/exec_multiview.py b/summit/multiview_platform/multiview/exec_multiview.py index 96b2d202..ca2e7aed 100644 --- a/summit/multiview_platform/multiview/exec_multiview.py +++ b/summit/multiview_platform/multiview/exec_multiview.py @@ -292,10 +292,11 @@ def exec_multiview(directory, dataset_var, name, classification_indices, logging.info("Done:\t Optimizing hyperparameters") logging.info("Start:\t Fitting classifier") fit_beg = time.monotonic() + classifier.fit(dataset_var, dataset_var.get_labels(), train_indices=learning_indices, view_indices=views_indices) - print("pou") + fit_duration = time.monotonic() - fit_beg logging.info("Done:\t Fitting classifier") diff --git a/summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py b/summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py new file mode 100644 index 00000000..c0c3c045 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/bagged_spkm_pw.py @@ -0,0 +1,49 @@ +import numpy as np +from sklearn.preprocessing import LabelBinarizer + +from imblearn.under_sampling import RandomUnderSampler + +from spkm.spkm_wrapper import pairwiseSPKMlikeSklearn +from spkm.kernels_and_gradients import RBFKernel, PolyKernel + +from ..multiview.multiview_utils import BaseMultiviewClassifier +from ..utils.hyper_parameter_search import CustomRandint +from ..utils.dataset import get_samples_views_indices +from ..multiview_classifiers.spkm_pw import PWSPKM + +classifier_class_name = "SampledPWSPKM" + +class SampledPWSPKM(PWSPKM,): + + def __init__(self, random_state=42, n_u=2, kernel1=RBFKernel(0.5), + kernel2=RBFKernel(0.5), spkmregP=1, spkminit="randn", + nspkminits=10, preprocessinglist=[0,1,2], **kwargs): + PWSPKM.__init__(self, random_state=random_state, + n_u=n_u, + kernel1=kernel1, + kernel2=kernel2, + spkmregP=spkmregP, + spkminit=spkminit, + nspkminits=nspkminits, + preprocessinglist=preprocessinglist) + self.rus = RandomUnderSampler(random_state=random_state) + + def fit(self, X, y, train_indices=None, view_indices=None): + + self.lb = LabelBinarizer(pos_label=1, neg_label=-1) + y = self.lb.fit_transform(y) + train_indices, view_indices = get_samples_views_indices(X, + train_indices, + view_indices) + if len(view_indices)>2: + self.more_than_two_views = True + self.label_set = np.unique(y) + return self + 
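+        # The remainder of fit() balances the classes before training: the
+        # RandomUnderSampler is fitted on the first view only, and its
+        # sample_indices_ are then reused so that every view is subsampled
+        # identically and stays aligned with the resampled labels.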
self.used_views = view_indices + self.view_names = [X.get_view_name(view_index) + for view_index in view_indices] + view_list = [X.get_v(view_index)[train_indices, :] + for view_index in view_indices] + self.rus.fit_resample(view_list[0], y[train_indices]) + resampled_list = [X[self.rus.sample_indices_, :] for X in view_list] + return pairwiseSPKMlikeSklearn.fit(self, resampled_list, y[train_indices,0][self.rus.sample_indices_],) diff --git a/summit/multiview_platform/multiview_classifiers/early_fusion_random_scm.py b/summit/multiview_platform/multiview_classifiers/early_fusion_random_scm.py new file mode 100644 index 00000000..c47293ef --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/early_fusion_random_scm.py @@ -0,0 +1,26 @@ +from .additions.early_fusion_from_monoview import BaseEarlyFusion +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + +classifier_class_name = "EarlyFusionRSCM" + + +class EarlyFusionRSCM(BaseEarlyFusion): + + def __init__(self, n_estimators=100, max_samples=0.5, max_features=0.5, + max_rules=10, p_options=[1.0], model_type="conjunction", + random_state=None, **kwargs): + if isinstance(p_options, float): + p_options = [p_options] + BaseEarlyFusion.__init__(self, random_state=random_state, + monoview_classifier="random_scm", + n_estimators=n_estimators, + max_samples=max_samples, + max_features=max_features, + max_rules=max_rules, + p_options=p_options, + model_type=model_type, **kwargs) + self.param_names = ["n_estimators", "max_rules", "max_samples", "max_features", "model_type", "p_options", "random_state"] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=300), CustomRandint(low=1, high=20), + CustomUniform(), CustomUniform(), ["conjunction", "disjunction"], CustomUniform(), [random_state]] + self.weird_strings = {} \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/spkm_pw.py b/summit/multiview_platform/multiview_classifiers/spkm_pw.py index 4c5a51ad..f0c80553 100644 --- a/summit/multiview_platform/multiview_classifiers/spkm_pw.py +++ b/summit/multiview_platform/multiview_classifiers/spkm_pw.py @@ -28,9 +28,9 @@ class PWSPKM(BaseMultiviewClassifier, pairwiseSPKMlikeSklearn): self.param_names = ["n_u", "kernel1", "kernel2", "spkmregP", "spkminit", "nspkminits", "preprocessinglist", "random_state"] - self.distribs = [[2], [PolyKernel({"d":3, "r":1})], [PolyKernel({"d":3, "r":1})], CustomRandint(1,15), - ["data", "randn"], CustomRandint(1,30), - [[], [0], [0,1], [0,1,2]], [random_state],] + self.distribs = [[2], [PolyKernel({"d":3, "r":1}), RBFKernel(0.5)], [PolyKernel({"d":3, "r":1}), RBFKernel(0.5)], CustomRandint(-2,2, multiplier='e'), + ["data"], [10], + [[], [0], [1], [2], [0,1], [0,1,2], [0,2], [1,2]], [random_state],] self.more_than_two_views = False self.random_state = random_state @@ -50,7 +50,6 @@ class PWSPKM(BaseMultiviewClassifier, pairwiseSPKMlikeSklearn): for view_index in view_indices] view_list = [X.get_v(view_index)[train_indices, :] for view_index in view_indices] - return pairwiseSPKMlikeSklearn.fit(self, view_list, y[train_indices,0],) def predict(self, X, sample_indices=None, view_indices=None): @@ -59,18 +58,15 @@ class PWSPKM(BaseMultiviewClassifier, pairwiseSPKMlikeSklearn): sample_indices, view_indices = get_samples_views_indices(X, sample_indices, view_indices) - view_list = [X.get_v(view_index)[sample_indices, :] - for view_index in view_indices] self._check_views(view_indices) view_list = [X.get_v(view_index)[sample_indices, :] for 
view_index in view_indices] - print(self.lb.inverse_transform(np.sign(pairwiseSPKMlikeSklearn.predict(self, view_list)))) return self.lb.inverse_transform(np.sign(pairwiseSPKMlikeSklearn.predict(self, view_list))) def get_interpretation(self, directory, base_file_name, labels, multiclass=False): u, v = self.feature_interpretability() - importances_sum = np.sum(u+v) + importances_sum = np.sum(np.sum(u)+np.sum(v)) self.feature_importances_ = [u/importances_sum, v/importances_sum] return "" diff --git a/summit/multiview_platform/utils/execution.py b/summit/multiview_platform/utils/execution.py index 4c2e94b7..75fa04c3 100644 --- a/summit/multiview_platform/utils/execution.py +++ b/summit/multiview_platform/utils/execution.py @@ -10,421 +10,6 @@ import sklearn from . import get_multiview_db as DB from ..utils.configuration import save_config +class BaseExec: -def parse_the_args(arguments): - """Used to parse the args entered by the user""" - - parser = argparse.ArgumentParser( - description='This file is used to benchmark the scores fo multiple ' - 'classification algorithm on multiview data.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@') - - groupStandard = parser.add_argument_group('Standard arguments') - groupStandard.add_argument('--config_path', metavar='STRING', - action='store', - help='Path to the hdf5 dataset or database ' - 'folder (default: %(default)s)', - default='../config_files/config.yml') - args = parser.parse_args(arguments) - return args - - -def init_random_state(random_state_arg, directory): - r""" - Used to init a random state. - If no random state is specified, it will generate a 'random' seed. - If the `randomSateArg` is a string containing only numbers, it will be converted in - an int to generate a seed. - If the `randomSateArg` is a string with letters, it must be a path to a pickled random - state file that will be loaded. - The function will also pickle the new random state in a file tobe able to retrieve it later. - Tested - - - Parameters - ---------- - random_state_arg : None or string - See function description. - directory : string - Path to the results directory. - - Returns - ------- - random_state : numpy.random.RandomState object - This random state will be used all along the benchmark . - """ - - if random_state_arg is None: - random_state = np.random.RandomState(random_state_arg) - else: - try: - seed = int(random_state_arg) - random_state = np.random.RandomState(seed) - except ValueError: - file_name = random_state_arg - with open(file_name, 'rb') as handle: - random_state = pickle.load(handle) - with open(os.path.join(directory, "random_state.pickle"), "wb") as handle: - pickle.dump(random_state, handle) - return random_state - - -def init_stats_iter_random_states(stats_iter, random_state): - r""" - Used to initialize multiple random states if needed because of multiple statistical iteration of the same benchmark - - Parameters - ---------- - stats_iter : int - Number of statistical iterations of the same benchmark done (with a different random state). - random_state : numpy.random.RandomState object - The random state of the whole experimentation, that will be used to generate the ones for each - statistical iteration. - - Returns - ------- - stats_iter_random_states : list of numpy.random.RandomState objects - Multiple random states, one for each sattistical iteration of the same benchmark. 
- """ - if stats_iter > 1: - stats_iter_random_states = [ - np.random.RandomState(random_state.randint(5000)) for _ in - range(stats_iter)] - else: - stats_iter_random_states = [random_state] - return stats_iter_random_states - - -def get_database_function(name, type_var): - r"""Used to get the right database extraction function according to the type of database and it's name - - Parameters - ---------- - name : string - Name of the database. - type_var : string - type of dataset hdf5 or csv - - Returns - ------- - getDatabase : function - The function that will be used to extract the database - """ - if name not in ["fake", "plausible"]: - get_database = getattr(DB, "get_classic_db_" + type_var[1:]) - else: - get_database = getattr(DB, "get_" + name + "_db_" + type_var[1:]) - return get_database - - -def init_log_file(name, views, cl_type, log, debug, label, - result_directory, args): - r"""Used to init the directory where the preds will be stored and the log file. - - First this function will check if the result directory already exists (only one per minute is allowed). - - If the the result directory name is available, it is created, and the logfile is initiated. - - Parameters - ---------- - name : string - Name of the database. - views : list of strings - List of the view names that will be used in the benchmark. - cl_type : list of strings - Type of benchmark that will be made . - log : bool - Whether to show the log file in console or hide it. - debug : bool - for debug option - label : str for label - - result_directory : str name of the result directory - - add_noise : bool for add noise - - noise_std : level of std noise - - Returns - ------- - results_directory : string - Reference to the main results directory for the benchmark. - """ - if views is None: - views = [] - # result_directory = os.path.join(os.path.dirname( - # os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), - # result_directory) - if debug: - result_directory = os.path.join(result_directory, name, - "debug_started_" + time.strftime( - "%Y_%m_%d-%H_%M_%S") + "_" + label) - else: - result_directory = os.path.join(result_directory, name, - "started_" + time.strftime( - "%Y_%m_%d-%H_%M") + "_" + label) - log_file_name = time.strftime("%Y_%m_%d-%H_%M") + "-" + ''.join( - cl_type) + "-" + "_".join(views) + "-" + name + "-LOG.log" - if os.path.exists(result_directory): # pragma: no cover - raise NameError("The result dir already exists, wait 1 min and retry") - log_file_path = os.path.join(result_directory, log_file_name) - os.makedirs(os.path.dirname(log_file_path)) - logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', - filename=log_file_path, level=logging.INFO, - filemode='w') - if log: - logging.getLogger().addHandler(logging.StreamHandler()) - save_config(result_directory, args) - return result_directory - - -def gen_splits(labels, split_ratio, stats_iter_random_states): - r"""Used to _gen the train/test splits using one or multiple random states. - - Parameters - ---------- - labels : numpy.ndarray - Name of the database. - split_ratio : float - The ratio of samples between train and test set. - stats_iter_random_states : list of numpy.random.RandomState - The random states for each statistical iteration. - - Returns - ------- - splits : list of lists of numpy.ndarray - For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and - the ones of the testing set. 
- """ - indices = np.arange(len(labels)) - splits = [] - for random_state in stats_iter_random_states: - folds_obj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, - random_state=random_state, - test_size=split_ratio) - folds = folds_obj.split(indices, labels) - for fold in folds: - train_fold, test_fold = fold - train_indices = indices[train_fold] - test_indices = indices[test_fold] - splits.append([train_indices, test_indices]) - - return splits - - -def gen_k_folds(stats_iter, nb_folds, stats_iter_random_states): - r"""Used to generate folds indices for cross validation for each statistical iteration. - - Parameters - ---------- - stats_iter : integer - Number of statistical iterations of the benchmark. - nb_folds : integer - The number of cross-validation folds for the benchmark. - stats_iter_random_states : list of numpy.random.RandomState - The random states for each statistical iteration. - - Returns - ------- - folds_list : list of list of sklearn.model_selection.StratifiedKFold - For each statistical iteration a Kfold stratified (keeping the ratio between classes in each fold). - """ - if stats_iter > 1: - folds_list = [] - for random_state in stats_iter_random_states: - folds_list.append( - sklearn.model_selection.StratifiedKFold(n_splits=nb_folds, - random_state=random_state, - shuffle=True)) - else: - if isinstance(stats_iter_random_states, list): - stats_iter_random_states = stats_iter_random_states[0] - folds_list = [sklearn.model_selection.StratifiedKFold(n_splits=nb_folds, - random_state=stats_iter_random_states, - shuffle=True)] - return folds_list - - -def init_views(dataset_var, arg_views): - r"""Used to return the views names that will be used by the - benchmark, their indices and all the views names. - - Parameters - ---------- - dataset_var : HDF5 dataset file - The full dataset that wil be used by the benchmark. - arg_views : list of strings - The views that will be used by the benchmark (arg). - - Returns - ------- - views : list of strings - Names of the views that will be used by the benchmark. - view_indices : list of ints - The list of the indices of the view that will be used in the benchmark (according to the dataset). - all_views : list of strings - Names of all the available views in the dataset. - """ - nb_view = dataset_var.nb_view - if arg_views is not None: - allowed_views = arg_views - all_views = [str(dataset_var.get_view_name(view_index)) - if not isinstance(dataset_var.get_view_name(view_index), bytes) - else dataset_var.get_view_name(view_index).decode("utf-8") - for view_index in range(nb_view)] - views = [] - views_indices = [] - for view_index in range(nb_view): - view_name = dataset_var.get_view_name(view_index) - if isinstance(view_name, bytes): - view_name = view_name.decode("utf-8") - if view_name in allowed_views: - views.append(view_name) - views_indices.append(view_index) - else: - views = [str(dataset_var.get_view_name(view_index)) - if not isinstance(dataset_var.get_view_name(view_index), bytes) - else dataset_var.get_view_name(view_index).decode("utf-8") - for view_index in range(nb_view)] - views_indices = range(nb_view) - all_views = views - return views, views_indices, all_views - - -def gen_direcorties_names(directory, stats_iter): - r"""Used to generate the different directories of each iteration if needed. - - Parameters - ---------- - directory : string - Path to the results directory. - statsIter : int - The number of statistical iterations. 
- - Returns - ------- - directories : list of strings - Paths to each statistical iterations result directory. - """ - if stats_iter > 1: - directories = [] - for i in range(stats_iter): - directories.append(os.path.join(directory, "iter_" + str(i + 1))) - else: - directories = [directory] - return directories - - -def find_dataset_names(path, type, names): - """This function goal is to browse the dataset directory and extrats all - the needed dataset names.""" - package_path = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - if os.path.isdir(path): - pass - elif os.path.isdir(os.path.join(package_path, path)): - path = os.path.join(package_path, path) - else: - raise ValueError("The provided pathf does not exist ({}) SuMMIT checks " - "the prefix from where you are running your script ({}) " - "and the summit package prefix ({}). " - "You may want to try with an absolute path in the " - "config file".format(path, os.getcwd(), package_path)) - available_file_names = [file_name.strip().split(".")[0] - for file_name in - os.listdir(path) - if file_name.endswith(type)] - if names == ["all"]: - return path, available_file_names - elif isinstance(names, str): - return path, [used_name for used_name in available_file_names if - names == used_name] - elif len(names) > 1: - selected_names = [used_name for used_name in available_file_names if - used_name in names] - if not selected_names: - raise ValueError( - "None of the provided dataset names are available. Available datasets are {}".format( - available_file_names)) - return path, [used_name for used_name in available_file_names if - used_name in names] - elif names[0] in available_file_names: - return path, names - else: - raise ValueError( - "The asked dataset ({}) is not available in {}. \n The available ones are {}".format( - names[0], path, available_file_names)) - - -def gen_argument_dictionaries(labels_dictionary, directories, - splits, - hyper_param_search, args, k_folds, - stats_iter_random_states, metrics, - argument_dictionaries, - benchmark, views, - views_indices, ): # pragma: no cover - r"""Used to generate a dictionary for each benchmark. - - One for each label combination (if multiclass), for each statistical iteration, generates an dictionary with - all necessary information to perform the benchmark - - Parameters - ---------- - labels_dictionary : dictionary - Dictionary mapping labels indices to labels names. - directories : list of strings - List of the paths to the result directories for each statistical iteration. - multiclass_labels : list of lists of numpy.ndarray - For each label couple, for each statistical iteration a triplet of numpy.ndarrays is stored with the - indices for the biclass training set, the ones for the biclass testing set and the ones for the - multiclass testing set. - labels_combinations : list of lists of numpy.ndarray - Each original couple of different labels. - indices_multiclass : list of lists of numpy.ndarray - For each combination, contains a biclass labels numpy.ndarray with the 0/1 labels of combination. - hyper_param_search : string - Type of hyper parameter optimization method - args : parsed args objects - All the args passed by the user. - k_folds : list of list of sklearn.model_selection.StratifiedKFold - For each statistical iteration a Kfold stratified (keeping the ratio between classes in each fold). 
- stats_iter_random_states : list of numpy.random.RandomState objects - Multiple random states, one for each sattistical iteration of the same benchmark. - metrics : list of lists - metrics that will be used to evaluate the algorithms performance. - argument_dictionaries : dictionary - Dictionary resuming all the specific arguments for the benchmark, oe dictionary for each classifier. - benchmark : dictionary - Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. - nb_views : int - THe number of views used by the benchmark. - views : list of strings - List of the names of the used views. - views_indices : list of ints - List of indices (according to the dataset) of the used views. - - Returns - ------- - benchmarkArgumentDictionaries : list of dicts - All the needed arguments for the benchmarks. - - """ - benchmark_argument_dictionaries = [] - for iter_index, iterRandomState in enumerate(stats_iter_random_states): - benchmark_argument_dictionary = { - "labels_dictionary": labels_dictionary, - "directory": directories[iter_index], - "classification_indices": splits[iter_index], - "args": args, - "k_folds": k_folds[iter_index], - "random_state": iterRandomState, - "hyper_param_search": hyper_param_search, - "metrics": metrics, - "argument_dictionaries": argument_dictionaries, - "benchmark": benchmark, - "views": views, - "views_indices": views_indices, - "flag": iter_index} - benchmark_argument_dictionaries.append(benchmark_argument_dictionary) - return benchmark_argument_dictionaries + pass -- GitLab
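
The SPKM-based classifiers touched above (PWSPKM.predict and the new SampledPWSPKM.fit) route the labels through a LabelBinarizer(pos_label=1, neg_label=-1), so that the sign of the raw pairwise-SPKM output can be mapped back to the original label names. A minimal, standalone sketch of that round trip, using synthetic labels and hypothetical decision values (independent of SuMMIT):

import numpy as np
from sklearn.preprocessing import LabelBinarizer

# Binary labels are encoded as -1/+1 so that np.sign() of a decision value
# is enough to recover the predicted class.
lb = LabelBinarizer(pos_label=1, neg_label=-1)
y = np.array(["healthy", "sick", "sick", "healthy"])   # made-up label names
y_pm1 = lb.fit_transform(y)                            # column vector of -1 / +1

scores = np.array([[0.7], [-1.2], [-0.3], [2.1]])      # hypothetical raw SPKM outputs
predictions = lb.inverse_transform(np.sign(scores))
print(predictions)                                     # ['sick' 'healthy' 'healthy' 'sick']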