From fb8b2fe4228cbd3dcb8e2e34156fa6987bd60aa4 Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr>
Date: Fri, 8 Nov 2019 08:21:22 -0500
Subject: [PATCH] Some doc

---
 config_files/config.yml |  38 +++-
 .../exec_classif.py     | 212 ++++++++++++------
 2 files changed, 182 insertions(+), 68 deletions(-)

diff --git a/config_files/config.yml b/config_files/config.yml
index ba74aca0..b2df8fac 100644
--- a/config_files/config.yml
+++ b/config_files/config.yml
@@ -1,41 +1,75 @@
 # The base configuration of the benchmark
 Base :
-  log: true
+  # Enable logging
+  log: True
+  # The name of each dataset in the directory on which the benchmark should be run
   name: ["plausible"]
+  # A label for the result directory
   label: "_"
+  # The type of dataset, currently supported: ".hdf5" and ".csv"
   type: ".hdf5"
+  # The views to use in the benchmark, an empty value will result in using all the views
   views:
+  # The path to the directory where the datasets are stored
   pathf: "../data/"
+  # The niceness of the processes, useful to lower their priority
   nice: 0
+  # The random state of the benchmark, useful for reproducibility
   random_state: 42
+  # The number of parallel computing threads
   nb_cores: 1
+  # Used to run the benchmark on the full dataset
   full: False
-  debug: True
+  # Used to be able to run more than one benchmark per minute
+  debug: False
+  # To add noise to the data, will add Gaussian noise with standard deviation noise_std
   add_noise: False
   noise_std: 0.0
+  # The directory in which the results will be stored
   res_dir: "../results/"

 # All the classification-related configuration options
 Classification:
+  # If the dataset is multiclass, will use this multiclass-to-biclass method
   multiclass_method: "oneVersusOne"
+  # The ratio of the number of test examples to the number of train examples
   split: 0.8
+  # The number of folds in the cross-validation process when hyper-parameter optimization is performed
   nb_folds: 2
+  # The number of classes to select in the dataset
   nb_class: 2
+  # The names of the classes to select in the dataset
   classes:
+  # The type of algorithms to run during the benchmark (monoview and/or multiview)
   type: ["monoview","multiview"]
+  # The names of the monoview algorithms to run, ["all"] to run all the available classifiers
   algos_monoview: ["all"]
+  # The names of the multiview algorithms to run, ["all"] to run all the available classifiers
   algos_multiview: ["all"]
+  # The number of times the benchmark is repeated with different train/test
+  # splits, to have more statistically significant results
   stats_iter: 2
+  # The metrics that will be used in the result analysis
   metrics: ["accuracy_score", "f1_score"]
+  # The metric that will be used in the hyper-parameter optimization process
   metric_princ: "f1_score"
+  # The type of hyper-parameter optimization method
   hps_type: "randomized_search"
+  # The number of iterations in the hyper-parameter optimization process
   hps_iter: 2
+
+# The following arguments are classifier-specific, and are documented in each
+# of the corresponding modules.
+
+# In order to run multiple sets of parameters, use multiple values in the
+# following lists, and set hps_type to None.
+
 #####################################
 # The Monoview Classifier arguments #
 #####################################
+
 random_forest:
   n_estimators: [25]
   max_depth: [3]
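# A minimal, illustrative sketch (not part of the patch) of the behaviour
# documented above: when a classifier section lists several values per
# parameter and hps_type is set to None, each combination becomes one
# configuration to run. The YAML fragment and variable names below are
# hypothetical; in the platform the expansion is done by
# gen_multiple_kwargs_combinations in exec_classif.py (see the diff below).
import itertools

import yaml

fragment = yaml.safe_load("""
random_forest:
  n_estimators: [25, 50]
  max_depth: [3, 6]
""")

params = fragment["random_forest"]
# Wrap scalar values in a list so every parameter can enter the product
listed = {name: val if isinstance(val, list) else [val]
          for name, val in params.items()}
combinations = [dict(zip(listed, values))
                for values in itertools.product(*listed.values())]
print(combinations)  # 4 configurations: (25, 3), (25, 6), (50, 3), (50, 6)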
diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py
index aa342b54..ce66f705 100644
--- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py
+++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py
@@ -57,15 +57,13 @@ def init_benchmark(cl_type, monoview_algos, multiview_algos, args):
         Dictionary summarizing which mono- and multiview algorithms will be used in the benchmark.
     """
     benchmark = {"monoview": {}, "multiview": {}}
-    all_multiview_packages = [name for _, name, isPackage
-                              in pkgutil.iter_modules(
-        ['./mono_multi_view_classifiers/multiview_classifiers/']) if isPackage]
+
     if "monoview" in cl_type:
         if monoview_algos == ['all']:
             benchmark["monoview"] = [name for _, name, isPackage in
                                      pkgutil.iter_modules([
-                                         "./mono_multi_view_classifiers/monoview_classifiers"])
+                                     "./mono_multi_view_classifiers/monoview_classifiers"])
                                      if not isPackage]

         else:
@@ -82,34 +80,6 @@ def init_benchmark(cl_type, monoview_algos, multiview_algos, args):
     return benchmark


-# def gen_views_dictionnary(dataset_var, views):
-#     r"""Used to generate a dictionary mapping a view name (key) to it's index in the dataset (value).
-#
-#     Parameters
-#     ----------
-#     dataset_var : `h5py` dataset file
-#         The full dataset on which the benchmark will be done
-#     views : List of strings
-#         Names of the selected views on which the banchmark will be done
-#
-#     Returns
-#     -------
-#     viewDictionary : Dictionary
-#         Dictionary mapping the view names totheir indexin the full dataset.
-#     """
-#     datasets_names = dataset_var.get_view_dict().keys()
-#     views_dictionary = {}
-#     for dataset_name in datasets_names:
-#         if dataset_name[:4] == "View":
-#             view_name = dataset_var.get(dataset_name).attrs["name"]
-#             if type(view_name) == bytes:
-#                 view_name = view_name.decode("utf-8")
-#             if view_name in views:
-#                 views_dictionary[view_name] = int(dataset_name[4:])
-#
-#     return views_dictionary
-
-
 def init_argument_dictionaries(benchmark, views_dictionary, nb_class,
                                init_kwargs):
     argument_dictionaries = {"monoview": [], "multiview": []}
@@ -263,6 +233,17 @@ def get_path_dict(multiview_classifier_args):


 def is_dict_in(dictionary):
+    """
+    Returns True if any of the dictionary's values is itself a dictionary.
+
+    Parameters
+    ----------
+    dictionary
+
+    Returns
+    -------
+
+    """
     paths = []
     for key, value in dictionary.items():
         if isinstance(value, dict):
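# A behavioural sketch (not the platform's implementation) of the check that
# is_dict_in documents above: it is truthy when at least one value of the given
# dictionary is itself a dictionary, i.e. a nested classifier configuration.
# The example dictionaries and keys are hypothetical.
def has_nested_dict(dictionary):
    return any(isinstance(value, dict) for value in dictionary.values())

print(has_nested_dict({"n_estimators": [25], "max_depth": [3]}))         # False
print(has_nested_dict({"monoview_classifier": {"n_estimators": [25]}}))  # True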
@@ -271,6 +252,24 @@ def gen_multiple_kwargs_combinations(cl_kwrags):
+    """
+    Generates all the possible combinations of the given arguments.
+
+    Parameters
+    ----------
+    cl_kwrags : dict
+        The arguments, at least one of which has multiple values.
+
+    Returns
+    -------
+    kwargs_combination : list
+        The list of all the combinations of arguments.
+
+    reduced_kwargs_combination : list
+        The reduced names and values of the arguments, used in the naming
+        process of the different classifiers.
+
+    """
     values = list(cl_kwrags.values())
     listed_values = [[_] if type(_) is not list else _ for _ in values]
     values_cartesian_prod = [_ for _ in itertools.product(*listed_values)]
@@ -292,6 +291,39 @@ def gen_multiple_args_dictionnaries(nb_class, kwargs_init, classifier,
                                     view_name=None, view_index=None,
                                     views_dictionary=None,
                                     framework="monoview"):
+    """
+    Used when multiple arguments are asked for in the config file.
+    Will combine the arguments to explore all the possibilities.
+
+    Parameters
+    ----------
+    nb_class : int
+        The number of classes in the dataset
+
+    kwargs_init : dict
+        The arguments given in the config file
+
+    classifier : str
+        The name of the classifier for which multiple arguments have been asked
+
+    view_name : str
+        The name of the view in consideration
+
+    view_index : int
+        The index of the view in consideration
+
+    views_dictionary : dict
+        The dictionary of all the views' indices and names
+
+    framework : str
+        Either "monoview" or "multiview"
+
+    Returns
+    -------
+    args_dictionaries : list
+        The list of all the possible combinations of the asked arguments
+
+    """
     if framework=="multiview":
         classifier_config = get_path_dict(kwargs_init[classifier])
     else:
@@ -322,12 +354,12 @@ def init_kwargs(args, classifiers_names, framework="monoview"):
     ----------
     args : parsed args objects
         All the args passed by the user.
-    classifiers-names : list of strings
+    classifiers_names : list of strings
         List of the benchmark's monoview classifiers names.

     Returns
     -------
-    monoviewKWARGS : Dictionary of dictionaries
+    kwargs : Dictionary
        Dictionary summarizing all the specific arguments for the benchmark, one dictionary for each classifier.

        For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`"""
@@ -351,7 +383,25 @@ def init_kwargs(args, classifiers_names, framework="monoview"):


 def init_kwargs_func(args, benchmark):
-    monoview_kwargs = init_kwargs(args, benchmark["monoview"])
+    """
+    Dispatches the kwargs initialization to monoview and multiview and creates
+    the kwargs variable.
+
+    Parameters
+    ----------
+    args : parsed args objects
+        All the args passed by the user.
+
+    benchmark : dict
+        The names of the mono- and multiview classifiers to run in the benchmark
+
+    Returns
+    -------
+    kwargs : dict
+        The arguments for each mono- and multiview algorithm
+    """
+    monoview_kwargs = init_kwargs(args, benchmark["monoview"],
+                                  framework="monoview")
     multiview_kwargs = init_kwargs(args, benchmark["multiview"], framework="multiview")
     kwargs = {"monoview":monoview_kwargs, "multiview":multiview_kwargs}
     return kwargs
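# An illustrative sketch (not part of the patch) of the shape of the kwargs
# variable returned by init_kwargs_func, following the docstrings above.
# "adaboost" and "random_forest" appear elsewhere in the document; the
# parameter values here are hypothetical placeholders.
kwargs = {
    "monoview": {
        "adaboost": {"n_estimators": 50, "base_estimator": None},
        "random_forest": {"n_estimators": 25, "max_depth": 3},
    },
    "multiview": {
        # one entry per multiview classifier, holding its specific arguments
    },
}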
benchmark["multiview"]: +# mutliview_module = getattr(multiview_classifiers, +# multiview_algo_name) +# +# multiview_arguments += mutliview_module.getArgs(args, benchmark, +# views, views_indices, +# random_state, +# directory, +# results_monoview, +# classification_indices) +# argument_dictionaries["multiview"] = multiview_arguments +# logging.debug("Start:\t Initializing multiview classifiers arguments") +# return argument_dictionaries def arange_metrics(metrics, metric_princ): """Used to get the metrics list in the right order so that - the first one is the principal metric specified in args""" + the first one is the principal metric specified in args + + Parameters + ---------- + metrics : list of lists + The metrics that will be used in the benchmark + + metric_princ : str + The name of the metric that need to be used for the hyper-parameter + optimization process + + Returns + ------- + metrics : list of lists + The metrics list, but arranged so the first one is the principal one.""" if [metric_princ] in metrics: metric_index = metrics.index([metric_princ]) first_metric = metrics[0] @@ -410,6 +474,31 @@ def arange_metrics(metrics, metric_princ): def benchmark_init(directory, classification_indices, labels, labels_dictionary, k_folds): + """ + Initializes the benchmark, by saving the indices of the train + examples and the cross validation folds. + + Parameters + ---------- + directory : str + The benchmark's result directory + + classification_indices : numpy array + The indices of the examples, splitted for the train/test split + + labels : numpy array + The labels of the dataset + + labels_dictionary : dict + The dictionary with labels as keys and their names as values + + k_folds : sklearn.model_selection.Folds object + The folds for the cross validation process + + Returns + ------- + + """ logging.debug("Start:\t Benchmark initialization") if not os.path.exists(os.path.dirname(directory + "train_labels.csv")): try: @@ -448,8 +537,7 @@ def exec_one_benchmark(core_index=-1, labels_dictionary=None, directory=None, benchmark=None, views=None, views_indices=None, flag=None, labels=None, exec_monoview_multicore=exec_monoview_multicore, - exec_multiview_multicore=exec_multiview_multicore, - init_multiview_arguments=init_multiview_arguments): + exec_multiview_multicore=exec_multiview_multicore,): """Used to run a benchmark using one core. ExecMonoview_multicore, initMultiviewArguments and exec_multiview_multicore args are only used for tests""" @@ -469,14 +557,6 @@ def exec_one_benchmark(core_index=-1, labels_dictionary=None, directory=None, for argument in argument_dictionaries["Monoview"]] logging.debug("Done:\t monoview benchmark") - logging.debug("Start:\t multiview arguments initialization") - # argument_dictionaries = initMultiviewArguments(args, benchmark, views, - # views_indices, - # argument_dictionaries, - # random_state, directory, - # resultsMonoview, - # classification_indices) - logging.debug("Done:\t multiview arguments initialization") logging.debug("Start:\t multiview benchmark") results_multiview = [ -- GitLab