diff --git a/config_files/config.yml b/config_files/config.yml index ffa9ea2e6bb7f09ca423ddc3bad36b7eb5e300a8..bcd321ed418dff029bdeda2adde64812b4aa3369 100644 --- a/config_files/config.yml +++ b/config_files/config.yml @@ -1,75 +1,41 @@ # The base configuration of the benchmark Base : - # Enable logging - log: True - # The name of each dataset in the directory on which the benchmark should be run - name: ["plausible"] - # A label for the resul directory + log: true + name: ["Plausible"] label: "_" - # The type of dataset, currently supported ".hdf5", and ".csv" type: ".hdf5" - # The views to use in the banchmark, an empty value will result in using all the views views: - # The path to the directory where the datasets are stored pathf: "../data/" - # The niceness of the processes, useful to lower their priority nice: 0 - # The random state of the benchmark, useful for reproducibility random_state: 42 - # The number of parallel computing threads nb_cores: 1 - # Used to run the benchmark on the full dataset full: False - # Used to be able to run more than one benchmark per minute - debug: False - # To add noise to the data, will add gaussian noise with noise_std + debug: True add_noise: False noise_std: 0.0 - # The directory in which the results will be stored res_dir: "../results/" # All the classification-realted configuration options Classification: - # If the dataset is multiclass, will use this multiclass-to-biclass method multiclass_method: "oneVersusOne" - # The ratio number of test exmaples/number of train examples split: 0.8 - # The nubmer of folds in the cross validation process when hyper-paramter optimization is performed nb_folds: 2 - # The number of classes to select in the dataset nb_class: 2 - # The name of the classes to select in the dataset classes: - # The type of algorithms to run during the benchmark (monoview and/or multiview) type: ["monoview","multiview"] - # The name of the monoview algorithms to run, ["all"] to run all the available classifiers algos_monoview: ["all"] - # The names of the multiview algorithms to run, ["all"] to run all the available classifiers algos_multiview: ["all"] - # The number of times the benchamrk is repeated with different train/test - # split, to have more statistically significant results stats_iter: 2 - # The metrics that will be use din the result analysis metrics: ["accuracy_score", "f1_score"] - # The metric that will be used in the hyper-parameter optimization process metric_princ: "f1_score" - # The type of hyper-parameter optimization method hps_type: "randomized_search" - # The number of iteration in the hyper-parameter optimization process hps_iter: 2 -# The following arguments are classifier-specific, and are documented in each -# of the corresponding modules. - -# In order to run multiple sets of parameters, use multiple values in the -# following lists, and set hps_type to None. 
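# --- Illustrative sketch, outside the patch above: reading the Base /
# Classification options from the new config.yml with PyYAML. The platform
# ships its own configuration loader, so this only shows what the YAML
# resolves to; combining pathf + name + type into a dataset path is an
# assumption about how these fields are used, not the platform's code.
import yaml

with open("config_files/config.yml") as config_file:
    config = yaml.safe_load(config_file)

base, classification = config["Base"], config["Classification"]
dataset_path = base["pathf"] + base["name"][0] + base["type"]
print(dataset_path)                    # ../data/Plausible.hdf5
print(classification["stats_iter"])    # 2 train/test re-splits of the benchmark
print(classification["metric_princ"])  # f1_score, used for hyper-parameter optimization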
- ##################################### # The Monoview Classifier arguments # ##################################### - random_forest: n_estimators: [25] max_depth: [3] diff --git a/config_files/config_test.yml b/config_files/config_test.yml index d675a0606b3e8376651c8bb58fa280ca4b1ef1fc..e98dc442c64126c478dc9d38794a98f4449bef35 100644 --- a/config_files/config_test.yml +++ b/config_files/config_test.yml @@ -4,12 +4,12 @@ Base : name: ["control_vs_malade"] label: "_" type: ".hdf5" - views: ["300nm", "350nm"] + views: pathf: "../data/" nice: 0 random_state: 42 nb_cores: 1 - full: True + full: False debug: True add_noise: False noise_std: 0.0 @@ -22,10 +22,10 @@ Classification: nb_folds: 5 nb_class: 2 classes: - type: ["monoview",] - algos_monoview: ["decision_tree"] - algos_multiview: ["all"] - stats_iter: 4 + type: ["multiview"] + algos_monoview: ["all"] + algos_multiview: ["lp_norm_mkl",] + stats_iter: 5 metrics: ["accuracy_score", "f1_score"] metric_princ: "f1_score" hps_type: "randomized_search" diff --git a/data/Plausible.hdf5 b/data/Plausible.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..4f10a2ad8f524e8692771be0ab2f3f3709f37c16 Binary files /dev/null and b/data/Plausible.hdf5 differ diff --git a/data/Plausible0.hdf5 b/data/Plausible0.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..c7e0dd9d3a42182c5879b66d3ac225656171d2e0 Binary files /dev/null and b/data/Plausible0.hdf5 differ diff --git a/data/Plausible1.hdf5 b/data/Plausible1.hdf5 new file mode 100644 index 0000000000000000000000000000000000000000..c7e0dd9d3a42182c5879b66d3ac225656171d2e0 Binary files /dev/null and b/data/Plausible1.hdf5 differ diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py index 88ae2c8717d873d46a2ceb85f1144abcc0adb5c5..7dae037afa71cd77c1010cd54c1970107068346f 100644 --- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py +++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py @@ -57,13 +57,15 @@ def init_benchmark(cl_type, monoview_algos, multiview_algos, args): Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. """ benchmark = {"monoview": {}, "multiview": {}} - + all_multiview_packages = [name for _, name, isPackage + in pkgutil.iter_modules( + ['./mono_multi_view_classifiers/multiview_classifiers/']) if isPackage] if "monoview" in cl_type: if monoview_algos == ['all']: benchmark["monoview"] = [name for _, name, isPackage in pkgutil.iter_modules([ - "./mono_multi_view_classifiers/monoview_classifiers"]) + "./mono_multi_view_classifiers/monoview_classifiers"]) if not isPackage] else: @@ -80,6 +82,34 @@ def init_benchmark(cl_type, monoview_algos, multiview_algos, args): return benchmark +# def gen_views_dictionnary(dataset_var, views): +# r"""Used to generate a dictionary mapping a view name (key) to it's index in the dataset (value). +# +# Parameters +# ---------- +# dataset_var : `h5py` dataset file +# The full dataset on which the benchmark will be done +# views : List of strings +# Names of the selected views on which the banchmark will be done +# +# Returns +# ------- +# viewDictionary : Dictionary +# Dictionary mapping the view names totheir indexin the full dataset. 
+# """ +# datasets_names = dataset_var.get_view_dict().keys() +# views_dictionary = {} +# for dataset_name in datasets_names: +# if dataset_name[:4] == "View": +# view_name = dataset_var.get(dataset_name).attrs["name"] +# if type(view_name) == bytes: +# view_name = view_name.decode("utf-8") +# if view_name in views: +# views_dictionary[view_name] = int(dataset_name[4:]) +# +# return views_dictionary + + def init_argument_dictionaries(benchmark, views_dictionary, nb_class, init_kwargs): argument_dictionaries = {"monoview": [], "multiview": []} @@ -233,17 +263,6 @@ def get_path_dict(multiview_classifier_args): def is_dict_in(dictionary): - """ - Returns True if any of the dictionary value is a dictionary itself. - - Parameters - ---------- - dictionary - - Returns - ------- - - """ paths = [] for key, value in dictionary.items(): if isinstance(value, dict): @@ -252,24 +271,6 @@ def is_dict_in(dictionary): def gen_multiple_kwargs_combinations(cl_kwrags): - """ - Generates all the possible combination of the asked args - - Parameters - ---------- - cl_kwrags : dict - The arguments, with one at least having multiple values - - Returns - ------- - kwargs_combination : list - The list of all the combinations of arguments - - reduced_kwargs_combination : list - The reduced names and values of the arguments will be used in the naming - process of the different classifiers - - """ values = list(cl_kwrags.values()) listed_values = [[_] if type(_) is not list else _ for _ in values] values_cartesian_prod = [_ for _ in itertools.product(*listed_values)] @@ -291,39 +292,6 @@ def gen_multiple_args_dictionnaries(nb_class, kwargs_init, classifier, view_name=None, view_index=None, views_dictionary=None, framework="monoview"): - """ - Used in the case of mutliple arguments asked in the config file. - Will combine the arguments to explore all the possibilities. - - Parameters - ---------- - nb_class : int, - The number of classes in the dataset - - kwargs_init : dict - The arguments given in the config file - - classifier : str - The name of the classifier for which multiple arguments have been asked - - view_name : str - The name of the view in consideration. - - view_index : int - The index of the view in consideration - - views_dictionary : dict - The dictionary of all the views indices and their names - - framework : str - Either monoview or multiview - - Returns - ------- - args_dictionaries : list - The list of all the possible combination of asked arguments - - """ if framework=="multiview": classifier_config = get_path_dict(kwargs_init[classifier]) else: @@ -354,12 +322,12 @@ def init_kwargs(args, classifiers_names, framework="monoview"): ---------- args : parsed args objects All the args passed by the user. - classifiers_names : list of strings + classifiers-names : list of strings List of the benchmarks's monoview classifiers names. Returns ------- - kwargs : Dictionary + monoviewKWARGS : Dictionary of dictionaries Dictionary resuming all the specific arguments for the benchmark, one dictionary for each classifier. For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`""" @@ -383,25 +351,7 @@ def init_kwargs(args, classifiers_names, framework="monoview"): def init_kwargs_func(args, benchmark): - """ - Dispached the kwargs initialization to monoview and multiview and creates - the kwargs variable - - Parameters - ---------- - args : parsed args objects - All the args passed by the user. 
- - benchmark : dict - The name of the mono- and mutli-view classifiers to run in the benchmark - - Returns - ------- - - kwargs : dict - The arguments for each mono- and multiview algorithms - """ - monoview_kwargs = init_kwargs(args, benchmark["monoview"], framework="monoview") + monoview_kwargs = init_kwargs(args, benchmark["monoview"]) multiview_kwargs = init_kwargs(args, benchmark["multiview"], framework="multiview") kwargs = {"monoview":monoview_kwargs, "multiview":multiview_kwargs} return kwargs @@ -423,45 +373,31 @@ def init_kwargs_func(args, benchmark): # return multiview_kwargs -# def init_multiview_arguments(args, benchmark, views, views_indices, -# argument_dictionaries, random_state, directory, -# results_monoview, classification_indices): -# """Used to add each monoview exeperience args to the list of monoview experiences args""" -# logging.debug("Start:\t Initializing multiview classifiers arguments") -# multiview_arguments = [] -# if "multiview" in benchmark: -# for multiview_algo_name in benchmark["multiview"]: -# mutliview_module = getattr(multiview_classifiers, -# multiview_algo_name) -# -# multiview_arguments += mutliview_module.getArgs(args, benchmark, -# views, views_indices, -# random_state, -# directory, -# results_monoview, -# classification_indices) -# argument_dictionaries["multiview"] = multiview_arguments -# logging.debug("Start:\t Initializing multiview classifiers arguments") -# return argument_dictionaries +def init_multiview_arguments(args, benchmark, views, views_indices, + argument_dictionaries, random_state, directory, + results_monoview, classification_indices): + """Used to add each monoview exeperience args to the list of monoview experiences args""" + logging.debug("Start:\t Initializing multiview classifiers arguments") + multiview_arguments = [] + if "multiview" in benchmark: + for multiview_algo_name in benchmark["multiview"]: + mutliview_module = getattr(multiview_classifiers, + multiview_algo_name) + + multiview_arguments += mutliview_module.getArgs(args, benchmark, + views, views_indices, + random_state, + directory, + results_monoview, + classification_indices) + argument_dictionaries["multiview"] = multiview_arguments + logging.debug("Start:\t Initializing multiview classifiers arguments") + return argument_dictionaries def arange_metrics(metrics, metric_princ): """Used to get the metrics list in the right order so that - the first one is the principal metric specified in args - - Parameters - ---------- - metrics : list of lists - The metrics that will be used in the benchmark - - metric_princ : str - The name of the metric that need to be used for the hyper-parameter - optimization process - - Returns - ------- - metrics : list of lists - The metrics list, but arranged so the first one is the principal one.""" + the first one is the principal metric specified in args""" if [metric_princ] in metrics: metric_index = metrics.index([metric_princ]) first_metric = metrics[0] @@ -474,31 +410,6 @@ def arange_metrics(metrics, metric_princ): def benchmark_init(directory, classification_indices, labels, labels_dictionary, k_folds): - """ - Initializes the benchmark, by saving the indices of the train - examples and the cross validation folds. 
- - Parameters - ---------- - directory : str - The benchmark's result directory - - classification_indices : numpy array - The indices of the examples, splitted for the train/test split - - labels : numpy array - The labels of the dataset - - labels_dictionary : dict - The dictionary with labels as keys and their names as values - - k_folds : sklearn.model_selection.Folds object - The folds for the cross validation process - - Returns - ------- - - """ logging.debug("Start:\t Benchmark initialization") if not os.path.exists(os.path.dirname(directory + "train_labels.csv")): try: @@ -537,7 +448,8 @@ def exec_one_benchmark(core_index=-1, labels_dictionary=None, directory=None, benchmark=None, views=None, views_indices=None, flag=None, labels=None, exec_monoview_multicore=exec_monoview_multicore, - exec_multiview_multicore=exec_multiview_multicore,): + exec_multiview_multicore=exec_multiview_multicore, + init_multiview_arguments=init_multiview_arguments): """Used to run a benchmark using one core. ExecMonoview_multicore, initMultiviewArguments and exec_multiview_multicore args are only used for tests""" @@ -557,6 +469,14 @@ def exec_one_benchmark(core_index=-1, labels_dictionary=None, directory=None, for argument in argument_dictionaries["Monoview"]] logging.debug("Done:\t monoview benchmark") + logging.debug("Start:\t multiview arguments initialization") + # argument_dictionaries = initMultiviewArguments(args, benchmark, views, + # views_indices, + # argument_dictionaries, + # random_state, directory, + # resultsMonoview, + # classification_indices) + logging.debug("Done:\t multiview arguments initialization") logging.debug("Start:\t multiview benchmark") results_multiview = [ @@ -581,7 +501,8 @@ def exec_one_benchmark_multicore(nb_cores=-1, labels_dictionary=None, benchmark=None, views=None, views_indices=None, flag=None, labels=None, exec_monoview_multicore=exec_monoview_multicore, - exec_multiview_multicore=exec_multiview_multicore,): + exec_multiview_multicore=exec_multiview_multicore, + init_multiview_arguments=init_multiview_arguments): """Used to run a benchmark using multiple cores. 
ExecMonoview_multicore, initMultiviewArguments and exec_multiview_multicore args are only used for tests""" @@ -647,11 +568,13 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, hyper_param_search=None, metrics=None, argument_dictionaries=None, benchmark=None, views=None, views_indices=None, - flag=None, labels=None,): + flag=None, labels=None, + exec_monoview_multicore=exec_monoview_multicore, + exec_multiview_multicore=exec_multiview_multicore, + init_multiview_arguments=init_multiview_arguments): results_monoview, labels_names = benchmark_init(directory, classification_indices, labels, labels_dictionary, k_folds) - logging.getLogger('matplotlib.font_manager').disabled = True logging.debug("Start:\t monoview benchmark") for arguments in argument_dictionaries["monoview"]: X = dataset_var.get_v(arguments["view_index"]) @@ -773,8 +696,7 @@ def exec_benchmark(nb_cores, stats_iter, nb_multiclass, directory, labels_dictionary, nb_examples, - nb_labels, - dataset_var.example_ids) + nb_labels) logging.debug("Done:\t Analyzing predictions") delete(benchmark_arguments_dictionaries, nb_cores, dataset_var) return results_mean_stds diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py index 875e3763ab77ea4ea771e6653aa80e2242b87ef2..cc637d2757c1b04aae97e7b634ab852d04fdecb2 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py @@ -16,7 +16,7 @@ from . import monoview_utils from .analyze_result import execute # Import own modules from .. import monoview_classifiers -from ..utils.dataset import extract_subset, Dataset +from ..utils.dataset import get_value, extract_subset, Dataset from ..utils import hyper_parameter_search # Author-Info @@ -97,19 +97,15 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, logging.debug("Start:\t Predicting") y_train_pred = classifier.predict(X_train) y_test_pred = classifier.predict(X_test) - - #Filling the full prediction in the right order - full_pred = np.zeros(Y.shape, dtype=int) - 100 + full_labels_pred = np.zeros(Y.shape, dtype=int) - 100 for trainIndex, index in enumerate(classificationIndices[0]): - full_pred[index] = y_train_pred[trainIndex] + full_labels_pred[index] = y_train_pred[trainIndex] for testIndex, index in enumerate(classificationIndices[1]): - full_pred[index] = y_test_pred[testIndex] - + full_labels_pred[index] = y_test_pred[testIndex] if X_test_multiclass != []: y_test_multiclass_pred = classifier.predict(X_test_multiclass) else: y_test_multiclass_pred = [] - logging.debug("Done:\t Predicting") t_end = time.time() - t_start @@ -128,7 +124,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, logging.debug("Done:\t Getting results") logging.debug("Start:\t Saving preds") - saveResults(stringAnalysis, outputFileName, full_pred, y_train_pred, + saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, y_train, imagesAnalysis, y_test) logging.info("Done:\t Saving results") @@ -136,7 +132,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, if testFoldsPreds is None: testFoldsPreds = y_train_pred return monoview_utils.MonoviewResult(viewIndex, classifier_name, feat, metricsScores, - full_pred, clKWARGS, + full_labels_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds) # 
return viewIndex, [CL_type, feat, metricsScores, full_labels_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds] diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost_graalpy.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost_graalpy.py new file mode 100644 index 0000000000000000000000000000000000000000..3ea726756552ef3a00f3feb4944e8301c73b28be --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost_graalpy.py @@ -0,0 +1,277 @@ +import logging + +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils.validation import check_is_fitted + +from ..metrics import zero_one_loss +from ..monoview.additions.BoostUtils import StumpsClassifiersGenerator, \ + BaseBoost +from ..monoview.monoview_utils import CustomRandint, \ + BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero + +classifier_class_name = "AdaboostGraalpy" + +class AdaBoostGP(BaseEstimator, ClassifierMixin, BaseBoost): + """Scikit-Learn compatible AdaBoost classifier. Original code by Pascal Germain, adapted by Jean-Francis Roy. + + + Parameters + ---------- + + n_iterations : int, optional + The number of iterations of the algorithm. Defaults to 200. + + iterations_to_collect_as_hyperparameters : list + Iteration numbers to collect while learning, that will be converted as hyperparameter values at evaluation time. + Defaults to None. + classifiers_generator : Transformer, optional + A transformer to convert input samples in voters' outputs. Default: Decision stumps transformer, with 10 stumps + per attributes. + callback_function : function, optional + A function to call at each iteration that is supplied learning information. Defaults to None. + + n_stumps : int ( default : 10) + + self_complemented : boolean (default : True + + Attributes + ---------- + n_iterations : int, optional + The number of iterations of the algorithm. Defaults to 200. + iterations_to_collect_as_hyperparameters : list + Iteration numbers to collect while learning, that will be converted as hyperparameter values at evaluation time. + Defaults to None. + classifiers_generator : Transformer, optional + A transformer to convert input samples in voters' outputs. Default: Decision stumps transformer, with 10 stumps + per attributes. + callback_function : function, optional + A function to call at each iteration that is supplied learning information. Defaults to None. + + """ + + def __init__(self, n_iterations=200, + iterations_to_collect_as_hyperparameters=True, + classifiers_generator=None, callback_function=None, + n_stumps=10, self_complemented=True): + + self.n_iterations = n_iterations + self.n_stumps = n_stumps + self.iterations_to_collect_as_hyperparameters = iterations_to_collect_as_hyperparameters + self.estimators_generator = classifiers_generator + self.callback_function = callback_function + self.self_complemented = self_complemented + + def fit(self, X, y): + """Fits the algorithm on training data. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The input data. + y : ndarray of shape (n_samples, ) + The input labels. + + Returns + ------- + self + + """ + y_neg = change_label_to_minus(y) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator( + n_stumps_per_attribute=self.n_stumps, + self_complemented=self.self_complemented) + + # Step 1: We fit the classifiers generator and get its classification matrix. 
+ self.estimators_generator.fit(X, y_neg) + # hint: This is equivalent to construct a new X + classification_matrix = self._binary_classification_matrix(X) + + n_samples, n_voters = classification_matrix.shape + # logging.debug("n_voters = {}".format(n_voters)) + + # Step 2: We initialize the weights on the samples and the weak classifiers. + sample_weights = np.ones(n_samples) / n_samples + alpha_weights = np.zeros(n_voters) + self.losses = [] + + # Step 3: We loop for each iteration. + self.collected_weight_vectors_ = [] + for t in range(self.n_iterations): + + # Step 4: We find the classifier that maximizes the success, weighted by the sample weights. + classifier_successes = np.dot(classification_matrix.T, + sample_weights * y_neg) + + best_voter_index = np.argmax(classifier_successes) + success = classifier_successes[best_voter_index] + + if success >= 1.0: + logging.info("AdaBoost stopped : perfect classifier found!") + self.weights_ = np.zeros(n_voters) + self.weights_[best_voter_index] = 1.0 + return self + + # Step 5: We calculate the alpha_t parameter and update the alpha weights. + alpha = 0.5 * np.log((1.0 + success) / (1.0 - success)) + alpha_weights[best_voter_index] += alpha + + # logging.debug("{} : {}".format(t, str(alpha))) + + # Step 6: We update the sample weights. + sample_weights *= np.exp( + -1 * alpha * y_neg * classification_matrix[:, best_voter_index]) + + normalization_constant = sample_weights.sum() + sample_weights = sample_weights / normalization_constant + + # We collect iteration information for later evaluation. + if self.iterations_to_collect_as_hyperparameters: + weights = alpha_weights / np.sum(alpha_weights) + self.collected_weight_vectors_.append(weights.copy()) + + loss = zero_one_loss.score(y_neg, np.sign(np.sum( + np.multiply(classification_matrix, + alpha_weights / np.sum(alpha_weights)), axis=1))) + self.losses.append(loss) + + if self.callback_function is not None: + self.callback_function(t, alpha_weights, normalization_constant, + self.estimators_generator, self.weights_) + + self.weights_ = alpha_weights / np.sum(alpha_weights) + self.losses = np.array(self.losses) + self.learner_info_ = { + 'n_nonzero_weights': np.sum(self.weights_ > 1e-12)} + + return self + + def predict(self, X): + """Predict inputs using the fit classifier. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data to classify. + + Returns + ------- + predictions : ndarray of shape (n_samples, ) + The estimated labels. + + """ + check_is_fitted(self, 'weights_') + classification_matrix = self._binary_classification_matrix(X) + + if self.iterations_to_collect_as_hyperparameters: + self.test_preds = [] + for weight_vector in self.collected_weight_vectors_: + preds = np.sum(np.multiply(classification_matrix, + weight_vector), axis=1) + self.test_preds.append(change_label_to_zero(np.sign(preds))) + self.test_preds = np.array(self.test_preds) + margins = np.squeeze( + np.asarray(np.dot(classification_matrix, self.weights_))) + return change_label_to_zero( + np.array([int(x) for x in np.sign(margins)])) + + +class AdaboostGraalpy(AdaBoostGP, BaseMonoviewClassifier): + """AdaboostGraalpy + + Parameters + ---------- + random_state : int seed, RandomState instance, or None (default=None) + The seed of the pseudo random number generator to use when + shuffling the data. 
+ + n_iterations : in number of iterations (default : 200) + + n_stumps : int (default 1) + + kwargs : others arguments + + + Attributes + ---------- + param_names : + + distribs : + + weird_strings : + + n_stumps : + + nbCores : + + """ + def __init__(self, random_state=None, n_iterations=200, n_stumps=1, + **kwargs): + + super(AdaboostGraalpy, self).__init__( + n_iterations=n_iterations, + n_stumps=n_stumps + ) + self.param_names = ["n_iterations", "n_stumps", "random_state"] + self.distribs = [CustomRandint(low=1, high=500), [n_stumps], + [random_state]] + self.classed_params = [] + self.weird_strings = {} + self.n_stumps = n_stumps + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + # def canProbas(self): + # """ + # Used to know if the classifier can return label probabilities + # + # Returns + # ------- + # True in any case + # """ + # return True + + def getInterpret(self, directory, y_test): + """ + + Parameters + ---------- + directory : + + y_test : + + Returns + ------- + retur string of interpret + """ + np.savetxt(directory + "train_metrics.csv", self.losses, delimiter=',') + np.savetxt(directory + "y_test_step.csv", self.test_preds, + delimiter=',') + step_metrics = [] + for step_index in range(self.test_preds.shape[0] - 1): + step_metrics.append(zero_one_loss.score(y_test, + self.test_preds[step_index, + :])) + step_metrics = np.array(step_metrics) + np.savetxt(directory + "step_test_metrics.csv", step_metrics, + delimiter=',') + return "" + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"n_iterations": args.AdG_n_iter, +# "n_stumps": args.AdG_stumps, } +# return kwargsDict + + +def paramsToSet(nIter, random_state): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"n_iterations": random_state.randint(1, 500), }) + return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/cq_boost.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/cq_boost.py new file mode 100644 index 0000000000000000000000000000000000000000..fc9b44ed7d608d61b084d1915a6ee6084dbea05a --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/cq_boost.py @@ -0,0 +1,76 @@ +import numpy as np + +from ..monoview.additions.BoostUtils import getInterpretBase +from ..monoview.additions.CQBoostUtils import ColumnGenerationClassifier +from ..monoview.monoview_utils import CustomUniform, CustomRandint, \ + BaseMonoviewClassifier + +classifier_class_name = "CQBoost" + +class CQBoost(ColumnGenerationClassifier, BaseMonoviewClassifier): + + def __init__(self, random_state=None, mu=0.01, epsilon=1e-06, n_stumps=1, + n_max_iterations=None, estimators_generator="Stumps", + max_depth=1, **kwargs): + super(CQBoost, self).__init__( + random_state=random_state, + mu=mu, + epsilon=epsilon, + estimators_generator=estimators_generator, + n_max_iterations=n_max_iterations, + max_depth=max_depth + ) + self.param_names = ["mu", "epsilon", "n_stumps", "random_state", + "n_max_iterations", "estimators_generator", + "max_depth"] + self.distribs = [CustomUniform(loc=0.5, state=1.0, multiplier="e-"), + CustomRandint(low=1, high=15, multiplier="e-"), + [n_stumps], [random_state], [n_max_iterations], + ["Stumps", "Trees"], CustomRandint(low=1, high=5)] + self.classed_params = [] + self.weird_strings = {} + self.n_stumps = n_stumps + if "nbCores" not in kwargs: + 
self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + # def canProbas(self): + # """Used to know if the classifier can return label probabilities""" + # return False + + def getInterpret(self, directory, y_test): + np.savetxt(directory + "train_metrics.csv", self.train_metrics, + delimiter=',') + np.savetxt(directory + "c_bounds.csv", self.c_bounds, + delimiter=',') + np.savetxt(directory + "y_test_step.csv", self.step_decisions, + delimiter=',') + step_metrics = [] + for step_index in range(self.step_decisions.shape[1] - 1): + step_metrics.append(self.plotted_metric.score(y_test, + self.step_decisions[:, + step_index])) + step_metrics = np.array(step_metrics) + np.savetxt(directory + "step_test_metrics.csv", step_metrics, + delimiter=',') + return getInterpretBase(self, directory, "CQBoost", self.weights_, + y_test) + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"mu": args.CQB_mu, +# "epsilon": args.CQB_epsilon, +# "n_stumps": args.CQB_stumps, +# "n_max_iterations": args.CQB_n_iter} +# return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"mu": 10 ** -randomState.uniform(0.5, 1.5), + "epsilon": 10 ** -randomState.randint(1, 15)}) + return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq.py new file mode 100644 index 0000000000000000000000000000000000000000..ec0bd7e7c56b46720afd2e759cec7a65957d6acd --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq.py @@ -0,0 +1,652 @@ +from ..monoview.monoview_utils import CustomUniform, BaseMonoviewClassifier + +#### Algorithm code #### + +# -*- coding:utf-8 -*- +""" MinCq learning algorithm + +Related papers: +[1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011) +[2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2014) + +http://graal.ift.ulaval.ca/majorityvote/ +""" +__author__ = 'Jean-Francis Roy' +import time +import logging +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics.pairwise import rbf_kernel, linear_kernel, \ + polynomial_kernel +# from qp import QP +from ..monoview.additions.BoostUtils import ConvexProgram as QP + + +classifier_class_name = "MinCQ" + +# from majority_vote import MajorityVote +# from voter import StumpsVotersGenerator, KernelVotersGenerator + +class MinCqLearner(BaseEstimator, ClassifierMixin): + """ + MinCq algorithm learner. See [1, 2] + + Parameters + ---------- + mu : float + The fixed value of the first moment of the margin. + + voters_type : string, optional (default='kernel') + Specifies the type of voters. + It must be one of 'kernel', 'stumps' or 'manual'. If 'manual' is specified, the voters have to be manually set + using the "voters" parameter of the fit function. + + n_stumps_per_attribute : int, optional (default=10) + Specifies the amount of decision stumps per attribute. + It is only significant with 'stumps' voters_type. + + kernel : string, optional (default='rbf') + Specifies the kernel type to be used in the algorithm. + It must be one of 'linear', 'poly', 'rbf'. + + degree : int, optional (default=3) + Degree of the polynomial kernel function ('poly'). + Ignored by all other kernels. 
+ + gamma : float, optional (default=0.0) + Kernel coefficient for 'rbf' and 'poly'. + If gamma is 0.0 then 1/n_features will be used instead. + """ + + def __init__(self, mu, voters_type, n_stumps_per_attribute=10, kernel='rbf', + degree=3, gamma=0.0, self_complemented=True): + assert 0 < mu <= 1, "MinCqLearner: mu parameter must be in (0, 1]" + self.mu = mu + self.voters_type = voters_type + self.n_stumps_per_attribute = n_stumps_per_attribute + self.kernel = kernel + self.degree = degree + self.gamma = gamma + self.log = False + self.self_complemented = self_complemented + + self.majority_vote = None + self.qp = None + + def fit(self, X, y, voters=None): + """ Learn a majority vote weights using MinCq. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Training data + + y_reworked : ndarray, shape=(n_samples,), optional + Training labels + + voters : shape=(n_voters,), optional + A priori generated voters + """ + # Preparation of the majority vote, using a voter generator that depends on class attributes + if (np.unique(y) != [-1, 1]).any(): + y_reworked = np.copy(y) + y_reworked[np.where(y_reworked == 0)] = -1 + else: + y_reworked = y + + assert self.voters_type in ['stumps', 'kernel', + 'manual'], "MinCqLearner: voters_type must be 'stumps', 'kernel' or 'manual'" + + if self.voters_type == 'manual': + if voters is None: + logging.error( + "Manually set voters is True, but no voters have been set.") + return self + + else: + voters_generator = None + + if self.voters_type == 'stumps': + assert self.n_stumps_per_attribute >= 1, 'MinCqLearner: n_stumps_per_attribute must be positive' + voters_generator = StumpsVotersGenerator( + self.n_stumps_per_attribute) + + elif self.voters_type == 'kernel': + assert self.kernel in ['linear', 'poly', + 'rbf'], "MinCqLearner: kernel must be 'linear', 'poly' or 'rbf'" + + gamma = self.gamma + if gamma == 0.0: + gamma = 1.0 / np.shape(X)[1] + + if self.kernel == 'linear': + voters_generator = KernelVotersGenerator(linear_kernel) + elif self.kernel == 'poly': + voters_generator = KernelVotersGenerator(polynomial_kernel, + degree=self.degree, + gamma=gamma) + elif self.kernel == 'rbf': + voters_generator = KernelVotersGenerator(rbf_kernel, + gamma=gamma) + + voters = voters_generator.generate(X, y_reworked, + self_complemented=self.self_complemented) + + if self.log: + logging.info("MinCq training started...") + logging.info("Training dataset shape: {}".format(str(np.shape(X)))) + logging.info("Number of voters: {}".format(len(voters))) + self.majority_vote = MajorityVote(voters) + n_base_voters = len(self.majority_vote.weights) + + # Preparation and resolution of the quadratic program + + if self.log: + logging.info("Preparing QP...") + self._prepare_qp(X, y_reworked) + beg = time.time() + try: + if self.log: + logging.info("Solving QP...") + solver_weights = self.qp.solve() + + # Conversion of the weights of the n first voters to weights on the implicit 2n voters. + # See Section 7.1 of [2] for an explanation. 
+ self.majority_vote.weights = np.array( + [2 * q - 1.0 / n_base_voters for q in solver_weights]) + if self.log: + logging.info( + "First moment of the margin on the training set: {:.4f}".format( + np.mean(y_reworked * self.majority_vote.margin(X)))) + + except Exception as e: + logging.error( + "{}: Error while solving the quadratic program: {}.".format( + str(self), str(e))) + self.majority_vote = None + self.cbound_train = self.majority_vote.cbound_value(X, y_reworked) + end=time.time() + self.train_time=end-beg + return self + + def predict(self, X, save_data=True): + """ Using previously learned majority vote weights, predict the labels of new data points. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Samples to predict + + Returns + ------- + predictions : ndarray, shape=(n_samples,) + The predicted labels + """ + if self.log: + logging.info("Predicting...") + if self.majority_vote is None: + logging.error( + "{}: Error while predicting: MinCq has not been fit or fitting has failed. Will output invalid labels".format( + str(self))) + return np.zeros((len(X),)) + if save_data: + self.x_test = X + + vote = self.majority_vote.vote(X) + vote[np.where(vote == -1)] = 0 + return vote + + def predict_proba(self, X): + """ Using previously learned majority vote weights, predict the labels of new data points with a confidence + level. The confidence level is the margin of the majority vote. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Samples to predict + + Returns + ------- + predictions : ndarray, shape=(n_samples,) + The predicted labels + """ + probabilities = np.zeros((np.shape(X)[0], 2)) + + # The margin is between -1 and 1, we rescale it to be between 0 and 1. + margins = self.majority_vote.margin(X) + margins += 1 + margins /= 2 + + # Then, the conficence for class +1 is set to the margin, and confidence for class -1 is set to 1 - margin. + probabilities[:, 1] = margins + probabilities[:, 0] = 1 - margins + return probabilities + + def _prepare_qp(self, X, y): + """ Prepare MinCq's quadratic program. See Program 1 of [2] for more details on its content. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Training data + + y : ndarray, shape=(n_samples,) + Training labels + """ + + self.qp = QP() + + n_features = len(self.majority_vote.voters) + n_examples = len(X) + classification_matrix = self.majority_vote.classification_matrix(X) + + # Objective function. + self.qp.quadratic_func = 2.0 / n_examples * classification_matrix.T.dot( + classification_matrix) + self.qp.linear_func = np.matrix( + np.matrix(-1.0 * np.mean(self.qp.quadratic_func / 2.0, axis=1))).T + + # First moment of the margin fixed to mu. + a_matrix = 2.0 / n_examples * y.T.dot(classification_matrix) + self.qp.add_equality_constraints(a_matrix, + self.mu + 1.0 / 2 * np.mean(a_matrix)) + + # Lower and upper bounds on the variables + self.qp.add_lower_bound(0.0) + self.qp.add_upper_bound(1.0 / n_features) + + +class MajorityVote(object): + """ A Majority Vote of real-valued functions. + + Parameters + ---------- + voters : ndarray of Voter instances + The voters of the majority vote. Each voter must take an example as an input, and output a real value in [-1,1]. + + weights : ndarray, optional (default: uniform distribution) + The weights associated to each voter. 
+ """ + + def __init__(self, voters, weights=None): + self._voters = np.array(voters) + + if weights is not None: + assert (len(voters) == len(weights)) + self._weights = np.array(weights) + else: + self._weights = np.array([1.0 / len(voters)] * len(voters)) + + def vote(self, X): + """ Returns the vote of the Majority Vote on a list of samples. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Input data to classify. + + Returns + ------- + votes : ndarray, shape=(n_samples,), where each value is either -1 or 1 + The vote of the majority vote for each sample. + """ + margins = self.margin(X) + return np.array([int(x) for x in np.sign(margins)]) + + def margin(self, X): + """ Returns the margin of the Majority Vote on a list of samples. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Input data on which to calculate the margin. + + Returns + ------- + margins : ndarray, shape=(n_samples,), where each value is either -1 or 1 + The margin of the majority vote for each sample. + """ + classification_matrix = self.classification_matrix(X) + return np.squeeze( + np.asarray(np.dot(classification_matrix, self.weights))) + + def classification_matrix(self, X): + """ Returns the classification matrix of the majority vote. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Input data to classify + + Returns + ------- + classification_matrix : ndrray, shape=(n_samples, n_voters) + A matrix that contains the value output by each voter, for each sample. + + """ + return np.matrix([v.vote(X) for v in self._voters]).T + + @property + def weights(self): + return self._weights + + @weights.setter + def weights(self, weights): + self._weights = np.array(weights) + + @property + def voters(self): + return self._voters + + @voters.setter + def voters(self, voters): + self._voters = np.array(voters) + + def cbound_value(self, X, y): + """ Returns the value of the C-bound, evaluated on given examples. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_feature) + Input data + y : ndarray, shape=(n_samples, ) + Input labels, where each label is either -1 or 1. + """ + assert np.all(np.in1d(y, [-1, + 1])), 'cbound_value: labels should be either -1 or 1' + + classification_matrix = self.classification_matrix(X) + first_moment = float( + 1.0 / len(y) * classification_matrix.dot(self.weights).dot(y)) + second_moment = float(1.0 / len(y) * self.weights.T.dot( + classification_matrix.T.dot(classification_matrix)).dot( + self.weights)) + + return 1 - (first_moment ** 2 / second_moment) + + +# -*- coding:utf-8 -*- +__author__ = "Jean-Francis Roy" + +import numpy as np + + +class Voter(object): + """ Base class for a voter (function X -> [-1, 1]), where X is an array of samples + """ + + def __init__(self): + pass + + def vote(self, X): + """ Returns the output of the voter, on a sample list X + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Input data to classify + + Returns + ------- + votes : ndarray, shape=(n_samples,) + The result the the voter function, for each sample + """ + raise NotImplementedError("Voter.vote: Not implemented.") + + +class BinaryKernelVoter(Voter): + """ A Binary Kernel Voter, which outputs the value of a kernel function whose first example is fixed a priori. 
+ The sign of the output depends on the label (-1 or 1) of the sample on which the kernel voter is based + + Parameters + ---------- + x : ndarray, shape=(n_features,) + The base sample's description vector + + y : int, -1 or 1 + The label of the base sample. Determines if the voter thinks "negative" or "positive" + + kernel_function : function + The kernel function takes two samples and returns a similarity value. If the kernel has parameters, they should + be set using kwargs parameter + + kwargs : keyword arguments (optional) + Additional parameters for the kernel function + """ + + def __init__(self, x, y, kernel_function, **kwargs): + assert (y in {-1, 1}) + super(BinaryKernelVoter, self).__init__() + self._x = x + self._y = y + self._kernel_function = kernel_function + self._kernel_kwargs = kwargs + + def vote(self, X): + base_point_array = np.array([self._x]) + votes = self._y * self._kernel_function(base_point_array, X, + **self._kernel_kwargs) + votes = np.squeeze(np.asarray(votes)) + + return votes + + +class DecisionStumpVoter(Voter): + """ + Generic Attribute Threshold Binary Classifier + + Parameters + ---------- + attribute_index : int + The attribute to consider for the classification + + threshold : float + The threshold value for classification rule + + direction : int (-1 or 1) + Used to reverse classification decision + + Attributes + ---------- + + attribute_index : + threshold : + direction : + """ + + def __init__(self, attribute_index, threshold, direction=1): + super(DecisionStumpVoter, self).__init__() + self.attribute_index = attribute_index + self.threshold = threshold + self.direction = direction + + def vote(self, points): + return [((point[ + self.attribute_index] > self.threshold) * 2 - 1) * self.direction + for point in points] + + +class VotersGenerator(object): + """ Base class to create a set of voters using training samples + """ + + def generate(self, X, y=None, self_complemented=False): + """ Generates the voters using samples. + + Parameters + ---------- + X : ndarray, shape=(n_samples, n_features) + Input data on which to base the voters + + y : ndarray, shape=(n_samples,), optional + Input labels, usually determines the decision polarity of each voter + + self_complemented : bool + Determines if complement voters should be generated or not + + Returns + ------- + voters : ndarray + An array of voters + """ + raise NotImplementedError("VotersGenerator.generate: not implemented") + + +class StumpsVotersGenerator(VotersGenerator): + """ Decision Stumps Voters generator. + + Parameters + ---------- + n_stumps_per_attribute : int, (default=10) + Determines how many decision stumps will be created for each attribute. + """ + + def __init__(self, n_stumps_per_attribute=10): + self._n_stumps_per_attribute = n_stumps_per_attribute + + def _find_extremums(self, X, i): + mini = np.Infinity + maxi = -np.Infinity + for x in X: + if x[i] < mini: + mini = x[i] + if x[i] > maxi: + maxi = x[i] + return mini, maxi + + def generate(self, X, y=None, self_complemented=False, + only_complements=False): + """ + + Parameters + ---------- + X + y + self_complemented + only_complements + + Returns + ------- + + """ + voters = [] + if len(X) != 0: + for i in range(len(X[0])): + t = self._find_extremums(X, i) + inter = (t[1] - t[0]) / (self._n_stumps_per_attribute + 1) + + if inter != 0: + # If inter is zero, the attribute is useless as it has a constant value. We do not add stumps for + # this attribute. 
+ for x in range(self._n_stumps_per_attribute): + + if not only_complements: + voters.append( + DecisionStumpVoter(i, t[0] + inter * (x + 1), + 1)) + + if self_complemented or only_complements: + voters.append( + DecisionStumpVoter(i, t[0] + inter * (x + 1), + -1)) + + return np.array(voters) + + +class KernelVotersGenerator(VotersGenerator): + """ Utility function to create binary kernel voters for each (x, y) sample. + + Parameters + ---------- + kernel_function : function + The kernel function takes two samples and returns a similarity value. If the kernel has parameters, they should + be set using kwargs parameter + + kwargs : keyword arguments (optional) + Additional parameters for the kernel function + """ + + def __init__(self, kernel_function, **kwargs): + self._kernel_function = kernel_function + self._kernel_kwargs = kwargs + + def generate(self, X, y=None, self_complemented=False, + only_complements=False): + if y is None: + y = np.array([1] * len(X)) + + voters = [] + + for point, label in zip(X, y): + if not only_complements: + voters.append( + BinaryKernelVoter(point, label, self._kernel_function, + **self._kernel_kwargs)) + + if self_complemented or only_complements: + voters.append( + BinaryKernelVoter(point, -1 * label, self._kernel_function, + **self._kernel_kwargs)) + + return np.array(voters) + + +class MinCQ(MinCqLearner, BaseMonoviewClassifier): + + def __init__(self, random_state=None, mu=0.01, self_complemented=True, + n_stumps_per_attribute=10, **kwargs): + super(MinCQ, self).__init__(mu=mu, + voters_type='stumps', + n_stumps_per_attribute=n_stumps_per_attribute, + self_complemented=self_complemented + ) + self.param_names = ["mu", "n_stumps_per_attribute", "random_state"] + self.distribs = [CustomUniform(loc=0.5, state=2.0, multiplier="e-"), + [n_stumps_per_attribute], [random_state]] + self.random_state = random_state + self.classed_params = [] + self.weird_strings = {} + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + # def canProbas(self): + # """Used to know if the classifier can return label probabilities""" + # return True + + def set_params(self, **params): + self.mu = params["mu"] + self.random_state = params["random_state"] + self.n_stumps_per_attribute = params["n_stumps_per_attribute"] + return self + + def get_params(self, deep=True): + return {"random_state": self.random_state, "mu": self.mu, + "n_stumps_per_attribute": self.n_stumps_per_attribute} + + def getInterpret(self, directory, y_test): + interpret_string = "Train C_bound value : " + str(self.cbound_train) + y_rework = np.copy(y_test) + y_rework[np.where(y_rework == 0)] = -1 + interpret_string += "\n Test c_bound value : " + str( + self.majority_vote.cbound_value(self.x_test, y_rework)) + np.savetxt(directory+"times.csv", np.array([self.train_time, 0])) + return interpret_string + + def get_name_for_fusion(self): + return "MCQ" + +# +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"mu": args.MCQ_mu, +# "n_stumps_per_attribute": args.MCQ_stumps} +# return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy.py new file mode 100644 index 
0000000000000000000000000000000000000000..8355dffc1a47dda9290a6cd57bbede64890d3454 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy.py @@ -0,0 +1,141 @@ +import numpy as np + +from ..monoview.additions.BoostUtils import StumpsClassifiersGenerator +from ..monoview.additions.MinCQUtils import RegularizedBinaryMinCqClassifier +from ..monoview.monoview_utils import BaseMonoviewClassifier, CustomUniform + + +classifier_class_name = "MinCQGraalpy" + +class MinCQGraalpy(RegularizedBinaryMinCqClassifier, BaseMonoviewClassifier): + """ + MinCQGraalpy extend of ``RegularizedBinaryMinCqClassifier `` + + Parameters + ---------- + random_state : int seed, RandomState instance, or None (default=None) + The seed of the pseudo random number generator to use when + shuffling the data. + + mu : float, (default: 0.01) + + self_complemented : bool (default : True) + + n_stumps_per_attribute : (default: =1 + + kwargs : others arguments + + + Attributes + ---------- + param_names + + distribs + + n_stumps_per_attribute + + classed_params + + weird_strings + + nbCores : number of cores + + """ + def __init__(self, random_state=None, mu=0.01, self_complemented=True, + n_stumps_per_attribute=1, **kwargs): + super(MinCQGraalpy, self).__init__(mu=mu, + estimators_generator=StumpsClassifiersGenerator( + n_stumps_per_attribute=n_stumps_per_attribute, + self_complemented=self_complemented), + ) + self.param_names = ["mu", "n_stumps_per_attribute", "random_state"] + self.distribs = [CustomUniform(loc=0.05, state=2.0, multiplier="e-"), + [n_stumps_per_attribute], [random_state]] + self.n_stumps_per_attribute = n_stumps_per_attribute + self.classed_params = [] + self.weird_strings = {} + self.random_state = random_state + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + # def canProbas(self): + # """ + # Used to know if the classifier can return label probabilities + # Returns + # ------- + # False + # """ + # return False + + def set_params(self, **params): + """ + set parameter 'self.mu', 'self.random_state + 'self.n_stumps_per_attribute + + Parameters + ---------- + params + + Returns + ------- + self : object + Returns self. 
+ """ + self.mu = params["mu"] + self.random_state = params["random_state"] + self.n_stumps_per_attribute = params["n_stumps_per_attribute"] + return self + + def get_params(self, deep=True): + """ + + Parameters + ---------- + deep : bool (default : true) not used + + Returns + ------- + dictianary with "random_state", "mu", "n_stumps_per_attribute" + """ + return {"random_state": self.random_state, "mu": self.mu, + "n_stumps_per_attribute": self.n_stumps_per_attribute} + + def getInterpret(self, directory, y_test): + """ + + Parameters + ---------- + directory + y_test + + Returns + ------- + string of interpret_string + """ + interpret_string = "Cbound on train :" + str(self.train_cbound) + np.savetxt(directory + "times.csv", np.array([self.train_time, 0])) + # interpret_string += "Train C_bound value : "+str(self.cbound_train) + # y_rework = np.copy(y_test) + # y_rework[np.where(y_rework==0)] = -1 + # interpret_string += "\n Test c_bound value : "+str(self.majority_vote.cbound_value(self.x_test, y_rework)) + return interpret_string + + def get_name_for_fusion(self): + return "MCG" + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"mu": args.MCG_mu, +# "n_stumps_per_attribute": args.MCG_stumps} +# return kwargsDict + + +def paramsToSet(nIter, random_state): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy_tree.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy_tree.py new file mode 100644 index 0000000000000000000000000000000000000000..ac7a409d82e0b0698f1af913b4c1f2f41b9114d6 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy_tree.py @@ -0,0 +1,152 @@ +import numpy as np + +from ..monoview.additions.BoostUtils import TreeClassifiersGenerator +from ..monoview.additions.MinCQUtils import RegularizedBinaryMinCqClassifier +from ..monoview.monoview_utils import BaseMonoviewClassifier, CustomUniform + +classifier_class_name = "MinCQGraalpyTree" + +class MinCQGraalpyTree(RegularizedBinaryMinCqClassifier, + BaseMonoviewClassifier): + """ + + Parameters + ---------- + random_state : + + mu : (default : 0.01) + + self_complemented : ( default : True) + + n_stumps_per_attribute : int ( default : 1) + max_depth : + + kwargs : others parameters + + + Attributes + ---------- + param_name : + + distribs : + + classed_params : + + n_stumps_per_attribute : int + + weird_strings : + + max_depth : + + random_state : + + nbCores : + """ + def __init__(self, random_state=None, mu=0.01, self_complemented=True, + n_stumps_per_attribute=1, max_depth=2, **kwargs): + + super(MinCQGraalpyTree, self).__init__(mu=mu, + estimators_generator=TreeClassifiersGenerator( + n_trees=n_stumps_per_attribute, + max_depth=max_depth, + self_complemented=self_complemented), + ) + self.param_names = ["mu", "n_stumps_per_attribute", "random_state", + "max_depth"] + self.distribs = [CustomUniform(loc=0.05, state=2.0, multiplier="e-"), + [n_stumps_per_attribute], [random_state], [max_depth]] + self.n_stumps_per_attribute = n_stumps_per_attribute + self.classed_params = [] + self.weird_strings = {} + self.max_depth = max_depth + self.random_state = random_state + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + # def canProbas(self): + # """ + # 
Used to know if the classifier can return label probabilities + # + # Returns + # ------- + # True + # """ + # return True + + def set_params(self, **params): + """ + set parameter in the input dictionary + + Parameters + ---------- + params : dict parameter to set + + Returns + ------- + self : object + Returns self. + """ + self.mu = params["mu"] + self.random_state = params["random_state"] + self.n_stumps_per_attribute = params["n_stumps_per_attribute"] + self.max_depth = params["max_depth"] + return self + + def get_params(self, deep=True): + """ + get parameter + + Parameters + ---------- + deep : (boolean (default : True) not used + + Returns + ------- + dictionary of parameter as key and its values + """ + return {"random_state": self.random_state, "mu": self.mu, + "n_stumps_per_attribute": self.n_stumps_per_attribute, + "max_depth": self.max_depth} + + def getInterpret(self, directory, y_test): + """ + + Parameters + ---------- + directory : + + y_test : + + + Returns + ------- + string for interpretation interpret_string + """ + interpret_string = "Cbound on train :" + str(self.train_cbound) + np.savetxt(directory + "times.csv", np.array([self.train_time, 0])) + # interpret_string += "Train C_bound value : "+str(self.cbound_train) + # y_rework = np.copy(y_test) + # y_rework[np.where(y_rework==0)] = -1 + # interpret_string += "\n Test c_bound value : "+str(self.majority_vote.cbound_value(self.x_test, y_rework)) + return interpret_string + + def get_name_for_fusion(self): + return "MCG" + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"mu": args.MCGT_mu, +# "n_stumps_per_attribute": args.MCGT_trees, +# "max_depth": args.MCGT_max_depth} +# return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm.py new file mode 100644 index 0000000000000000000000000000000000000000..eb829fb97321b974951aa0802661050f3af59c54 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm.py @@ -0,0 +1,125 @@ +from pyscm.scm import SetCoveringMachineClassifier as scm + +from ..monoview.monoview_utils import CustomRandint, CustomUniform, \ + BaseMonoviewClassifier + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +# class DecisionStumpSCMNew(scm, BaseEstimator, ClassifierMixin): +# """docstring for SCM +# A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like +# CV, gridsearch, and so on ...""" +# +# def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): +# super(DecisionStumpSCMNew, self).__init__(model_type=model_type, max_rules=max_rules, p=p, random_state=random_state) +# # self.model_type = model_type +# # self.p = p +# # self.max_rules = max_rules +# # self.random_state = random_state +# # self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) +# +# # def fit(self, X, y): +# # print(self.clf.model_type) +# # self.clf.fit(X=X, y=y) +# # +# # def predict(self, X): +# # return self.clf.predict(X) +# # +# # def set_params(self, **params): +# # for key, value in iteritems(params): +# # if key == 
'p': +# # self.p = value +# # if key == 'model_type': +# # self.model_type = value +# # if key == 'max_rules': +# # self.max_rules = value +# +# # def get_stats(self): +# # return {"Binary_attributes": self.clf.model_.rules} + + +classifier_class_name = "SCM" + +class SCM(scm, BaseMonoviewClassifier): + """ + SCM Classifier + Parameters + ---------- + random_state (default : None) + model_type : string (default: "conjunction") + max_rules : int number maximum of rules (default : 10) + p : float value(default : 0.1 ) + + kwarg : others arguments + + Attributes + ---------- + param_names + + distribs + + classed_params + + weird_strings + + """ + + def __init__(self, random_state=None, model_type="conjunction", + max_rules=10, p=0.1, **kwargs): + """ + + Parameters + ---------- + random_state + model_type + max_rules + p + kwargs + """ + super(SCM, self).__init__( + random_state=random_state, + model_type=model_type, + max_rules=max_rules, + p=p + ) + self.param_names = ["model_type", "max_rules", "p", "random_state"] + self.distribs = [["conjunction", "disjunction"], + CustomRandint(low=1, high=15), + CustomUniform(loc=0, state=1), [random_state]] + self.classed_params = [] + self.weird_strings = {} + + # def canProbas(self): + # """ + # Used to know if the classifier can return label probabilities + # + # Returns + # ------- + # return False in any case + # """ + # return False + + def getInterpret(self, directory, y_test): + interpretString = "Model used : " + str(self.model_) + return interpretString + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"model_type": args.SCM_model_type, +# "p": args.SCM_p, +# "max_rules": args.SCM_max_rules} +# return kwargsDict + + +def paramsToSet(nIter, random_state): + paramsSet = [] + for _ in range(nIter): + paramsSet.append( + {"model_type": random_state.choice(["conjunction", "disjunction"]), + "max_rules": random_state.randint(1, 15), + "p": random_state.random_sample()}) + return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm_pregen.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm_pregen.py new file mode 100644 index 0000000000000000000000000000000000000000..4b7ea990f2f5fd0b3d09acc14952e98770509fd7 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm_pregen.py @@ -0,0 +1,203 @@ +import os + +import numpy as np +from pyscm.scm import SetCoveringMachineClassifier as scm + +from ..monoview.additions.PregenUtils import PregenClassifier +from ..monoview.monoview_utils import CustomRandint, CustomUniform, \ + BaseMonoviewClassifier + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + +classifier_class_name = "SCMPregen" + +class SCMPregen(BaseMonoviewClassifier, PregenClassifier, scm): + """ + + Parameters + ---------- + random_state : int seed, RandomState instance, or None (default=None) + The seed of the pseudo random number generator to use when + shuffling the data. 
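+        The random state is also used to draw the unique suffix of the
+        temporary CSV files written by `fit` and `predict`.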
+ + model_type + max_rules + p + n_stumps + self_complemented + estimators_generator + max_depth + kwargs + + Attributes + ---------- + param_names + + distribs + classed_params + weird_strings + self_complemented + n_stumps + estimators_generator + max_depth + """ + def __init__(self, random_state=None, model_type="conjunction", + max_rules=10, p=0.1, n_stumps=10, self_complemented=True, + estimators_generator="Stumps", max_depth=1, **kwargs): + super(SCMPregen, self).__init__( + random_state=random_state, + model_type=model_type, + max_rules=max_rules, + p=p + ) + self.param_names = ["model_type", "max_rules", "p", "n_stumps", + "random_state", "estimators_generator", "max_depth"] + self.distribs = [["conjunction", "disjunction"], + CustomRandint(low=1, high=15), + CustomUniform(loc=0, state=1), [n_stumps], + [random_state], ["Stumps", "Tree"], + CustomRandint(low=1, high=5)] + self.classed_params = [] + self.weird_strings = {} + self.self_complemented = self_complemented + self.n_stumps = n_stumps + self.estimators_generator = estimators_generator + self.max_depth=1 + + def get_params(self, deep=True): + """ + + Parameters + ---------- + deep : boolean (default : True) not used + + Returns + ------- + parameters dictionary + """ + params = super(SCMPregen, self).get_params(deep) + params["estimators_generator"] = self.estimators_generator + params["max_depth"] = self.max_depth + params["n_stumps"] = self.n_stumps + return params + + def fit(self, X, y, tiebreaker=None, iteration_callback=None, + **fit_params): + """ + fit function + + Parameters + ---------- + X {array-like, sparse matrix}, shape (n_samples, n_features) + For kernel="precomputed", the expected shape of X is + (n_samples_test, n_samples_train). + + y : { array-like, shape (n_samples,) + Target values class labels in classification + + tiebreaker + + iteration_callback : (default : None) + + fit_params : others parameters + + Returns + ------- + self : object + Returns self. + """ + pregen_X, _ = self.pregen_voters(X, y) + list_files = os.listdir(".") + a = int(self.random_state.randint(0, 10000)) + if "pregen_x" + str(a) + ".csv" in list_files: + a = int(np.random.randint(0, 10000)) + file_name = "pregen_x" + str(a) + ".csv" + while file_name in list_files: + a = int(np.random.randint(0, 10000)) + file_name = "pregen_x" + str(a) + ".csv" + else: + file_name = "pregen_x" + str(a) + ".csv" + np.savetxt(file_name, pregen_X, delimiter=',') + place_holder = np.genfromtxt(file_name, delimiter=',') + os.remove(file_name) + super(SCMPregen, self).fit(place_holder, y, tiebreaker=tiebreaker, + iteration_callback=iteration_callback, + **fit_params) + return self + + def predict(self, X): + """ + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples + and n_features is the number of features. + For kernel="precomputed", the expected shape of X is + (n_samples, n_samples). 
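+            As in `fit`, X is first transformed by the pregenerated voters
+            and written to a temporary CSV file that is read back and removed
+            before the underlying SCM model is called.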
+ + Returns + ------- + y_pred : array, shape (n_samples,) + """ + pregen_X, _ = self.pregen_voters(X) + list_files = os.listdir(".") + a = int(self.random_state.randint(0, 10000)) + if "pregen_x" + str(a) + ".csv" in list_files: + a = int(np.random.randint(0, 10000)) + file_name = "pregen_x" + str(a) + ".csv" + while file_name in list_files: + a = int(np.random.randint(0, 10000)) + file_name = "pregen_x" + str(a) + ".csv" + else: + file_name = "pregen_x" + str(a) + ".csv" + np.savetxt(file_name, pregen_X, delimiter=',') + place_holder = np.genfromtxt(file_name, delimiter=',') + os.remove(file_name) + return self.classes_[self.model_.predict(place_holder)] + + # def canProbas(self): + # """ + # Used to know if the classifier can return label probabilities + # Returns + # ------- + # False in any case + # """ + # + # return False + + def getInterpret(self, directory, y_test): + """ + + Parameters + ---------- + directory + y_test + + Returns + ------- + interpret_string string of interpretation + """ + interpret_string = "Model used : " + str(self.model_) + return interpret_string + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"model_type": args.SCP_model_type, +# "p": args.SCP_p, +# "max_rules": args.SCP_max_rules, +# "n_stumps": args.SCP_stumps} +# return kwargsDict + + +def paramsToSet(nIter, randomState): + paramsSet = [] + for _ in range(nIter): + paramsSet.append( + {"model_type": randomState.choice(["conjunction", "disjunction"]), + "max_rules": randomState.randint(1, 15), + "p": randomState.random_sample()}) + return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py index b4a0e3d74a2097cd3e36d34e0740d1db10989cf2..4d77b7fd460fe3295a72d5384c9a1eca2894269e 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py @@ -37,14 +37,12 @@ class SGD(SGDClassifier, BaseMonoviewClassifier): """ def __init__(self, random_state=None, loss='hinge', - penalty='l2', alpha=0.0001, max_iter=5, tol=None, **kwargs): + penalty='l2', alpha=0.0001, **kwargs): super(SGD, self).__init__( loss=loss, penalty=penalty, alpha=alpha, - max_iter=5, - tol=None, random_state=random_state ) self.param_names = ["loss", "penalty", "alpha", "random_state"] diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py index aa305849e6903b42bf63eb9e7b440ec3a20f85c6..90637f5d70b256275e0cad083701c3f748b2a422 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py @@ -66,8 +66,14 @@ def getTotalMetricScores(metric, trainLabels, testLabels, validationIndices, enumerate(metric[1])) else: metricKWARGS = {} - trainScore = metricModule.score(labels[learningIndices], trainLabels, + try: + trainScore = metricModule.score(labels[learningIndices], trainLabels, **metricKWARGS) + except: + print(labels[learningIndices]) + print(trainLabels) + import pdb; + pdb.set_trace() testScore = metricModule.score(labels[validationIndices], testLabels, **metricKWARGS) return [trainScore, testScore] diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py 
b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py index b5020c21c1d641beb0ad28690e07398648c883b8..85bf7742c98b56261f1c5faf0e756b5e9bedc7d6 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py @@ -11,6 +11,7 @@ from .multiview_utils import MultiviewResult from . import analyze_results from .. import multiview_classifiers from ..utils import hyper_parameter_search +from ..utils.dataset import get_shape # Author-Info __author__ = "Baptiste Bauvin" diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py index 4c5e34719f0692260492bd4b1b95524a1d756bb5..8006f46e71ba3c90d4f5626d765045750cfe13bf 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py @@ -16,14 +16,11 @@ class MultiviewResult(object): self.y_test_multiclass_pred = test_labels_multiclass def get_classifier_name(self): - try: - multiview_classifier_module = getattr(multiview_classifiers, - self.classifier_name) - multiview_classifier = getattr(multiview_classifier_module, - multiview_classifier_module.classifier_class_name)(42) - return multiview_classifier.short_name - except: - return self.classifier_name + multiview_classifier_module = getattr(multiview_classifiers, + self.classifier_name) + multiview_classifier = getattr(multiview_classifier_module, + multiview_classifier_module.classifier_class_name)(42) + return multiview_classifier.short_name def get_names(classed_list): diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dc8665a06cb54657c49364482cfdcdbc046ca244 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/__init__.py @@ -0,0 +1 @@ +from . 
import fat_late_fusion, analyze_results \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/analyze_results.py new file mode 100644 index 0000000000000000000000000000000000000000..6e58780dc111ceec257df0ee15b489adf174077e --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/analyze_results.py @@ -0,0 +1,21 @@ +from ...multiview import analyze_results + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +def execute(classifier, trainLabels, + testLabels, DATASET, + classificationKWARGS, classification_indices, + labels_dictionary, views, nbCores, times, + name, KFolds, + hyper_param_search, nIter, metrics, + views_indices, randomState, labels, classifierModule): + return analyze_results.execute(classifier, trainLabels, + testLabels, DATASET, + classificationKWARGS, classificationIndices, + labels_dictionary, views, nbCores, times, + name, KFolds, + hyper_param_search, nIter, metrics, + views_indices, randomState, labels, classifierModule) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/fat_late_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/fat_late_fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..b93e79a4fc5713eb9adc9e363be949eac89e35f6 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/fat_late_fusion.py @@ -0,0 +1,82 @@ +import numpy as np + +from ...utils.multiclass import isBiclass, genMulticlassMonoviewDecision + + +def genName(config): + return "fat_late_fusion" + + +def getBenchmark(benchmark, args=None): + benchmark["multiview"]["fat_late_fusion"] = ["take_everything"] + return benchmark + + +def getArgs(args, benchmark, views, views_indices, randomState, directory, resultsMonoview, classificationIndices): + argumentsList = [] + multiclass_preds = [monoviewResult.y_test_multiclass_pred for monoviewResult in resultsMonoview] + if isBiclass(multiclass_preds): + monoviewDecisions = np.array([monoviewResult.full_labels_pred for monoviewResult in resultsMonoview]) + else: + monoviewDecisions = np.array([genMulticlassMonoviewDecision(monoviewResult, classificationIndices) for monoviewResult in resultsMonoview]) + if len(args.FLF_weights) == 0: + weights = [1.0 for _ in range(monoviewDecisions.shape[0])] + else: + weights = args.FLF_weights + arguments = {"CL_type": "fat_late_fusion", + "views": views, + "NB_VIEW": len(resultsMonoview), + "views_indices": range(len(resultsMonoview)), + "NB_CLASS": len(args.CL_classes), + "LABELS_NAMES": args.CL_classes, + "FatLateFusionKWARGS": { + "monoviewDecisions": monoviewDecisions, + "weights": weights + } + } + argumentsList.append(arguments) + return argumentsList + + +def genParamsSets(classificationKWARGS, randomState, nIter=1): + """Used to generate parameters sets for the random hyper parameters optimization function""" + nbMonoviewClassifiers = len(classificationKWARGS["monoviewDecisions"]) + weights = [randomState.random_sample(nbMonoviewClassifiers) for _ in range(nIter)] + nomralizedWeights = [[weightVector/np.sum(weightVector)] for weightVector in weights] + return nomralizedWeights + + +class FatLateFusionClass: + + def __init__(self, 
randomState, NB_CORES=1, **kwargs):
+        if kwargs["weights"] == []:
+            # default to one uniform weight per monoview classifier
+            self.weights = [1.0/len(kwargs["monoviewDecisions"]) for _ in range(len(kwargs["monoviewDecisions"]))]
+        else:
+            self.weights = np.array(kwargs["weights"])/np.sum(np.array(kwargs["weights"]))
+        self.monoviewDecisions = kwargs["monoviewDecisions"]
+
+    def setParams(self, paramsSet):
+        self.weights = paramsSet[0]
+
+    def fit_hdf5(self, DATASET, labels, trainIndices=None, views_indices=None, metric=["f1_score", None]):
+        pass
+
+    def predict_hdf5(self, DATASET, usedIndices=None, views_indices=None):
+        if usedIndices is None:
+            usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"])
+        votes = np.zeros((len(usedIndices), DATASET.get("Metadata").attrs["nbClass"]), dtype=float)
+        for usedIndex, exampleIndex in enumerate(usedIndices):
+            for monoviewDecisionIndex, monoviewDecision in enumerate(self.monoviewDecisions):
+                votes[usedIndex, monoviewDecision[exampleIndex]] += self.weights[monoviewDecisionIndex]
+        predictedLabels = np.argmax(votes, axis=1)
+        return predictedLabels
+
+    def predict_probas_hdf5(self, DATASET, usedIndices=None):
+        pass
+
+    def getConfigString(self, classificationKWARGS):
+        return "weights : "+", ".join(map(str, list(self.weights)))
+
+    def getSpecificAnalysis(self, classificationKWARGS):
+        stringAnalysis = ''
+        return stringAnalysis
diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fce28aa3a7727ea6998ab5f0f2e2b61f31ada922
--- /dev/null
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/__init__.py
@@ -0,0 +1 @@
+from . 
import fat_scm_late_fusion, analyze_results \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/analyze_results.py new file mode 100644 index 0000000000000000000000000000000000000000..d5fcd8a976689cd4aeac84bdbc9a9a03c3b95224 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/analyze_results.py @@ -0,0 +1,21 @@ +from ...multiview import analyze_results + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +def execute(classifier, trainLabels, + testLabels, DATASET, + classificationKWARGS, classification_indices, + labels_dictionary, views, nbCores, times, + name, KFolds, + hyper_param_search, nIter, metrics, + views_indices, random_state, labels, classifierModule): + return analyze_results.execute(classifier, trainLabels, + testLabels, DATASET, + classificationKWARGS, classification_indices, + labels_dictionary, views, nbCores, times, + name, KFolds, + hyper_param_search, nIter, metrics, + views_indices, random_state, labels, classifierModule) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/fat_scm_late_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/fat_scm_late_fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..34d3e982fed33d263447ce8a6e745b426f9b4768 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/fat_scm_late_fusion.py @@ -0,0 +1,132 @@ +import numpy as np +from pyscm.scm import SetCoveringMachineClassifier as scm +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.externals.six import iteritems + + +from ...utils.multiclass import isBiclass, genMulticlassMonoviewDecision + +def genName(config): + return "fat_scm_late_fusion" + + +def getBenchmark(benchmark, args=None): + benchmark["multiview"]["fat_scm_late_fusion"] = ["take_everything"] + return benchmark + + + +def getArgs(args, benchmark, views, views_indices, random_state, directory, resultsMonoview, classificationIndices): + argumentsList = [] + multiclass_preds = [monoviewResult.y_test_multiclass_pred for monoviewResult in resultsMonoview] + if isBiclass(multiclass_preds): + monoviewDecisions = np.array([monoviewResult.full_labels_pred for monoviewResult in resultsMonoview]) + else: + monoviewDecisions = np.array([genMulticlassMonoviewDecision(monoviewResult, classification_indices) for monoviewResult in resultsMonoview]) + monoviewDecisions = np.transpose(monoviewDecisions) + #monoviewDecisions = np.transpose(np.array([monoviewResult[1][3] for monoviewResult in resultsMonoview])) + arguments = {"CL_type": "fat_scm_late_fusion", + "views": ["all"], + "NB_VIEW": len(resultsMonoview), + "views_indices": range(len(resultsMonoview)), + "NB_CLASS": len(args.CL_classes), + "LABELS_NAMES": args.CL_classes, + "FatSCMLateFusionKWARGS": { + "monoviewDecisions": monoviewDecisions, + "p": args.FSCMLF_p, + "max_attributes": args.FSCMLF_max_attributes, + "model":args.FSCMLF_model, + } + } + argumentsList.append(arguments) + return argumentsList + + +def genParamsSets(classificationKWARGS, random_state, nIter=1): + """Used to generate parameters sets for the random hyper parameters optimization 
function""" + paramsSets = [] + for _ in range(nIter): + max_attributes = random_state.randint(1, 20) + p = random_state.random_sample() + model = random_state.choice(["conjunction", "disjunction"]) + paramsSets.append([p, max_attributes, model]) + + return paramsSets + + +class FatSCMLateFusionClass: + + def __init__(self, random_state, NB_CORES=1, **kwargs): + if kwargs["p"]: + self.p = kwargs["p"] + else: + self.p = 0.5 + if kwargs["max_attributes"]: + self.max_attributes = kwargs["max_attributes"] + else: + self.max_attributes = 5 + if kwargs["model"]: + self.model = kwargs["model"] + else: + self.model = "conjunction" + self.monoviewDecisions = kwargs["monoviewDecisions"] + self.random_state = random_state + + def setParams(self, paramsSet): + self.p = paramsSet[0] + self.max_attributes = paramsSet[1] + self.model = paramsSet[2] + + def fit_hdf5(self, DATASET, labels, trainIndices=None, views_indices=None, metric=["f1_score", None]): + features = self.monoviewDecisions[trainIndices] + self.SCMClassifier = DecisionStumpSCMNew(p=self.p, max_rules=self.max_attributes, model_type=self.model, + random_state=self.random_state) + self.SCMClassifier.fit(features, labels[trainIndices].astype(int)) + + def predict_hdf5(self, DATASET, usedIndices=None, views_indices=None): + if usedIndices is None: + usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) + predictedLabels = self.SCMClassifier.predict(self.monoviewDecisions[usedIndices]) + return predictedLabels + + def predict_probas_hdf5(self, DATASET, usedIndices=None): + pass + + def getConfigString(self, classificationKWARGS): + return "p : "+str(self.p)+", max_aributes : "+str(self.max_attributes)+", model : "+self.model + + def getSpecificAnalysis(self, classificationKWARGS): + stringAnalysis = 'Rules used : ' + str(self.SCMClassifier.clf.model_) + return stringAnalysis + + +class DecisionStumpSCMNew(BaseEstimator, ClassifierMixin): + """docstring for SCM + A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like + CV, gridsearch, and so on ...""" + + def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): + super(DecisionStumpSCMNew, self).__init__() + self.model_type = model_type + self.p = p + self.max_rules = max_rules + self.random_state = random_state + + def fit(self, X, y): + self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) + self.clf.fit(X=X, y=y) + + def predict(self, X): + return self.clf.predict(X) + + def set_params(self, **params): + for key, value in iteritems(params): + if key == 'p': + self.p = value + if key == 'model_type': + self.model_type = value + if key == 'max_rules': + self.max_rules = value + + def get_stats(self): + return {"Binary_attributes": self.clf.model_.rules} diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py new file mode 100644 index 0000000000000000000000000000000000000000..508d2a94d6c78d86cea917e2ae9164fcec4a8d49 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py @@ -0,0 +1,41 @@ +from sklearn.tree import DecisionTreeClassifier + + +from multimodalboost.mumbo import MumboClassifier +from ..multiview.multiview_utils import BaseMultiviewClassifier, \ + get_examples_views_indices +from ..utils.hyper_parameter_search import CustomRandint + +classifier_class_name = "Mumbo" + +class 
Mumbo(BaseMultiviewClassifier, MumboClassifier): + + def __init__(self, base_estimator=None, + n_estimators=50, + random_state=None, + best_view_mode="edge"): + super().__init__(random_state) + super(BaseMultiviewClassifier, self).__init__(base_estimator=base_estimator, + n_estimators=n_estimators, + random_state=random_state, + best_view_mode=best_view_mode) + self.param_names = ["base_estimator", "n_estimators", "random_state", "best_view_mode"] + self.distribs = [[DecisionTreeClassifier(max_depth=1)], + CustomRandint(5,200), [random_state], ["edge", "error"]] + + def fit(self, X, y, train_indices=None, view_indices=None): + train_indices, view_indices = get_examples_views_indices(X, + train_indices, + view_indices) + numpy_X, view_limits = X.to_numpy_array(example_indices=train_indices, + view_indices=view_indices) + return super(Mumbo, self).fit(numpy_X, y[train_indices], + view_limits) + + def predict(self, X, example_indices=None, view_indices=None): + example_indices, view_indices = get_examples_views_indices(X, + example_indices, + view_indices) + numpy_X, view_limits = X.to_numpy_array(example_indices=example_indices, + view_indices=view_indices) + return super(Mumbo, self).predict(numpy_X) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d6773304b2c117c67cdf8399b4840a4e54f76f03 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/__init__.py @@ -0,0 +1 @@ +from . import analyze_results, pseudo_cq_fusion \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/analyze_results.py new file mode 100644 index 0000000000000000000000000000000000000000..3823e68753d996524dd83c3475fb0fac8ee435e8 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/analyze_results.py @@ -0,0 +1,21 @@ +from ...multiview import analyze_results + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +def execute(classifier, trainLabels, + testLabels, DATASET, + classificationKWARGS, classificationIndices, + labels_dictionary, views, nbCores, times, + name, KFolds, + hyper_param_search, nIter, metrics, + views_indices, randomState, labels, classifierModule): + return analyze_results.execute(classifier, trainLabels, + testLabels, DATASET, + classificationKWARGS, classificationIndices, + labels_dictionary, views, nbCores, times, + name, KFolds, + hyper_param_search, nIter, metrics, + views_indices, randomState, labels, classifierModule) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/pseudo_cq_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/pseudo_cq_fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..bfd219d329c368594f6eab0a466c7eb5a4d3d358 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/pseudo_cq_fusion.py @@ -0,0 +1,41 @@ +from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions import \ + diversity_utils +from 
multiview_platform.mono_multi_view_classifiers.multiview_classifiers.difficulty_fusion_old import difficulty +from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.double_fault_fusion_old import doubleFault + + +def genName(config): + return "pseudo_cq_fusion" + + +def getBenchmark(benchmark, args=None): + benchmark["multiview"]["pseudo_cq_fusion"] = ["take_everything"] + return benchmark + + +def pseudoCQ(difficulty, doubleFlaut): + return difficulty/float(doubleFlaut) + + +def getArgs(args, benchmark, views, views_indices, randomState, directory, resultsMonoview, classificationIndices): + return diversity_utils.getArgs(args, benchmark, views, + views_indices, randomState, directory, + resultsMonoview, classificationIndices, + [doubleFault, difficulty], "pseudo_cq_fusion") + + +def genParamsSets(classificationKWARGS, randomState, nIter=1): + return diversity_utils.genParamsSets(classificationKWARGS, randomState, nIter=nIter) + + + +class PseudoCQFusionClass(diversity_utils.DiversityFusionClass): + + def __init__(self, randomState, NB_CORES=1, **kwargs): + diversity_utils.DiversityFusionClass.__init__(self, randomState, NB_CORES=1, **kwargs) + + def getSpecificAnalysis(self, classificationKWARGS): + + stringAnalysis = "Classifiers used for each view : " + ', '.join(self.classifiers_names) +\ + ', with a pseudo CQ of ' + str(self.div_measure) + return stringAnalysis \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/scm_late_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/scm_late_fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..a8ec6bb2063760101b5be106141f9245843527fc --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/scm_late_fusion.py @@ -0,0 +1,125 @@ +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.externals.six import iteritems +import itertools +from pyscm.scm import SetCoveringMachineClassifier as scm + + +from ..multiview_classifiers.additions.late_fusion_utils import \ + LateFusionClassifier +from ..multiview.multiview_utils import get_examples_views_indices +from ..monoview.monoview_utils import CustomRandint, CustomUniform + +classifier_class_name = "SCMLateFusionClassifier" + + +class DecisionStumpSCMNew(BaseEstimator, ClassifierMixin): + """docstring for SCM + A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like + CV, gridsearch, and so on ...""" + + def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): + super(DecisionStumpSCMNew, self).__init__() + self.model_type = model_type + self.p = p + self.max_rules = max_rules + self.random_state = random_state + + def fit(self, X, y): + self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) + self.clf.fit(X=X, y=y) + + def predict(self, X): + return self.clf.predict(X) + + def set_params(self, **params): + for key, value in iteritems(params): + if key == 'p': + self.p = value + if key == 'model_type': + self.model_type = value + if key == 'max_rules': + self.max_rules = value + + def get_stats(self): + return {"Binary_attributes": self.clf.model_.rules} + + +class SCMLateFusionClassifier(LateFusionClassifier): + def __init__(self, random_state=None, classifier_names=None, + classifier_configs=None, nb_cores=1, + p=1, max_rules=5, order=1, model_type="conjunction", 
weights=None): + self.need_probas=False + super(SCMLateFusionClassifier, self).__init__(random_state=random_state, + classifier_names=classifier_names, + classifier_configs=classifier_configs, + nb_cores=nb_cores + ) + self.scm_classifier = None + self.p = p + self.max_rules = max_rules + self.order = order + self.model_type = model_type + self.param_names+=["model_type", "max_rules", "p", "order"] + self.distribs+=[["conjunction", "disjunction"], + CustomRandint(low=1, high=15), + CustomUniform(loc=0, state=1), [1,2,3]] + + def fit(self, X, y, train_indices=None, view_indices=None): + super(SCMLateFusionClassifier, self).fit(X, y, + train_indices=train_indices, + view_indices=view_indices) + self.scm_fusion_fit(X, y, train_indices=train_indices, view_indices=view_indices) + return self + + def predict(self, X, example_indices=None, view_indices=None): + example_indices, view_indices = get_examples_views_indices(X, + example_indices, + view_indices) + monoview_decisions = np.zeros((len(example_indices), X.nb_view), + dtype=int) + for index, view_index in enumerate(view_indices): + monoview_decision = self.monoview_estimators[index].predict( + X.get_v(view_index, example_indices)) + monoview_decisions[:, index] = monoview_decision + features = self.generate_interactions(monoview_decisions) + predicted_labels = self.scm_classifier.predict(features) + return predicted_labels + + def scm_fusion_fit(self, X, y, train_indices=None, view_indices=None): + train_indices, view_indices = get_examples_views_indices(X, train_indices, view_indices) + + self.scm_classifier = DecisionStumpSCMNew(p=self.p, max_rules=self.max_rules, model_type=self.model_type, + random_state=self.random_state) + monoview_decisions = np.zeros((len(train_indices), X.nb_view), dtype=int) + for index, view_index in enumerate(view_indices): + monoview_decisions[:, index] = self.monoview_estimators[index].predict( + X.get_v(view_index, train_indices)) + features = self.generate_interactions(monoview_decisions) + features = np.array([np.array([feat for feat in feature]) + for feature in features]) + self.scm_classifier.fit(features, y[train_indices].astype(int)) + + def generate_interactions(self, monoview_decisions): + if self.order is None: + self.order = monoview_decisions.shape[1] + if self.order == 1: + return monoview_decisions + else: + genrated_intercations = [monoview_decisions[:, i] + for i in range(monoview_decisions.shape[1])] + for order_index in range(self.order - 1): + combins = itertools.combinations(range(monoview_decisions.shape[1]), + order_index + 2) + for combin in combins: + generated_decision = monoview_decisions[:, combin[0]] + for index in range(len(combin) - 1): + if self.model_type == "disjunction": + generated_decision = np.logical_and(generated_decision, + monoview_decisions[:, combin[index + 1]]) + else: + generated_decision = np.logical_or(generated_decision, + monoview_decisions[:, combin[index + 1]]) + genrated_intercations.append(generated_decision) + return np.transpose(np.array(genrated_intercations)) + diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py index 159623e4dea06e3014fa96a13d2b588ca828c981..e63ebbb63b35e5d69baa113f68889ee9ca389ce4 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py +++ 
b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py @@ -84,7 +84,7 @@ class WeightedLinearEarlyFusion(BaseMultiviewClassifier, BaseFusionClassifier): example_indices, self.view_indices = get_examples_views_indices(dataset, example_indices, view_indices) - if self.view_weights is None: + if self.view_weights is None or self.view_weights=="None": self.view_weights = np.ones(len(self.view_indices), dtype=float) else: self.view_weights = np.array(self.view_weights) diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis.py index f93d72a633ba4dd9e074a18438f95972cd598600..a94f8e1ed9a8fb838a5ea897eb7ba4f540abb73b 100644 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis.py +++ b/multiview_platform/mono_multi_view_classifiers/result_analysis.py @@ -61,200 +61,6 @@ def plot_results_noise(directory, noise_results, metric_to_plot, name, width=0.1 df.to_csv(directory+name+"_noise_analysis.csv") -def plot_metric_scores(train_scores, test_scores, names, nb_results, metric_name, - file_name, - tag="", train_STDs=None, test_STDs=None): - r"""Used to plot and save the score barplot for a specific metric. - - Parameters - ---------- - train_scores : list or np.array of floats - The scores of each classifier on the training set. - test_scores : list or np.array of floats - The scores of each classifier on the testing set. - names : list or np.array of strs - The names of all the classifiers. - nb_results: int - The number of classifiers to plot. - metric_name : str - The plotted metric's name - file_name : str - The name of the file where the figure will be saved. - tag : str - Some text to personalize the title, must start with a whitespace. - train_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the training set. - test_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the testing set. 
- - Returns - ------- - """ - - figKW, barWidth = get_fig_size(nb_results) - - names, train_scores, test_scores, train_STDs, test_STDs = sort_by_test_score( - train_scores, test_scores, names, - train_STDs, test_STDs) - - f, ax = plt.subplots(nrows=1, ncols=1, **figKW) - ax.set_title(metric_name + "\n" + tag + " scores for each classifier") - - rects = ax.bar(range(nb_results), test_scores, barWidth, color="0.1", - yerr=test_STDs) - rect2 = ax.bar(np.arange(nb_results) + barWidth, train_scores, barWidth, - color="0.8", yerr=train_STDs) - autolabel(rects, ax, set=1, std=test_STDs) - autolabel(rect2, ax, set=2, std=train_STDs) - ax.legend((rects[0], rect2[0]), ('Test', 'Train')) - ax.set_ylim(-0.1, 1.1) - ax.set_xticks(np.arange(nb_results) + barWidth/2) - ax.set_xticklabels(names, rotation="vertical") - - try: - plt.tight_layout() - except: - pass - f.savefig(file_name + '.png', transparent=True) - plt.close() - import pandas as pd - if train_STDs is None: - dataframe = pd.DataFrame(np.transpose(np.concatenate(( - train_scores.reshape((train_scores.shape[0], 1)), - test_scores.reshape((train_scores.shape[0], 1))), axis=1)), - columns=names) - else: - dataframe = pd.DataFrame(np.transpose(np.concatenate(( - train_scores.reshape((train_scores.shape[0], 1)), - train_STDs.reshape((train_scores.shape[0], 1)), - test_scores.reshape((train_scores.shape[0], 1)), - test_STDs.reshape((train_scores.shape[0], 1))), axis=1)), - columns=names) - dataframe.to_csv(file_name + ".csv") - - -def plot_2d(data, classifiers_names, nbClassifiers, nbExamples, - fileName, minSize=10, - width_denominator=2.0, height_denominator=20.0, stats_iter=1, - use_plotly=True, example_ids=None): - r"""Used to generate a 2D plot of the errors. - - Parameters - ---------- - data : np.array of shape `(nbClassifiers, nbExamples)` - A matrix with zeros where the classifier failed to classifiy the example, ones where it classified it well - and -100 if the example was not classified. - classifiers_names : list of str - The names of the classifiers. - nbClassifiers : int - The number of classifiers. - nbExamples : int - The number of examples. - nbCopies : int - The number of times the data is copied (classifier wise) in order for the figure to be more readable - fileName : str - The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) - minSize : int, optinal, default: 10 - The minimum width and height of the figure. - width_denominator : float, optional, default: 1.0 - To obtain the image width, the number of classifiers will be divided by this number. - height_denominator : float, optional, default: 1.0 - To obtain the image width, the number of examples will be divided by this number. - stats_iter : int, optional, default: 1 - The number of statistical iterations realized. - - Returns - ------- - """ - fig, ax = plt.subplots(nrows=1, ncols=1,) - cmap, norm = iterCmap(stats_iter) - cax = plt.imshow(data, cmap=cmap, norm=norm, - aspect='auto') - plt.title('Errors depending on the classifier') - ticks = np.arange(0, nbClassifiers, 1) - labels = classifiers_names - plt.xticks(ticks, labels, rotation="vertical") - cbar = fig.colorbar(cax, ticks=[-100 * stats_iter / 2, 0, stats_iter]) - cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right']) - - fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight", transparent=True) - plt.close() - ### The following part is used to generate an interactive graph. 
- if use_plotly: - import plotly - hover_text = [["Failed "+ str(stats_iter-data[i,j])+" time(s)" - for j in range(data.shape[1])] - for i in range(data.shape[0]) ] - fig = plotly.graph_objs.Figure(data=plotly.graph_objs.Heatmap( - x=list(classifiers_names), - y=[_ for _ in example_ids], - z=data, - text=hover_text, - hoverinfo=["y", "x", "text"], - colorscale="Greys", - colorbar=dict(tickvals=[0, stats_iter], - ticktext=["Always Wrong", "Always Right"]), - reversescale=True)) - fig.update_layout( - xaxis={"showgrid": False, "showticklabels": False, "ticks": ''}, - yaxis={"showgrid": False, "showticklabels": False, "ticks": ''}) - plotly.offline.plot(fig, filename=fileName + "error_analysis_2D.html", auto_open=False) - del fig - - -def plot_errors_bar(error_on_examples, nbClassifiers, nbExamples, fileName): - r"""Used to generate a barplot of the muber of classifiers that failed to classify each examples - - Parameters - ---------- - error_on_examples : np.array of shape `(nbExamples,)` - An array counting how many classifiers failed to classifiy each examples. - classifiers_names : list of str - The names of the classifiers. - nbClassifiers : int - The number of classifiers. - nbExamples : int - The number of examples. - fileName : str - The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) - - Returns - ------- - """ - fig, ax = plt.subplots() - x = np.arange(nbExamples) - plt.bar(x, error_on_examples) - plt.ylim([0, nbClassifiers]) - plt.title("Number of classifiers that failed to classify each example") - fig.savefig(fileName + "error_analysis_bar.png", transparent=True) - plt.close() - - -def iterCmap(statsIter): - r"""Used to generate a colormap that will have a tick for each iteration : the whiter the better. - - Parameters - ---------- - statsIter : int - The number of statistical iterations. - - Returns - ------- - cmap : matplotlib.colors.ListedColorMap object - The colormap. - norm : matplotlib.colors.BoundaryNorm object - The bounds for the colormap. - """ - cmapList = ["red", "0.0"] + [str(float((i + 1)) / statsIter) for i in - range(statsIter)] - cmap = mpl.colors.ListedColormap(cmapList) - bounds = [-100 * statsIter - 0.5, -0.5] - for i in range(statsIter): - bounds.append(i + 0.5) - bounds.append(statsIter + 0.5) - norm = mpl.colors.BoundaryNorm(bounds, cmap.N) - return cmap, norm - def autolabel(rects, ax, set=1, std=None): r"""Used to print the score below the bars. @@ -291,34 +97,6 @@ def autolabel(rects, ax, set=1, std=None): "%.2f" % height, weight=weight, ha='center', va='bottom', size="small") -def get_fig_size(nb_results, min_size=15, multiplier=1.0, bar_width=0.35): - r"""Used to get the image size to save the figure and the bar width, depending on the number of scores to plot. - - Parameters - ---------- - nb_results : int - The number of couple of bar to plot. - min_size : int - The minimum size of the image, if there are few classifiers to plot. - multiplier : float - The ratio between the image size and the number of classifiers. - bar_width : float - The width of the bars in the figure. Mainly here to centralize bar_width. - - Returns - ------- - fig_kwargs : dict of arguments - The argument restraining the size of the figure, usable directly in the `subplots` function of - `matplotlib.pyplot`. - bar_width : float - The width of the bars in the figure. Mainly here to centralize bar_width. 
- """ - size = nb_results * multiplier - if size < min_size: - size = min_size - fig_kwargs = {"figsize": (size, size / 3)} - return fig_kwargs, bar_width - def get_metrics_scores_biclass(metrics, results): r"""Used to extract metrics scores in case of biclass classification @@ -328,7 +106,7 @@ def get_metrics_scores_biclass(metrics, results): metrics : list of lists The metrics names with configuration metrics[i][0] = name of metric i results : list of MonoviewResult and MultiviewResults objects - A list containing all the results for all the monoview experimentations. + A list containing all the resluts for all the monoview experimentations. Returns ------- @@ -339,28 +117,25 @@ def get_metrics_scores_biclass(metrics, results): -`metricScores[metric_name]["train_scores"]` is a list of all the available classifiers scores on the train set, -`metricScores[metric_name]["test_scores"]` is a list of all the available classifiers scores on the test set. """ - classifier_names=[] - classifier_names = [classifierResult.get_classifier_name() - for classifierResult in results - if classifierResult.get_classifier_name() - not in classifier_names ] - metrics_scores = dict((metric[0], pd.DataFrame(data=np.zeros((2, - len(classifier_names))), - index=["train", "test"], - columns=classifier_names)) - for metric in metrics) + metrics_scores = {} for metric in metrics: + classifiers_names = [] + train_scores = [] + test_scores = [] + for classifierResult in results: - metrics_scores[metric[0]].loc["train", classifierResult.get_classifier_name()] = classifierResult.metrics_scores[metric[0]][0] - metrics_scores[metric[0]].loc[ - "test", classifierResult.get_classifier_name()] = \ - classifierResult.metrics_scores[metric[0]][1] + train_scores.append(classifierResult.metrics_scores[metric[0]][0]) + test_scores.append(classifierResult.metrics_scores[metric[0]][1]) + classifiers_names.append(classifierResult.get_classifier_name()) + metrics_scores[metric[0]] = {"classifiers_names": classifiers_names, + "train_scores": train_scores, + "test_scores": test_scores} return metrics_scores -def get_example_errors_biclass(groud_truth, results): +def getExampleErrorsBiclass(groud_truth, results): r"""Used to get for each classifier and each example whether the classifier has misclassified the example or not. Parameters @@ -379,15 +154,46 @@ def get_example_errors_biclass(groud_truth, results): """ example_errors = {} - for classifier_result in results: - error_on_examples = np.equal(classifier_result.full_labels_pred, + for classifierResult in results: + error_on_examples = np.equal(classifierResult.full_labels_pred, groud_truth).astype(int) - unseen_examples = np.where(groud_truth == -100)[0] - error_on_examples[unseen_examples] = -100 - example_errors[classifier_result.get_classifier_name()] = error_on_examples + unseenExamples = np.where(groud_truth == -100)[0] + error_on_examples[unseenExamples] = -100 + example_errors[classifierResult.get_classifier_name()] = { + "error_on_examples": error_on_examples} + return example_errors +def get_fig_size(nb_results, min_size=15, multiplier=1.0, bar_width=0.35): + r"""Used to get the image size to save the figure and the bar width, depending on the number of scores to plot. + + Parameters + ---------- + nb_results : int + The number of couple of bar to plot. + min_size : int + The minimum size of the image, if there are few classifiers to plot. + multiplier : float + The ratio between the image size and the number of classifiers. 
+ bar_width : float + The width of the bars in the figure. Mainly here to centralize bar_width. + + Returns + ------- + fig_kwargs : dict of arguments + The argument restraining the size of the figure, usable directly in the `subplots` function of + `matplotlib.pyplot`. + bar_width : float + The width of the bars in the figure. Mainly here to centralize bar_width. + """ + size = nb_results * multiplier + if size < min_size: + size = min_size + fig_kwargs = {"figsize": (size, size / 3)} + return fig_kwargs, bar_width + + def sort_by_test_score(train_scores, test_scores, names, train_STDs=None, test_STDs=None): r"""Used to sort the results (names and both scores) in descending test score order. @@ -433,7 +239,77 @@ def sort_by_test_score(train_scores, test_scores, names, train_STDs=None, return sorted_names, sorted_train_scores, sorted_test_scores, sorted_train_STDs, sorted_test_STDs +def plotMetricScores(train_scores, test_scores, names, nb_results, metric_name, + file_name, + tag="", train_STDs=None, test_STDs=None): + r"""Used to plot and save the score barplot for a specific metric. + + Parameters + ---------- + train_scores : list or np.array of floats + The scores of each classifier on the training set. + test_scores : list or np.array of floats + The scores of each classifier on the testing set. + names : list or np.array of strs + The names of all the classifiers. + nb_results: int + The number of classifiers to plot. + metric_name : str + The plotted metric's name + file_name : str + The name of the file where the figure will be saved. + tag : str + Some text to personalize the title, must start with a whitespace. + train_STDs : np.array of floats or None + The array containing the standard deviations for the averaged scores on the training set. + test_STDs : np.array of floats or None + The array containing the standard deviations for the averaged scores on the testing set. 
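+
+    The figure is saved to `file_name`.png and the scores are saved to
+    `file_name`.csv (including the standard deviations when provided).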
+ + Returns + ------- + """ + + figKW, barWidth = get_fig_size(nb_results) + + names, train_scores, test_scores, train_STDs, test_STDs = sort_by_test_score( + train_scores, test_scores, names, + train_STDs, test_STDs) + + f, ax = plt.subplots(nrows=1, ncols=1, **figKW) + ax.set_title(metric_name + "\n" + tag + " scores for each classifier") + + rects = ax.bar(range(nb_results), test_scores, barWidth, color="0.1", + yerr=test_STDs) + rect2 = ax.bar(np.arange(nb_results) + barWidth, train_scores, barWidth, + color="0.8", yerr=train_STDs) + autolabel(rects, ax, set=1, std=test_STDs) + autolabel(rect2, ax, set=2, std=train_STDs) + print("nb_results", nb_results) + ax.legend((rects[0], rect2[0]), ('Test', 'Train')) + ax.set_ylim(-0.1, 1.1) + ax.set_xticks(np.arange(nb_results) + barWidth) + ax.set_xticklabels(names, rotation="vertical") + try: + plt.tight_layout() + except: + pass + f.savefig(file_name + '.png', transparent=True) + plt.close() + import pandas as pd + if train_STDs is None: + dataframe = pd.DataFrame(np.transpose(np.concatenate(( + train_scores.reshape((train_scores.shape[0], 1)), + test_scores.reshape((train_scores.shape[0], 1))), axis=1)), + columns=names) + else: + dataframe = pd.DataFrame(np.transpose(np.concatenate(( + train_scores.reshape((train_scores.shape[0], 1)), + train_STDs.reshape((train_scores.shape[0], 1)), + test_scores.reshape((train_scores.shape[0], 1)), + test_STDs.reshape((train_scores.shape[0], 1))), axis=1)), + columns=names) + dataframe.to_csv(file_name + ".csv") def publishMetricsGraphs(metrics_scores, directory, database_name, labels_names): @@ -456,40 +332,134 @@ def publishMetricsGraphs(metrics_scores, directory, database_name, labels_names) results """ results=[] - for metric_name, metric_dataframe in metrics_scores.items(): + for metric_name, metric_scores in metrics_scores.items(): logging.debug( "Start:\t Biclass score graph generation for " + metric_name) - train_scores, test_scores, classifier_names, \ - file_name, nb_results,results = init_plot(results, metric_name, - metric_dataframe, directory, - database_name, labels_names) - - plot_metric_scores(train_scores, test_scores, classifier_names, - nb_results, metric_name, file_name, - tag=" "+" vs ".join(labels_names)) - logging.debug("Done:\t Biclass score graph generation for "+metric_name) + + nb_results = len(metric_scores["test_scores"]) + file_name = directory + time.strftime( + "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join( + labels_names) + "-" + metric_name + + plotMetricScores(np.array(metric_scores["train_scores"]), + np.array(metric_scores["test_scores"]), + np.array(metric_scores["classifiers_names"]), nb_results, + metric_name, file_name, + tag=" " + " vs ".join(labels_names)) + + logging.debug( + "Done:\t Biclass score graph generation for " + metric_name) + results+=[[classifiers_name, metric_name, testMean, testSTD] + for classifiers_name, testMean, testSTD in zip(np.array(metric_scores["classifiers_names"]), + np.array(metric_scores["test_scores"]), + np.zeros(len(np.array(metric_scores["test_scores"]))))] return results +def iterCmap(statsIter): + r"""Used to generate a colormap that will have a tick for each iteration : the whiter the better. -def init_plot(results, metric_name, metric_dataframe, - directory, database_name, labels_names): + Parameters + ---------- + statsIter : int + The number of statistical iterations. + + Returns + ------- + cmap : matplotlib.colors.ListedColorMap object + The colormap. 
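+        It contains `statsIter + 2` colors: red for unseen examples, black for
+        examples misclassified at every iteration, then shades of grey getting
+        lighter as the number of iterations where the example is well
+        classified grows.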
+ norm : matplotlib.colors.BoundaryNorm object + The bounds for the colormap. + """ + cmapList = ["red", "0.0"] + [str(float((i + 1)) / statsIter) for i in + range(statsIter)] + cmap = mpl.colors.ListedColormap(cmapList) + bounds = [-100 * statsIter - 0.5, -0.5] + for i in range(statsIter): + bounds.append(i + 0.5) + bounds.append(statsIter + 0.5) + norm = mpl.colors.BoundaryNorm(bounds, cmap.N) + return cmap, norm - train = np.array(metric_dataframe.loc["train"]) - test = np.array(metric_dataframe.loc["test"]) - classifier_names = np.array(metric_dataframe.columns) - nb_results = metric_dataframe.shape[1] +def publish2Dplot(data, classifiers_names, nbClassifiers, nbExamples, nbCopies, + fileName, minSize=10, + width_denominator=2.0, height_denominator=20.0, stats_iter=1): + r"""Used to generate a 2D plot of the errors. + + Parameters + ---------- + data : np.array of shape `(nbClassifiers, nbExamples)` + A matrix with zeros where the classifier failed to classifiy the example, ones where it classified it well + and -100 if the example was not classified. + classifiers_names : list of str + The names of the classifiers. + nbClassifiers : int + The number of classifiers. + nbExamples : int + The number of examples. + nbCopies : int + The number of times the data is copied (classifier wise) in order for the figure to be more readable + fileName : str + The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) + minSize : int, optinal, default: 10 + The minimum width and height of the figure. + width_denominator : float, optional, default: 1.0 + To obtain the image width, the number of classifiers will be divided by this number. + height_denominator : float, optional, default: 1.0 + To obtain the image width, the number of examples will be divided by this number. + stats_iter : int, optional, default: 1 + The number of statistical iterations realized. - file_name = directory + time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join( - labels_names) + "-" + metric_name + Returns + ------- + """ + figWidth = max(nbClassifiers / width_denominator, minSize) + figHeight = max(nbExamples / height_denominator, minSize) + figKW = {"figsize": (figWidth, figHeight)} + fig, ax = plt.subplots(nrows=1, ncols=1, **figKW) + cmap, norm = iterCmap(stats_iter) + cax = plt.imshow(data, interpolation='none', cmap=cmap, norm=norm, + aspect='auto') + plt.title('Errors depending on the classifier') + ticks = np.arange(nbCopies / 2 - 0.5, nbClassifiers * nbCopies, nbCopies) + labels = classifiers_names + plt.xticks(ticks, labels, rotation="vertical") + cbar = fig.colorbar(cax, ticks=[-100 * stats_iter / 2, 0, stats_iter]) + cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right']) + fig.tight_layout() + fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight", transparent=True) + plt.close() - results += [[classifiers_name, metric_name, testMean, testSTD] - for classifiers_name, testMean, testSTD in - zip(classifier_names, test, np.zeros(len(test)))] - return train, test, classifier_names, file_name, nb_results, results -def gen_error_data(example_errors): +def publishErrorsBarPlot(error_on_examples, nbClassifiers, nbExamples, fileName): + r"""Used to generate a barplot of the muber of classifiers that failed to classify each examples + + Parameters + ---------- + error_on_examples : np.array of shape `(nbExamples,)` + An array counting how many classifiers failed to classifiy each examples. 
+ classifiers_names : list of str + The names of the classifiers. + nbClassifiers : int + The number of classifiers. + nbExamples : int + The number of examples. + fileName : str + The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) + + Returns + ------- + """ + fig, ax = plt.subplots() + x = np.arange(nbExamples) + plt.bar(x, error_on_examples) + plt.ylim([0, nbClassifiers]) + plt.title("Number of classifiers that failed to classify each example") + fig.savefig(fileName + "error_analysis_bar.png", transparent=True) + plt.close() + + +def gen_error_data(example_errors, base_file_name, nbCopies=2): r"""Used to format the error data in order to plot it efficiently. The data is saves in a `.csv` file. Parameters @@ -523,38 +493,42 @@ def gen_error_data(example_errors): error_on_examples : np.array of shape `(nbExamples,)` An array counting how many classifiers failed to classifiy each examples. """ - nb_classifiers = len(example_errors) - nb_examples = len(list(example_errors.values())[0]) - classifiers_names = list(example_errors.keys()) + nbClassifiers = len(example_errors) + nbExamples = len(list(example_errors.values())[0]["error_on_examples"]) + classifiers_names = example_errors.keys() - data_2d = np.zeros((nb_examples, nb_classifiers)) + data = np.zeros((nbExamples, nbClassifiers * nbCopies)) + temp_data = np.zeros((nbExamples, nbClassifiers)) for classifierIndex, (classifier_name, error_on_examples) in enumerate( example_errors.items()): - data_2d[:, classifierIndex] = error_on_examples - error_on_examples = -1 * np.sum(data_2d, axis=1) / nb_classifiers + for iter_index in range(nbCopies): + data[:, classifierIndex * nbCopies + iter_index] = error_on_examples[ + "error_on_examples"] + temp_data[:, classifierIndex] = error_on_examples["error_on_examples"] + error_on_examples = -1 * np.sum(data, axis=1) / nbCopies + nbClassifiers + + np.savetxt(base_file_name + "2D_plot_data.csv", data, delimiter=",") + np.savetxt(base_file_name + "bar_plot_data.csv", temp_data, delimiter=",") - return nb_classifiers, nb_examples, classifiers_names, data_2d, error_on_examples + return nbClassifiers, nbExamples, nbCopies, classifiers_names, data, error_on_examples -def publishExampleErrors(example_errors, directory, databaseName, labels_names, example_ids): +def publishExampleErrors(example_errors, directory, databaseName, labels_names): logging.debug("Start:\t Biclass Label analysis figure generation") base_file_name = directory + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + "_vs_".join( labels_names) + "-" - nb_classifiers, nb_examples, classifiers_names, \ - data_2d, error_on_examples = gen_error_data(example_errors) - - np.savetxt(base_file_name + "2D_plot_data.csv", data_2d, delimiter=",") - np.savetxt(base_file_name + "bar_plot_data.csv", error_on_examples, - delimiter=",") + nbClassifiers, nbExamples, nCopies, classifiers_names, data, error_on_examples = gen_error_data( + example_errors, + base_file_name) - plot_2d(data_2d, classifiers_names, nb_classifiers, nb_examples, - base_file_name, example_ids=example_ids) + publish2Dplot(data, classifiers_names, nbClassifiers, nbExamples, nCopies, + base_file_name) - plot_errors_bar(error_on_examples, nb_classifiers, nb_examples, - base_file_name) + publishErrorsBarPlot(error_on_examples, nbClassifiers, nbExamples, + base_file_name) logging.debug("Done:\t Biclass Label analysis figures generation") @@ -580,7 +554,7 @@ def get_arguments(benchmark_argument_dictionaries, flag): return 
benchmarkArgumentDictionary -def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metrics, example_ids): +def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metrics): r"""Used to extract and format the results of the different biclass experimentations performed. Parameters @@ -607,7 +581,7 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric label combination, regrouping the scores for each metrics and the information useful to plot errors on examples. """ logging.debug("Srart:\t Analzing all biclass resuls") - biclass_results = {} + biclass_results = [{} for _ in range(stats_iter)] for flag, result in results: iteridex, [classifierPositive, classifierNegative] = flag @@ -615,7 +589,7 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric arguments = get_arguments(benchmark_argument_dictionaries, flag) metrics_scores = get_metrics_scores_biclass(metrics, result) - example_errors = get_example_errors_biclass(arguments["labels"], result) + example_errors = getExampleErrorsBiclass(arguments["labels"], result) directory = arguments["directory"] @@ -626,15 +600,12 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric results = publishMetricsGraphs(metrics_scores, directory, database_name, labels_names) publishExampleErrors(example_errors, directory, database_name, - labels_names, example_ids) - if not str(classifierPositive) + str(classifierNegative) in biclass_results: - biclass_results[str(classifierPositive) + str(classifierNegative)] = {} - biclass_results[str(classifierPositive) + str(classifierNegative)][ - "metrics_scores"] = [i for i in range(stats_iter)] - biclass_results[str(classifierPositive) + str(classifierNegative)][ - "example_errors"] = [i for i in range(stats_iter)] - biclass_results[str(classifierPositive) + str(classifierNegative)]["metrics_scores"][iteridex] = metrics_scores - biclass_results[str(classifierPositive) + str(classifierNegative)]["example_errors"][iteridex] = example_errors + labels_names) + + biclass_results[iteridex][ + str(classifierPositive) + str(classifierNegative)] = { + "metrics_scores": metrics_scores, + "example_errors": example_errors} logging.debug("Done:\t Analzing all biclass resuls") return results, biclass_results @@ -712,10 +683,10 @@ def publishMulticlassScores(multiclass_results, metrics, stats_iter, direcories, nbResults = classifiers_names.shape[0] fileName = directory + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + metric[ - 0] + 0] + ".png" - plot_metric_scores(train_scores, validationScores, classifiers_names, - nbResults, metric[0], fileName, tag=" multiclass") + plotMetricScores(train_scores, validationScores, classifiers_names, + nbResults, metric[0], fileName, tag=" multiclass") logging.debug( "Done:\t Multiclass score graph generation for " + metric[0]) @@ -724,7 +695,7 @@ def publishMulticlassScores(multiclass_results, metrics, stats_iter, direcories, def publishMulticlassExmapleErrors(multiclass_results, directories, - databaseName, example_ids): + databaseName): for iter_index, multiclassResult in enumerate(multiclass_results): directory = directories[iter_index] logging.debug("Start:\t Multiclass Label analysis figure generation") @@ -736,18 +707,18 @@ def publishMulticlassExmapleErrors(multiclass_results, directories, multiclassResult, base_file_name) - plot_2d(data, classifiers_names, nbClassifiers, nbExamples, - nCopies, base_file_name, example_ids=example_ids) + 
publish2Dplot(data, classifiers_names, nbClassifiers, nbExamples, + nCopies, base_file_name) - plot_errors_bar(error_on_examples, nbClassifiers, nbExamples, - base_file_name) + publishErrorsBarPlot(error_on_examples, nbClassifiers, nbExamples, + base_file_name) logging.debug("Done:\t Multiclass Label analysis figure generation") def analyzeMulticlass(results, stats_iter, benchmark_argument_dictionaries, nb_examples, nb_labels, multiclass_labels, - metrics, classification_indices, directories, example_ids): + metrics, classification_indices, directories): """Used to transform one versus one results in multiclass results and to publish it""" multiclass_results = [{} for _ in range(stats_iter)] @@ -799,7 +770,7 @@ def analyzeMulticlass(results, stats_iter, benchmark_argument_dictionaries, benchmark_argument_dictionaries[0]["args"]["Base"]["name"]) publishMulticlassExmapleErrors(multiclass_results, directories, benchmark_argument_dictionaries[0][ - "args"].name, example_ids) + "args"].name) return results, multiclass_results @@ -808,79 +779,69 @@ def numpy_mean_and_std(scores_array): def publish_iter_biclass_metrics_scores(iter_results, directory, labels_dictionary, - data_base_name, stats_iter, + classifiers_dict, data_base_name, stats_iter, min_size=10): results=[] - for labels_combination, iter_result in iter_results.items(): - current_directory = directory + labels_dictionary[ - int(labels_combination[0])] + "-vs-" + labels_dictionary[ - int(labels_combination[1])] + "/" - if not os.path.exists(os.path.dirname(current_directory + "a")): + for labelsCombination, iterResult in iter_results.items(): + currentDirectory = directory + labels_dictionary[ + int(labelsCombination[0])] + "-vs-" + labels_dictionary[ + int(labelsCombination[1])] + "/" + if not os.path.exists(os.path.dirname(currentDirectory + "a")): try: - os.makedirs(os.path.dirname(current_directory + "a")) + os.makedirs(os.path.dirname(currentDirectory + "a")) except OSError as exc: if exc.errno != errno.EEXIST: raise - for metric_name, scores in iter_result.items(): - train = np.array(scores["mean"].loc["train"]) - test = np.array(scores["mean"].loc["test"]) - names = np.array(scores["mean"].columns) - train_std = np.array(scores["std"].loc["train"]) - test_std = np.array(scores["std"].loc["test"]) - # trainMeans, trainSTDs = numpy_mean_and_std(scores["train_scores"]) - # testMeans, testSTDs = numpy_mean_and_std(scores["test_scores"]) - - # names = np.array([name for name in classifiers_dict.keys()]) - fileName = current_directory + time.strftime( + for metricName, scores in iterResult["metrics_scores"].items(): + trainMeans, trainSTDs = numpy_mean_and_std(scores["train_scores"]) + testMeans, testSTDs = numpy_mean_and_std(scores["test_scores"]) + + names = np.array([name for name in classifiers_dict.keys()]) + fileName = currentDirectory + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" + data_base_name + "-Mean_on_" + str( - stats_iter) + "_iter-" + metric_name + ".png" + stats_iter) + "_iter-" + metricName + ".png" nbResults = names.shape[0] - plot_metric_scores(train, test, names, nbResults, - metric_name, fileName, tag=" averaged", - train_STDs=train_std, test_STDs=test_std) - results+=[[classifier_name, metric_name, test_mean, test_std] for classifier_name, test_mean, test_std in zip(names, test, test_std)] + plotMetricScores(trainMeans, testMeans, names, nbResults, + metricName, fileName, tag=" averaged", + train_STDs=trainSTDs, test_STDs=testSTDs) + results+=[[classifiersName, metricName, testMean, testSTD] for classifiersName, 
testMean, testSTD in zip(names, testMeans, testSTDs)] return results -def gen_error_data_glob(combi_results, stats_iter): - nb_examples = next(iter(combi_results.values())).shape[0] - nb_classifiers = len(combi_results) - data = np.zeros((nb_examples, nb_classifiers), dtype=int) - classifier_names = [] - for clf_index, (classifier_name, error_data) in enumerate(combi_results.items()): - data[:, clf_index] = error_data - classifier_names.append(classifier_name) - error_on_examples = -1 * np.sum(data, axis=1) + (nb_classifiers * stats_iter) - return nb_examples, nb_classifiers, data, error_on_examples, classifier_names +def gen_error_dat_glob(combi_results, stats_iter, base_file_name): + nbExamples = combi_results["error_on_examples"].shape[1] + nbClassifiers = combi_results["error_on_examples"].shape[0] + data = np.transpose(combi_results["error_on_examples"]) + error_on_examples = -1 * np.sum(data, axis=1) + (nbClassifiers * stats_iter) + np.savetxt(base_file_name + "clf_errors.csv", data, delimiter=",") + np.savetxt(base_file_name + "example_errors.csv", error_on_examples, + delimiter=",") + return nbExamples, nbClassifiers, data, error_on_examples -def publish_iter_biclass_example_errors(iter_results, directory, - labels_dictionary, stats_iter, - example_ids): - for labels_combination, combi_results in iter_results.items(): +def publish_iter_biclass_example_errors(iter_results, directory, labels_dictionary, + classifiers_dict, stats_iter, min_size=10): + for labelsCombination, combiResults in iter_results.items(): base_file_name = directory + labels_dictionary[ - int(labels_combination[0])] + "-vs-" + \ + int(labelsCombination[0])] + "-vs-" + \ labels_dictionary[ - int(labels_combination[1])] + "/" + time.strftime( + int(labelsCombination[1])] + "/" + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" - + classifiers_names = [classifier_name for classifier_name in + classifiers_dict.values()] logging.debug( "Start:\t Global biclass label analysis figure generation") - nbExamples, nbClassifiers, data, \ - error_on_examples, classifier_names = gen_error_data_glob(combi_results, - stats_iter) + nbExamples, nbClassifiers, data, error_on_examples = gen_error_dat_glob( + combiResults, stats_iter, base_file_name) - np.savetxt(base_file_name + "clf_errors.csv", data, delimiter=",") - np.savetxt(base_file_name + "example_errors.csv", error_on_examples, - delimiter=",") + publish2Dplot(data, classifiers_names, nbClassifiers, nbExamples, 1, + base_file_name, stats_iter=stats_iter) - plot_2d(data, classifier_names, nbClassifiers, nbExamples, - base_file_name, stats_iter=stats_iter, example_ids=example_ids) - plot_errors_bar(error_on_examples, nbClassifiers * stats_iter, - nbExamples, base_file_name) + publishErrorsBarPlot(error_on_examples, nbClassifiers * stats_iter, + nbExamples, base_file_name) logging.debug( "Done:\t Global biclass label analysis figures generation") @@ -900,28 +861,28 @@ def publish_iter_multiclass_metrics_scores(iter_multiclass_results, classifiers_ "%Y_%m_%d-%H_%M_%S") + "-" + data_base_name + "-Mean_on_" + str( stats_iter) + "_iter-" + metric_name + ".png" - plot_metric_scores(trainMeans, testMeans, classifiers_names, nb_results, - metric_name, file_name, tag=" averaged multiclass", - train_STDs=trainSTDs, test_STDs=testSTDs) + plotMetricScores(trainMeans, testMeans, classifiers_names, nb_results, + metric_name, file_name, tag=" averaged multiclass", + train_STDs=trainSTDs, test_STDs=testSTDs) results+=[[classifiers_name, metric_name,testMean, testSTD] for classifiers_name, testMean, 
testSTD in zip(classifiers_names, testMeans, testSTDs)] return results def publish_iter_multiclass_example_errors(iter_multiclass_results, directory, - classifiers_names, stats_iter, example_ids, min_size=10): + classifiers_names, stats_iter, min_size=10): logging.debug( "Start:\t Global multiclass label analysis figures generation") base_file_name = directory + time.strftime("%Y_%m_%d-%H_%M_%S") + "-" - nb_examples, nb_classifiers, data, error_on_examples = gen_error_data_glob( + nb_examples, nb_classifiers, data, error_on_examples = gen_error_dat_glob( iter_multiclass_results, stats_iter, base_file_name) - plot_2d(data, classifiers_names, nb_classifiers, nb_examples, 1, - base_file_name, stats_iter=stats_iter, example_ids=example_ids) + publish2Dplot(data, classifiers_names, nb_classifiers, nb_examples, 1, + base_file_name, stats_iter=stats_iter) - plot_errors_bar(error_on_examples, nb_classifiers * stats_iter, nb_examples, - base_file_name) + publishErrorsBarPlot(error_on_examples, nb_classifiers * stats_iter, nb_examples, + base_file_name) logging.debug("Done:\t Global multiclass label analysis figures generation") @@ -930,7 +891,8 @@ def gen_classifiers_dict(results, metrics): classifiers_dict = dict((classifier_name, classifierIndex) for classifierIndex, classifier_name in enumerate( - list(results[list(results.keys())[0]]["metrics_scores"][0][metrics[0][0]].columns))) + results[0][list(results[0].keys())[0]]["metrics_scores"][metrics[0][0]][ + "classifiers_names"])) return classifiers_dict, len(classifiers_dict) @@ -958,74 +920,52 @@ def add_new_metric(iter_biclass_results, metric, labels_combination, nb_classifi return iter_biclass_results -def format_previous_results(biclass_results): - """ - Formats each statistical iteration's result into a mean/std analysis for - the metrics and adds the errors of each statistical iteration. - - Parameters - ---------- - biclass_results : The raw results, for each statistical iteration i contains - - biclass_results[i]["metrics_scores"] is a dictionary with a pd.dataframe - for each metrics - - biclass_results[i]["example_errors"], a dicaitonary with a np.array - for each classifier. 
- - Returns - ------- - metrics_analysis : The mean and std dataframes for each metrics - - error_analysis : A dictionary containing the added errors - arrays for each classifier - - """ - metrics_analysis = dict((key, {}) for key in biclass_results.keys()) - error_analysis = dict((key, {}) for key in biclass_results.keys()) - for label_combination, biclass_result in biclass_results.items(): - - concat_dict = {} - for iter_index, metrics_score in enumerate( - biclass_result["metrics_scores"]): - for metric_name, dataframe in metrics_score.items(): - if metric_name not in concat_dict: - concat_dict[metric_name] = dataframe - else: - concat_dict[metric_name] = pd.concat( - [concat_dict[metric_name], dataframe]) - - for metric_name, dataframe in concat_dict.items(): - metrics_analysis[label_combination][metric_name] = {} - metrics_analysis[label_combination][metric_name][ - "mean"] = dataframe.groupby(dataframe.index).mean() - metrics_analysis[label_combination][metric_name][ - "std"] = dataframe.groupby(dataframe.index).std(ddof=0) - - added_example_errors = {} - for example_errors in biclass_result["example_errors"]: - for classifier_name, errors in example_errors.items(): - if classifier_name not in added_example_errors: - added_example_errors[classifier_name] = errors - else: - added_example_errors[classifier_name] += errors - error_analysis[label_combination] = added_example_errors - return metrics_analysis, error_analysis +def analyzebiclass_iter(biclass_results, metrics, stats_iter, directory, + labels_dictionary, data_base_name, nb_examples): + """Used to format the results in order to plot the mean results on the iterations""" + iter_biclass_results = {} + classifiers_dict, nb_classifiers = gen_classifiers_dict(biclass_results, + metrics) + for iter_index, biclass_result in enumerate(biclass_results): + for labelsComination, results in biclass_result.items(): + for metric in metrics: -def analyzebiclass_iter(biclass_results, stats_iter, directory, - labels_dictionary, data_base_name, example_ids): - """Used to format the results in order to plot the mean results on the iterations""" - metrics_analysis, error_analysis = format_previous_results(biclass_results) - - results = publish_iter_biclass_metrics_scores(metrics_analysis, - directory, labels_dictionary, - data_base_name, stats_iter) - publish_iter_biclass_example_errors(error_analysis, directory, - labels_dictionary, - stats_iter, example_ids) + iter_biclass_results = add_new_labels_combination( + iter_biclass_results, labelsComination, nb_classifiers, + nb_examples) + iter_biclass_results = add_new_metric(iter_biclass_results, metric, + labelsComination, + nb_classifiers, stats_iter) + + metric_results = results["metrics_scores"][metric[0]] + for classifier_name, trainScore, testScore in zip( + metric_results["classifiers_names"], + metric_results["train_scores"], + metric_results["test_scores"], ): + iter_biclass_results[labelsComination]["metrics_scores"][ + metric[0]]["train_scores"][ + classifiers_dict[classifier_name], iter_index] = trainScore + iter_biclass_results[labelsComination]["metrics_scores"][ + metric[0]]["test_scores"][ + classifiers_dict[classifier_name], iter_index] = testScore + for classifier_name, error_on_example in results[ + "example_errors"].items(): + iter_biclass_results[labelsComination]["error_on_examples"][ + classifiers_dict[classifier_name], :] += error_on_example[ + "error_on_examples"] + + results = publish_iter_biclass_metrics_scores( + iter_biclass_results, directory, + labels_dictionary, 
classifiers_dict, + data_base_name, stats_iter) + publish_iter_biclass_example_errors(iter_biclass_results, directory, + labels_dictionary, classifiers_dict, + stats_iter) return results def analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics, - data_base_name, nb_examples, example_ids): + data_base_name, nb_examples): """Used to mean the multiclass results on the iterations executed with different random states""" logging.debug("Start:\t Getting mean results for multiclass classification") @@ -1062,19 +1002,19 @@ def analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics, iter_multiclass_results, classifiers_names, data_base_name, directory, stats_iter) publish_iter_multiclass_example_errors(iter_multiclass_results, directory, - classifiers_names, stats_iter, example_ids) + classifiers_names, stats_iter) return results def get_results(results, stats_iter, nb_multiclass, benchmark_argument_dictionaries, multiclass_labels, metrics, classification_indices, directories, directory, labels_dictionary, - nb_examples, nb_labels, example_ids): + nb_examples, nb_labels): """Used to analyze the results of the previous benchmarks""" data_base_name = benchmark_argument_dictionaries[0]["args"]["Base"]["name"] results_means_std, biclass_results = analyze_biclass(results, benchmark_argument_dictionaries, - stats_iter, metrics, example_ids) + stats_iter, metrics) if nb_multiclass > 1: results_means_std, multiclass_results = analyzeMulticlass(results, stats_iter, @@ -1082,12 +1022,12 @@ def get_results(results, stats_iter, nb_multiclass, benchmark_argument_dictionar nb_examples, nb_labels, multiclass_labels, metrics, classification_indices, - directories, example_ids) + directories) if stats_iter > 1: results_means_std = analyzebiclass_iter( - biclass_results, stats_iter, directory, - labels_dictionary, data_base_name, example_ids) + biclass_results, metrics, stats_iter, directory, + labels_dictionary, data_base_name, nb_examples) if nb_multiclass > 1: results_means_std = analyze_iter_multiclass(multiclass_results, directory, stats_iter, - metrics, data_base_name, nb_examples, example_ids) + metrics, data_base_name, nb_examples) return results_means_std diff --git a/multiview_platform/mono_multi_view_classifiers/utils/configuration.py b/multiview_platform/mono_multi_view_classifiers/utils/configuration.py index f297dcf09deebab08b29573a45344fbd7e40a822..a492aff70e93e2a0a27e3c3576c8344562194c58 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/configuration.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/configuration.py @@ -1,20 +1,11 @@ +import builtins +from distutils.util import strtobool as tobool import yaml +import os def get_the_args(path_to_config_file="../config_files/config.yml"): - """ - The function for extracting the args for a '.yml' file. 
- - Parameters - ---------- - path_to_config_file : str, path to the yml file containing the configuration - - Returns - ------- - yaml_config : dict, the dictionary conaining the configuration for the - benchmark - - """ + """This is the main function for extracting the args for a '.yml' file""" with open(path_to_config_file, 'r') as stream: yaml_config = yaml.safe_load(stream) return yaml_config diff --git a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py index 85666b66617bd20d054b0c0d32a486e12dd88412..6c40d787545f5a155763571d180db58085040ea5 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py @@ -66,8 +66,7 @@ class Dataset(): def __init__(self, views=None, labels=None, are_sparse=False, file_name="dataset.hdf5", view_names=None, path="", - hdf5_file=None, labels_names=None, is_temp=False, - example_ids=None): + hdf5_file=None, labels_names=None, is_temp=False): self.is_temp = False if hdf5_file is not None: self.dataset=hdf5_file @@ -105,13 +104,6 @@ class Dataset(): meta_data_grp.attrs["datasetLength"] = len(labels) dataset_file.close() self.update_hdf5_dataset(os.path.join(path, file_name)) - if example_ids is not None: - example_ids = [example_id if not is_just_number(example_id) - else "ID_"+example_id for example_id in example_ids] - self.example_ids = example_ids - else: - self.example_ids = ["ID_"+str(i) - for i in range(labels.shape[0])] def rm(self): """ @@ -152,15 +144,8 @@ class Dataset(): ------- """ - self.nb_view = self.dataset["Metadata"].attrs["nbView"] + self.nb_view = self.dataset.get("Metadata").attrs["nbView"] self.view_dict = self.get_view_dict() - if "example_ids" in self.dataset["Metadata"].keys(): - self.example_ids = [example_id.decode() - if not is_just_number(example_id.decode()) - else "ID_"+example_id.decode() - for example_id in self.dataset["Metadata"]["example_ids"]] - else: - self.example_ids = [str(i) for i in range(self.dataset["Labels"].shape[0])] def get_nb_examples(self): """ @@ -169,100 +154,65 @@ class Dataset(): ------- """ - return self.dataset["Metadata"].attrs["datasetLength"] + return self.dataset.get("Metadata").attrs["datasetLength"] def get_view_dict(self): - """ - Returns the dictionary with view indices as keys and the corresponding - names as values - """ view_dict = {} for view_index in range(self.nb_view): - view_dict[self.dataset["View" + str(view_index)].attrs["name"]] = view_index + view_dict[self.dataset.get("View" + str(view_index)).attrs["name"]] = view_index return view_dict def get_label_names(self, decode=True, example_indices=None): - """ - Used to get the list of the label names for the give set of examples - - Parameters - ---------- - decode : bool - If True, will decode the label names before lsiting them - - example_indices : numpy.ndarray - The array containig the indices of the needed examples - - Returns - ------- - - """ example_indices = self.init_example_indces(example_indices) selected_labels = self.get_labels(example_indices) if decode: return [label_name.decode("utf-8") - for label, label_name in enumerate(self.dataset["Labels"].attrs["names"]) + for label, label_name in enumerate(self.dataset.get("Labels").attrs["names"]) if label in selected_labels] else: return [label_name - for label, label_name in enumerate(self.dataset["Labels"].attrs["names"]) + for label, label_name in enumerate(self.dataset.get("Labels").attrs["names"]) if 
label in selected_labels] def init_example_indces(self, example_indices=None): - """If no example indices are provided, selects all the examples.""" if example_indices is None: return range(self.get_nb_examples()) else: return example_indices def get_v(self, view_index, example_indices=None): - """ - Selects the view to extract - Parameters - ---------- - view_index : int - The index of the view to extract - example_indices : numpy.ndarray - The array containing the indices of the examples to extract. - - Returns - ------- - A numpy.ndarray containing the view data for the needed examples - """ example_indices = self.init_example_indces(example_indices) if type(example_indices) is int: - return self.dataset["View" + str(view_index)][example_indices, :] + return self.dataset.get("View" + str(view_index))[example_indices, :] else: example_indices = np.array(example_indices) sorted_indices = np.argsort(example_indices) example_indices = example_indices[sorted_indices] - if not self.dataset["View" + str(view_index)].attrs["sparse"]: - return self.dataset["View" + str(view_index)][()][example_indices, :][ + if not self.dataset.get("View" + str(view_index)).attrs["sparse"]: + return self.dataset.get("View" + str(view_index))[example_indices, :][ np.argsort(sorted_indices), :] else: sparse_mat = sparse.csr_matrix( - (self.dataset["View" + str(view_index)]["data"][()], - self.dataset["View" + str(view_index)]["indices"][()], - self.dataset["View" + str(view_index)]["indptr"][()]), - shape=self.dataset["View" + str(view_index)].attrs["shape"])[ + (self.dataset.get("View" + str(view_index)).get("data").value, + self.dataset.get("View" + str(view_index)).get("indices").value, + self.dataset.get("View" + str(view_index)).get("indptr").value), + shape=self.dataset.get("View" + str(view_index)).attrs["shape"])[ example_indices, :][ np.argsort(sorted_indices), :] return sparse_mat - def get_shape(self, view_index=0, example_indices=None): - """Gets the shape of the needed view""" - return self.get_v(view_index,example_indices=example_indices).shape + def get_shape(self, example_indices=None): + return self.get_v(0,example_indices=example_indices).shape def get_nb_class(self, example_indices=None): - """Gets the number of class of the dataset""" example_indices = self.init_example_indces(example_indices) - return len(np.unique(self.dataset["Labels"][()][example_indices])) + return len(np.unique(self.dataset.get("Labels").value[example_indices])) def get_labels(self, example_indices=None): example_indices = self.init_example_indces(example_indices) - return self.dataset["Labels"][()][example_indices] + return self.dataset.get("Labels").value[example_indices] def copy_view(self, target_dataset=None, source_view_name=None, target_view_index=None, example_indices=None): @@ -270,7 +220,7 @@ class Dataset(): new_d_set = target_dataset.create_dataset("View"+str(target_view_index), data=self.get_v(self.view_dict[source_view_name], example_indices=example_indices)) - for key, value in self.dataset["View"+str(self.view_dict[source_view_name])].attrs.items(): + for key, value in self.dataset.get("View"+str(self.view_dict[source_view_name])).attrs.items(): new_d_set.attrs[key] = value def init_view_names(self, view_names=None): @@ -290,23 +240,15 @@ class Dataset(): dataset_file_path = os.path.join(path,self.get_name()+"_temp_filter.hdf5") new_dataset_file = h5py.File(dataset_file_path,"w") self.dataset.copy("Metadata", new_dataset_file) - if "example_ids" in self.dataset["Metadata"].keys(): - ex_ids = 
new_dataset_file["Metadata"]["example_ids"] - ex_ids = np.array([self.example_ids[example_indices]]).astype(np.dtype("S10")) - else: - new_dataset_file["Metadata"].create_dataset("example_ids", - (len(self.example_ids), ), - data=np.array(self.example_ids).astype(np.dtype("S10")), - dtype=np.dtype("S10")) - new_dataset_file["Metadata"].attrs["datasetLength"] = len(example_indices) - new_dataset_file["Metadata"].attrs["nbClass"] = np.unique(labels) + new_dataset_file.get("Metadata").attrs["datasetLength"] = len(example_indices) + new_dataset_file.get("Metadata").attrs["nbClass"] = np.unique(labels) new_dataset_file.create_dataset("Labels", data=labels) - new_dataset_file["Labels"].attrs["names"] = [label_name.encode() + new_dataset_file.get("Labels").attrs["names"] = [label_name.encode() if not isinstance(label_name, bytes) else label_name for label_name in label_names] view_names = self.init_view_names(view_names) - new_dataset_file["Metadata"].attrs["nbView"] = len(view_names) + new_dataset_file.get("Metadata").attrs["nbView"] = len(view_names) for new_index, view_name in enumerate(view_names): self.copy_view(target_dataset=new_dataset_file, source_view_name=view_name, @@ -329,18 +271,18 @@ class Dataset(): self.copy_view(target_dataset=noisy_dataset, source_view_name=self.get_view_name(view_index), target_view_index=view_index) - for view_index in range(noisy_dataset["Metadata"].attrs["nbView"]): + for view_index in range(noisy_dataset.get("Metadata").attrs["nbView"]): view_key = "View" + str(view_index) - view_dset = noisy_dataset[view_key] + view_dset = noisy_dataset.get(view_key) try: view_limits = self.dataset[ - "Metadata/View" + str(view_index) + "_limits"][()] + "Metadata/View" + str(view_index) + "_limits"].value except: import pdb;pdb.set_trace() view_ranges = view_limits[:, 1] - view_limits[:, 0] - normal_dist = random_state.normal(0, noise_std, view_dset[()].shape) + normal_dist = random_state.normal(0, noise_std, view_dset.value.shape) noise = normal_dist * view_ranges - noised_data = view_dset[()] + noise + noised_data = view_dset.value + noise noised_data = np.where(noised_data < view_limits[:, 0], view_limits[:, 0], noised_data) noised_data = np.where(noised_data > view_limits[:, 1], @@ -447,12 +389,9 @@ class Dataset(): return selected_label_names -def is_just_number(string): - try: - float(string) - return True - except ValueError: - return False + + + def datasets_already_exist(pathF, name, nbCores): """Used to check if it's necessary to copy datasets""" @@ -463,6 +402,51 @@ def datasets_already_exist(pathF, name, nbCores): pathF + name + str(coreIndex) + ".hdf5") return allDatasetExist +# def get_v(dataset, view_index, used_indices=None): +# # """Used to extract a view as a numpy array or a sparse mat from the HDF5 dataset""" +# # if used_indices is None: +# # used_indices = range(dataset.get("Metadata").attrs["datasetLength"]) +# # if type(used_indices) is int: +# # return dataset.get("View" + str(view_index))[used_indices, :] +# # else: +# # used_indices = np.array(used_indices) +# # sorted_indices = np.argsort(used_indices) +# # used_indices = used_indices[sorted_indices] +# # +# # if not dataset.get("View" + str(view_index)).attrs["sparse"]: +# # return dataset.get("View" + str(view_index))[used_indices, :][ +# # np.argsort(sorted_indices), :] +# # else: +# # sparse_mat = sparse.csr_matrix( +# # (dataset.get("View" + str(view_index)).get("data").value, +# # dataset.get("View" + str(view_index)).get("indices").value, +# # dataset.get("View" + 
str(view_index)).get("indptr").value), +# # shape=dataset.get("View" + str(view_index)).attrs["shape"])[ +# # used_indices, :][ +# # np.argsort(sorted_indices), :] +# # +# # return sparse_mat + + +def get_shape(dataset, view_index): + """Used to get the dataset shape even if it's sparse""" + if not dataset.get("View" + str(view_index)).attrs["sparse"]: + return dataset.get("View" + str(view_index)).shape + else: + return dataset.get("View" + str(view_index)).attrs["shape"] + + +def get_value(dataset): + """Used to get the value of a view in the HDF5 dataset even if it sparse""" + if not dataset.attrs["sparse"]: + return dataset.value + else: + sparse_mat = sparse.csr_matrix((dataset.get("data").value, + dataset.get("indices").value, + dataset.get("indptr").value), + shape=dataset.attrs["shape"]) + return sparse_mat + def extract_subset(matrix, used_indices): """Used to extract a subset of a matrix even if it's sparse""" @@ -571,3 +555,10 @@ def input_(timeout=15): return sys.stdin.readline().strip() else: return "y" + +def get_monoview_shared(path, name, view_name, labels_names, classification_indices): + """ATM is not used with shared memory, but soon :)""" + hdf5_dataset_file = h5py.File(path + name + ".hdf5", "w") + X = hdf5_dataset_file.get(view_name).value + y = hdf5_dataset_file.get("Labels").value + return X, y diff --git a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py index 11e7bd3bfcf459cade4825f8cb749344620d01dc..a3f2e1d1d480a3bac9f12ac83931549741d4a757 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py @@ -4,8 +4,10 @@ import logging import h5py import numpy as np +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_array -from ..utils.dataset import Dataset +from ..utils.dataset import Dataset, copy_hdf5 # Author-Info __author__ = "Baptiste Bauvin" @@ -38,13 +40,12 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, nb_features=10): """Used to generate a plausible dataset to test the algorithms""" - if not os.path.exists(os.path.dirname(path + "plausible.hdf5")): + if not os.path.exists(os.path.dirname(path + "Plausible.hdf5")): try: - os.makedirs(os.path.dirname(path + "plausible.hdf5")) + os.makedirs(os.path.dirname(path + "Plausible.hdf5")) except OSError as exc: if exc.errno != errno.EEXIST: raise - example_ids = ["exmaple_id_"+str(i) for i in range(nb_examples)] views = [] view_names = [] are_sparse = [] @@ -63,8 +64,6 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, fake_zero_indices = random_state.randint(int(nb_examples / 2), nb_examples, int(nb_examples / 12)) - for index in np.concatenate((fake_one_indices, fake_zero_indices)): - example_ids[index]+="noised" view_data[fake_one_indices] = np.ones( (len(fake_one_indices), nb_features)) @@ -75,14 +74,12 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, view_names.append("ViewNumber" + str(view_index)) are_sparse.append(False) - - dataset = Dataset(views=views, labels=labels, labels_names=label_names, view_names=view_names, - are_sparse=are_sparse, file_name="plausible.hdf5", - path=path, example_ids=example_ids) + are_sparse=are_sparse, file_name="Plausible.hdf5", + path=path) labels_dictionary = {0: "No", 1: "Yes"} - return dataset, labels_dictionary, "plausible" + return dataset, labels_dictionary, 
"Plausible" elif nb_class >= 3: firstBound = int(nb_examples / 3) rest = nb_examples - 2 * int(nb_examples / 3) @@ -118,10 +115,10 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, dataset = Dataset(views=views, labels=labels, labels_names=label_names, view_names=view_names, are_sparse=are_sparse, - file_name="plausible.hdf5", - path=path, example_ids=example_ids) + file_name="Plausible.hdf5", + path=path) labels_dictionary = {0: "No", 1: "Yes", 2: "Maybe"} - return dataset, labels_dictionary, "plausible" + return dataset, labels_dictionary, "Plausible" class DatasetError(Exception): diff --git a/multiview_platform/mono_multi_view_classifiers/utils/parameters.py b/multiview_platform/mono_multi_view_classifiers/utils/parameters.py new file mode 100644 index 0000000000000000000000000000000000000000..2b61691f20124cb20fd7872aa8c44f5757397f02 --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/utils/parameters.py @@ -0,0 +1,145 @@ +import numpy as np + + +class Parameter_pdata(object): + class __Parameter_pdata: + nbr_i = 0 + # option de renormalisation des donnees + # la séparation se faisant à une permutation pret et à un facteur de + # renormalisation pret, on peut choisir de normaliser les données au debut + # de l'algo et/ou à chaque iteration de l'algo et/ou à la fin de l'algo + # on normalise A ou S + _data_norm = {'FlagInit': True, 'FlagIter': False, 'FlagEnd': False} + # % on normalise suivant les colonnes (1) 'dim' (norme des colonnes à 1) ou les + # 'dim'% lignes (2) (norme des lignes à 1) + _Norm = {'p': 1, 'dim': 1, 'x': 'A'} + _list_mode = ['real', 'simul'] + _list_x = ['A', 'S'] + + def __init__(self): + self._Norm['p'] = 1 + self._Norm['dim'] = 1 + self._Norm['x'] = self._list_x[0] + self.mode = self._list_mode[1] + self.sigma = 20000 + self.dim = 1 + if self.nbr_i > 0: + raise ValueError("Instance of class Parameter_pdata can be only one") + self.nbr_i += 1 + + def __str__(self): + return repr(self) + + instance = None + + # def __init__(self, arg): + # if not Parameter_pdata.instance: + # Parameter_pdata.instance = Parameter_pdata.__Parameter_pdata(arg) + # else: + # Parameter_pdata.instance.val = arg + + def __new__(cls): # _new_ est toujours une méthode de classe + if not Parameter_pdata.instance: + Parameter_pdata.instance = Parameter_pdata.__Parameter_pdata() + return Parameter_pdata.instance + + def __getattr__(self, attr): + return getattr(self.instance, attr) + + # def __setattr__(self, attr, val): + # return setattr(self.instance, attr, val) + + def __setattr__(self, name): + return setattr(self.instance, name) + + +class Parameter_palgo(object): + class __Parameter_palgo: + + nbr_i = 0 + _list_algo = ['BCVMFB', 'PALS', 'STALS', 'LSfro', 'LSkl'] + _stop = {'DifA': False, 'DifS': False, + 'ObjFct': True, 'threshold': np.finfo(float).eps} + _pfwt = {'w': 'db6', 'family_pfwt': 'db', + 'level': 10, 'K': 4, + 'Ls': 3000, 'L1': 3000, 'L2': 3000} + # _wavelette_type = ['db', 'db6'] + # 'LS' pour Lee et Seung + # 'Lips' pour la constante de Lipschitz + # 'PALM' pas de preconditionnement + _list_precond = ['LS', 'Lips', 'PALM'] + + def __init__(self): + self.flagWave = False + self.val = None + algo_value = self._list_algo[1] + self._algo = algo_value + self.gamma = 0.99 + self.inf = np.inf + self.eps = np.finfo(float).eps + self.niter = 1000 + self.eta_inf = 'eps' + self.eta_sup = 'inf' + self.alpha_A = 0.0 + self.p_A = 1 + self.p_S = 1 + self.alpha_S = 0.0 + # self.level = 10 + self.alpha_S_eval = False + self.stopThreshold = 10e-5, + 
self.precond = 'LS' # 'LS' pour Lee et Seung + self.F = None + self.Fstar = None + self.verbose = False + + if self.nbr_i > 0: + raise ValueError("Instance of class Parameter_pdata can be only one") + self.nbr_i += 1 + + def __str__(self): + return repr(self) + repr(self.val) + + @property + def algo(self): + return self._algo + + @algo.setter + def algo(self, algo_value): + if algo_value not in self._list_algo: + raise NameError("parameter algo must be in %s" % self._list_algo) + else: + self._algo = algo_value + + instance = None + + # def __init__(self, arg): + # if not Parameter_pdata.instance: + # Parameter_pdata.instance = Parameter_pdata.__Parameter_pdata(arg) + # else: + # Parameter_pdata.instance.val = arg + + def __new__(cls): # _new_ est toujours une méthode de classe + if not Parameter_palgo.instance: + Parameter_palgo.instance = Parameter_palgo.__Parameter_palgo() + return Parameter_palgo.instance + + def __getattr__(self, attr): + return getattr(self.instance, attr) + + # def __setattr__(self, attr, val): + # return setattr(self.instance, attr, val) + + def __setattr__(self, name): + return setattr(self.instance, name) + + +if __name__ == '__main__': + a = Parameter_pdata() + a = Parameter_pdata() + b = Parameter_pdata() + b.val = 6 + b.x = 8 + a.x = 10 + param = Parameter_palgo() + algo = param._list_algo[3] + param.algo = algo diff --git a/multiview_platform/tests/test_ExecClassif.py b/multiview_platform/tests/test_ExecClassif.py index ad86757828f53a732ded8785cbf0f199bbbdbc9d..abbcd77f933e6c9f49dc74213388551bbe85e61d 100644 --- a/multiview_platform/tests/test_ExecClassif.py +++ b/multiview_platform/tests/test_ExecClassif.py @@ -219,7 +219,7 @@ def fakeBenchmarkExec_monocore(dataset_var=1, a=4, args=1): def fakegetResults(results, stats_iter, nb_multiclass, benchmark_arguments_dictionaries, multi_class_labels, metrics, classification_indices, directories, directory, - labels_dictionary, nb_examples, nb_labels, example_ids): + labels_dictionary, nb_examples, nb_labels): return 3 @@ -368,7 +368,8 @@ class Test_execOneBenchmark(unittest.TestCase): 1, 2, 1, 1, 2, 1, 21]), exec_monoview_multicore=fakeExecMono, - exec_multiview_multicore=fakeExecMulti,) + exec_multiview_multicore=fakeExecMulti, + init_multiview_arguments=fakeInitMulti) cls.assertEqual(flag, None) cls.assertEqual(results , @@ -427,7 +428,8 @@ class Test_execOneBenchmark_multicore(unittest.TestCase): flag=None, labels=np.array([0, 1, 2, 3, 4, 2, 2, 12, 1, 2, 1, 1, 2, 1, 21]), exec_monoview_multicore=fakeExecMono, - exec_multiview_multicore=fakeExecMulti,) + exec_multiview_multicore=fakeExecMulti, + init_multiview_arguments=fakeInitMulti) cls.assertEqual(flag, None) cls.assertEqual(results , diff --git a/multiview_platform/tests/test_ResultAnalysis.py b/multiview_platform/tests/test_ResultAnalysis.py index bcf63fc7644acae02f6466a4198079bde42bc0af..bc739072790e9730058d7a9f916f66d512e7c31f 100644 --- a/multiview_platform/tests/test_ResultAnalysis.py +++ b/multiview_platform/tests/test_ResultAnalysis.py @@ -1,267 +1,56 @@ -import unittest -import numpy as np -import pandas as pd -import time - -from ..mono_multi_view_classifiers import result_analysis -from ..mono_multi_view_classifiers.multiview.multiview_utils import MultiviewResult -from ..mono_multi_view_classifiers.monoview.monoview_utils import MonoviewResult - - -class Test_get_arguments(unittest.TestCase): - - def setUp(self): - self.benchamrk_argument_dictionaries = [{"flag":"good_flag", "valid":True}, - {"flag":"bad_flag", "valid":False}] - - def 
test_benchmark_wanted(self): - argument_dict = result_analysis.get_arguments(self.benchamrk_argument_dictionaries, "good_flag") - self.assertTrue(argument_dict["valid"]) - - -class Test_get_metrics_scores_biclass(unittest.TestCase): - - - def test_simple(self): - metrics = [["accuracy_score"], ["f1_score"]] - results = [MonoviewResult(0, - "ada", - "0", - {"accuracy_score":[0.9, 0.95], - "f1_score":[0.91, 0.96]} - , "", "", "", "")] - metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, - results) - self.assertIsInstance(metrics_scores, dict) - self.assertIsInstance(metrics_scores["accuracy_score"], pd.DataFrame) - np.testing.assert_array_equal(np.array(metrics_scores["accuracy_score"].loc["train"]), np.array([0.9])) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score"].loc["test"]), - np.array([0.95])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["train"]), - np.array([0.91])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["test"]), - np.array([0.96])) - np.testing.assert_array_equal(np.array(metrics_scores["f1_score"].columns), - np.array(["ada-0"])) - - def multiple_monoview_classifiers(self): - metrics = [["accuracy_score"], ["f1_score"]] - results = [MonoviewResult(0, - "ada", - "0", - {"accuracy_score": [0.9, 0.95], - "f1_score": [0.91, 0.96]} - , "", "", "", ""), - MonoviewResult(0, - "dt", - "1", - {"accuracy_score": [0.8, 0.85], - "f1_score": [0.81, 0.86]} - , "", "", "", "") - ] - metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, - results) - self.assertIsInstance(metrics_scores, dict) - self.assertIsInstance(metrics_scores["accuracy_score"], pd.DataFrame) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score"].loc["train"]), - np.array([0.9, 0.8])) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score"].loc["test"]), - np.array([0.95, 0.85])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["train"]), - np.array([0.91, 0.81])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["test"]), - np.array([0.96, 0.86])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].columns), - np.array(["ada-0", "dt-1"])) - - def mutiview_result(self): - metrics = [["accuracy_score"], ["f1_score"]] - results = [MultiviewResult("mv", "", {"accuracy_score": [0.7, 0.75], - "f1_score": [0.71, 0.76]}, "", ""), - MonoviewResult(0, - "dt", - "1", - {"accuracy_score": [0.8, 0.85], - "f1_score": [0.81, 0.86]} - , "", "", "", "") - ] - metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, - results) - self.assertIsInstance(metrics_scores, dict) - self.assertIsInstance(metrics_scores["accuracy_score"], pd.DataFrame) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score"].loc["train"]), - np.array([0.7, 0.8])) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score"].loc["test"]), - np.array([0.75, 0.85])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["train"]), - np.array([0.71, 0.81])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["test"]), - np.array([0.76, 0.86])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].columns), - np.array(["mv", "dt-1"])) - -class Test_get_example_errors_biclass(unittest.TestCase): - - def test_simple(self): - ground_truth = np.array([0,1,0,1,0,1,0,1, -100]) - results = [MultiviewResult("mv", "", {"accuracy_score": [0.7, 
0.75], - "f1_score": [0.71, 0.76]}, - np.array([0,0,0,0,1,1,1,1,1]), - ""), - MonoviewResult(0, - "dt", - "1", - {"accuracy_score": [0.8, 0.85], - "f1_score": [0.81, 0.86]} - , np.array([0,0,1,1,0,0,1,1,0]), "", "", "") - ] - example_errors = result_analysis.get_example_errors_biclass(ground_truth, - results) - self.assertIsInstance(example_errors, dict) - np.testing.assert_array_equal(example_errors["mv"], - np.array([1,0,1,0,0,1,0,1,-100])) - np.testing.assert_array_equal(example_errors["dt-1"], - np.array([1, 0, 0, 1, 1, 0, 0, 1,-100])) - - -class Test_init_plot(unittest.TestCase): - - def test_simple(self): - results = [] - metric_name = "acc" - data = np.random.RandomState(42).uniform(0,1,(2,2)) - metric_dataframe = pd.DataFrame(index=["train", "test"], columns=["dt-1", "mv"], data=data) - directory = "dir" - database_name = 'db' - labels_names = ['lb1', "lb2"] - train, test, classifier_names, \ - file_name, nb_results, results = result_analysis.init_plot(results, - metric_name, - metric_dataframe, - directory, - database_name, - labels_names) - self.assertEqual(file_name, "dir"+time.strftime( - "%Y_%m_%d-%H_%M_%S")+"-db-lb1_vs_lb2-acc") - np.testing.assert_array_equal(train, data[0,:]) - np.testing.assert_array_equal(test, data[1, :]) - np.testing.assert_array_equal(classifier_names, np.array(["dt-1", "mv"])) - self.assertEqual(nb_results, 2) - self.assertEqual(results, [["dt-1", "acc", data[1,0], 0], ["mv", "acc", data[1,1], 0]]) - -class Test_gen_error_data(unittest.TestCase): - - def test_simple(self): - random_state = np.random.RandomState(42) - ada_data = random_state.randint(0,2,size=7) - mv_data = random_state.randint(0, 2, size=7) - example_errors = {"ada-1": ada_data, - "mv": mv_data} - nb_classifiers, nb_examples, classifiers_names, \ - data_2d, error_on_examples = result_analysis.gen_error_data(example_errors) - self.assertEqual(nb_classifiers, 2) - self.assertEqual(nb_examples, 7) - self.assertEqual(classifiers_names, ["ada-1", "mv"]) - np.testing.assert_array_equal(data_2d, np.array([ada_data, mv_data]).transpose()) - np.testing.assert_array_equal(error_on_examples, -1*(ada_data+mv_data)/nb_classifiers) - - -class Test_format_previous_results(unittest.TestCase): - - def test_simple(self): - biclass_results = {"01":{"metrics_scores":[], "example_errors":[]}} - random_state = np.random.RandomState(42) - - # Gen metrics data - metrics_1_data = random_state.uniform(size=(2,2)) - metrics_2_data = random_state.uniform(size=(2,2)) - metric_1_df = pd.DataFrame(data=metrics_1_data, index=["train", "test"], - columns=["ada-1", "mv"]) - metric_2_df = pd.DataFrame(data=metrics_2_data, index=["train", "test"], - columns=["ada-1", "mv"]) - biclass_results["01"]["metrics_scores"].append({"acc": metric_1_df}) - biclass_results["01"]["metrics_scores"].append({"acc": metric_2_df}) - - # Gen error data - ada_error_data_1 = random_state.randint(0,2,7) - ada_error_data_2 = random_state.randint(0, 2, 7) - ada_sum = ada_error_data_1+ada_error_data_2 - mv_error_data_1 = random_state.randint(0, 2, 7) - mv_error_data_2 = random_state.randint(0, 2, 7) - mv_sum = mv_error_data_1+mv_error_data_2 - biclass_results["01"]["example_errors"].append({}) - biclass_results["01"]["example_errors"].append({}) - biclass_results["01"]["example_errors"][0]["ada-1"] = ada_error_data_1 - biclass_results["01"]["example_errors"][0]["mv"] = mv_error_data_1 - biclass_results["01"]["example_errors"][1]["ada-1"] = ada_error_data_2 - biclass_results["01"]["example_errors"][1]["mv"] = mv_error_data_2 - - # Running the 
function - metric_analysis, error_analysis = result_analysis.format_previous_results(biclass_results) - mean_df = pd.DataFrame(data=np.mean(np.array([metrics_1_data, - metrics_2_data]), - axis=0), - index=["train", "test"], - columns=["ada-1", "mvm"]) - std_df = pd.DataFrame(data=np.std(np.array([metrics_1_data, - metrics_2_data]), - axis=0), - index=["train", "test"], - columns=["ada-1", "mvm"]) - - # Testing - np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["train"], - mean_df.loc["train"]) - np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["test"], - mean_df.loc["test"]) - np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["train"], - std_df.loc["train"]) - np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["test"], - std_df.loc["test"]) - np.testing.assert_array_equal(ada_sum, error_analysis["01"]["ada-1"]) - np.testing.assert_array_equal(mv_sum, error_analysis["01"]["mv"]) - - -class Test_gen_error_data_glob(unittest.TestCase): - - def test_simple(self): - random_state = np.random.RandomState(42) - - ada_error_data_1 = random_state.randint(0,2,7) - ada_error_data_2 = random_state.randint(0, 2, 7) - ada_sum = ada_error_data_1+ada_error_data_2 - mv_error_data_1 = random_state.randint(0, 2, 7) - mv_error_data_2 = random_state.randint(0, 2, 7) - mv_sum = mv_error_data_1+mv_error_data_2 - - combi_results = {"ada-1":ada_sum, "mv": mv_sum} - - stats_iter = 2 - - nb_examples, nb_classifiers, \ - data, error_on_examples, \ - classifier_names = result_analysis.gen_error_data_glob(combi_results, - stats_iter) - self.assertEqual(nb_examples, 7) - self.assertEqual(nb_classifiers, 2) - np.testing.assert_array_equal(data, np.array([ada_sum, mv_sum]).transpose()) - np.testing.assert_array_equal(error_on_examples, -1*np.sum(np.array([ada_sum, mv_sum]), axis=0)+(nb_classifiers*stats_iter)) - self.assertEqual(classifier_names, ["ada-1", "mv"]) - - - - - +# import unittest +# import numpy as np +# +# from ..mono_multi_view_classifiers import ResultAnalysis +# +# +# class Test_getMetricsScoresBiclass(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.metrics = [["accuracy_score"]] +# cls.monoViewResults = [["", ["chicken_is_heaven", ["View0"], {"accuracy_score": [0.5,0.7]}]]] +# cls.multiviewResults = [["Mumbo", {"":""}, {"accuracy_score":[0.6,0.8]}]] +# +# def test_simple(cls): +# res = ResultAnalysis.getMetricsScoresBiclass(cls.metrics, cls.monoViewResults, cls.multiviewResults) +# cls.assertIn("accuracy_score",res) +# cls.assertEqual(type(res["accuracy_score"]), dict) +# cls.assertEqual(res["accuracy_score"]["classifiers_names"], ["chicken_is_heaven-View0", "Mumbo"]) +# cls.assertEqual(res["accuracy_score"]["train_scores"], [0.5, 0.6]) +# cls.assertEqual(res["accuracy_score"]["test_scores"], [0.7, 0.8]) +# +# def test_only_multiview(cls): +# cls.monoViewResults = [] +# res = ResultAnalysis.getMetricsScoresBiclass(cls.metrics, cls.monoViewResults, cls.multiviewResults) +# cls.assertIn("accuracy_score",res) +# cls.assertEqual(type(res["accuracy_score"]), dict) +# cls.assertEqual(res["accuracy_score"]["classifiers_names"], ["Mumbo"]) +# cls.assertEqual(res["accuracy_score"]["train_scores"], [0.6]) +# cls.assertEqual(res["accuracy_score"]["test_scores"], [0.8]) +# +# def test_only_monoview(cls): +# cls.multiviewResults = [] +# res = ResultAnalysis.getMetricsScoresBiclass(cls.metrics, cls.monoViewResults, cls.multiviewResults) +# cls.assertIn("accuracy_score",res) +# 
cls.assertEqual(type(res["accuracy_score"]), dict) +# cls.assertEqual(res["accuracy_score"]["classifiers_names"], ["chicken_is_heaven-View0"]) +# cls.assertEqual(res["accuracy_score"]["train_scores"], [0.5]) +# cls.assertEqual(res["accuracy_score"]["test_scores"], [0.7]) +# +# +# class Test_getExampleErrorsBiclass(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.usedBenchmarkArgumentDictionary = {"labels": np.array([0,1,1,-100,-100,0,1,1,-100])} +# cls.monoViewResults = [["", ["chicken_is_heaven", ["View0"], {}, np.array([1,1,1,-100,-100,0,1,1,-100])]]] +# cls.multiviewResults = [["Mumbo", {"":""}, {}, np.array([0,0,1,-100,-100,0,1,1,-100])]] +# +# def test_simple(cls): +# res = ResultAnalysis.getExampleErrorsBiclass(cls.usedBenchmarkArgumentDictionary, cls.monoViewResults, +# cls.multiviewResults) +# cls.assertIn("chicken_is_heaven-View0", res) +# cls.assertIn("Mumbo", res) +# np.testing.assert_array_equal(res["Mumbo"], np.array([1,0,1,-100,-100,1,1,1,-100])) +# np.testing.assert_array_equal(res["chicken_is_heaven-View0"], np.array([0,1,1,-100,-100,1,1,1,-100])) diff --git a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py b/multiview_platform/tests/test_utils/test_GetMultiviewDB.py index a9f5dae8922e31bc6f364076171f6f06ffe1db2d..a61bfbf3d0e967b8d849893f52c3e7e5967d545e 100644 --- a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py +++ b/multiview_platform/tests/test_utils/test_GetMultiviewDB.py @@ -21,7 +21,7 @@ class Test_get_classic_db_hdf5(unittest.TestCase): self.views = [self.rs.randint(0, 10, size=(self.nb_examples, 7)) for _ in range(self.nb_view)] self.labels = self.rs.randint(0, self.nb_class, self.nb_examples) - self.dataset_file = h5py.File(os.path.join(tmp_path, self.file_name), 'w') + self.dataset_file = h5py.File(os.path.join(tmp_path, self.file_name)) self.view_names = ["ViewN" + str(index) for index in range(len(self.views))] self.are_sparse = [False for _ in self.views] diff --git a/multiview_platform/tests/test_utils/test_dataset.py b/multiview_platform/tests/test_utils/test_dataset.py index 6125243c08f1d6d82098f632fa28966b3a9564af..dcfcb353d97ae45109c3f86c6a8d2e705f9d7207 100644 --- a/multiview_platform/tests/test_utils/test_dataset.py +++ b/multiview_platform/tests/test_utils/test_dataset.py @@ -22,7 +22,7 @@ class Test_Dataset(unittest.TestCase): cls.views = [cls.rs.randint(0, 10, size=(cls.nb_examples, cls.nb_attr)) for _ in range(cls.nb_view)] cls.labels = cls.rs.randint(0, cls.nb_class, cls.nb_examples) - cls.dataset_file = h5py.File(os.path.join(tmp_path, cls.file_name), "w") + cls.dataset_file = h5py.File(os.path.join(tmp_path, cls.file_name)) cls.view_names = ["ViewN" + str(index) for index in range(len(cls.views))] cls.are_sparse = [False for _ in cls.views] for view_index, (view_name, view, is_sparse) in enumerate( @@ -50,7 +50,7 @@ class Test_Dataset(unittest.TestCase): def test_filter(self): """Had to create a new dataset to aviod playing with the class one""" file_name = "test_filter.hdf5" - dataset_file_filter = h5py.File(os.path.join(tmp_path, file_name), "w") + dataset_file_filter = h5py.File(os.path.join(tmp_path, file_name)) for view_index, (view_name, view, is_sparse) in enumerate( zip(self.view_names, self.views, self.are_sparse)): view_dataset = dataset_file_filter.create_dataset( @@ -155,7 +155,7 @@ class Test_Dataset(unittest.TestCase): source_view_name="ViewN0", target_view_index=1) self.assertIn("View1", list(new_dataset.keys())) - np.testing.assert_array_equal(dataset_object.get_v(0), 
new_dataset["View1"][()]) + np.testing.assert_array_equal(dataset_object.get_v(0), new_dataset["View1"].value) self.assertEqual(new_dataset["View1"].attrs["name"], "ViewN0") new_dataset.close() os.remove(os.path.join(tmp_path, "test_copy.hdf5")) @@ -180,7 +180,7 @@ class Test_Dataset(unittest.TestCase): def test_select_views_and_labels(self): file_name = "test_filter.hdf5" - dataset_file_select = h5py.File(os.path.join(tmp_path, file_name), "w") + dataset_file_select = h5py.File(os.path.join(tmp_path, file_name)) for view_index, (view_name, view, is_sparse) in enumerate( zip(self.view_names, self.views, self.are_sparse)): view_dataset = dataset_file_select.create_dataset( @@ -208,7 +208,7 @@ class Test_Dataset(unittest.TestCase): def test_add_gaussian_noise(self): file_name = "test_noise.hdf5" - dataset_file_select = h5py.File(os.path.join(tmp_path, file_name), "w") + dataset_file_select = h5py.File(os.path.join(tmp_path, file_name)) limits = np.zeros((self.nb_attr, 2)) limits[:, 1] += 100 meta_data_grp = dataset_file_select.create_group("Metadata") diff --git a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py index 03a9655bbc10e0c8001a479897fe084db48f95a5..b5dfe409c98f7ecc18ddeccf6eaca53216f53aed 100644 --- a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py +++ b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py @@ -55,7 +55,7 @@ class Test_randomized_search(unittest.TestCase): def test_simple(self): best_params, test_folds_preds = hyper_parameter_search.randomized_search( - self.dataset, self.labels[()], "multiview", self.random_state, tmp_path, + self.dataset, self.labels.value, "multiview", self.random_state, tmp_path, weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds, 1, ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices) diff --git a/requirements.txt b/requirements.txt index 2db0c7eda8a0cf170f342967ce54b19a7e70ea1d..3899b3fa4a24155369d0b13c09a4f8639428e4c1 100755 --- a/requirements.txt +++ b/requirements.txt @@ -15,5 +15,4 @@ m2r==0.2.1 docutils==0.12 pyyaml==3.12 cvxopt==1.2.0 --e git+https://github.com/IvanoLauriola/MKLpy.git#egg=MKLpy -plotly==4.2.1 +-e git+https://github.com/IvanoLauriola/MKLpy.git#egg=MKLpy \ No newline at end of file diff --git a/setup.py b/setup.py index 6ce0e776fa03b342229182d40253500f14a4c5ec..f715ce8708c8a718f3229a381ddd929108cc226e 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,7 @@ def setup_package(): install_requires=['numpy>=1.16', 'scipy>=0.16','scikit-learn==0.19', 'matplotlib', 'h5py', 'joblib', 'pandas', 'm2r', 'pyyaml', 'pyscm @ git+https://github.com/aldro61/pyscm', - 'MKLpy @ git+https://github.com/IvanoLauriola/MKLpy', - 'cvxopt', 'plotly==4.2.1'], + 'cvxopt', 'MKLpy @ git+https://github.com/IvanoLauriola/MKLpy'], # Il est d'usage de mettre quelques metadata à propos de sa lib # Pour que les robots puissent facilement la classer.
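
A note on the restored error-analysis figures (iterCmap / publish2Dplot above): each cell of the 2D matrix counts, for one example and one classifier, how many statistical iterations classified that example correctly (0 to stats_iter), while unseen examples are coded with -100 per iteration; iterCmap then reserves red for that "unseen" code and a black-to-white ramp for 0..stats_iter. The following minimal, standalone Python sketch reproduces only that colormap construction; the toy matrix, the stats_iter value and the output file name are illustrative and are not part of this patch.

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

stats_iter = 2
# Red for "unseen", then a gray ramp from black (never right) to white (always right).
colors = ["red", "0.0"] + [str(float(i + 1) / stats_iter) for i in range(stats_iter)]
cmap = mpl.colors.ListedColormap(colors)
bounds = [-100 * stats_iter - 0.5, -0.5] + [i + 0.5 for i in range(stats_iter + 1)]
norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

# Toy data: rows are examples, columns are classifiers; entries are the number of
# iterations on which the example was well classified, or -100 * stats_iter if unseen.
data = np.array([[2, 0], [1, 2], [-200, 1], [0, 2]])
fig, ax = plt.subplots()
cax = ax.imshow(data, interpolation="none", cmap=cmap, norm=norm, aspect="auto")
cbar = fig.colorbar(cax, ticks=[-100 * stats_iter / 2, 0, stats_iter])
cbar.ax.set_yticklabels(["Unseen", "Always Wrong", "Always Right"])
fig.savefig("toy_error_analysis_2D.png", bbox_inches="tight")
plt.close(fig)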