diff --git a/config_files/config.yml b/config_files/config.yml index bcd321ed418dff029bdeda2adde64812b4aa3369..ffa9ea2e6bb7f09ca423ddc3bad36b7eb5e300a8 100644 --- a/config_files/config.yml +++ b/config_files/config.yml @@ -1,41 +1,75 @@ # The base configuration of the benchmark Base : - log: true - name: ["Plausible"] + # Enable logging + log: True + # The name of each dataset in the directory on which the benchmark should be run + name: ["plausible"] + # A label for the result directory label: "_" + # The type of dataset, currently supported: ".hdf5" and ".csv" type: ".hdf5" + # The views to use in the benchmark, an empty value will result in using all the views views: + # The path to the directory where the datasets are stored pathf: "../data/" + # The niceness of the processes, useful to lower their priority nice: 0 + # The random state of the benchmark, useful for reproducibility random_state: 42 + # The number of parallel computing threads nb_cores: 1 + # Used to run the benchmark on the full dataset full: False - debug: True + # Used to be able to run more than one benchmark per minute debug: False + # To add noise to the data, will add Gaussian noise with noise_std add_noise: False noise_std: 0.0 + # The directory in which the results will be stored res_dir: "../results/" # All the classification-realted configuration options Classification: + # If the dataset is multiclass, will use this multiclass-to-biclass method multiclass_method: "oneVersusOne" + # The ratio: number of test examples / number of train examples split: 0.8 + # The number of folds in the cross validation process when hyper-parameter optimization is performed nb_folds: 2 + # The number of classes to select in the dataset nb_class: 2 + # The names of the classes to select in the dataset classes: + # The type of algorithms to run during the benchmark (monoview and/or multiview) type: ["monoview","multiview"] + # The names of the monoview algorithms to run, ["all"] to run all the available classifiers algos_monoview: ["all"] + # The names of the multiview algorithms to run, ["all"] to run all the available classifiers algos_multiview: ["all"] + # The number of times the benchmark is repeated with different train/test + # split, to have more statistically significant results stats_iter: 2 + # The metrics that will be used in the result analysis metrics: ["accuracy_score", "f1_score"] + # The metric that will be used in the hyper-parameter optimization process metric_princ: "f1_score" + # The type of hyper-parameter optimization method hps_type: "randomized_search" + # The number of iterations in the hyper-parameter optimization process hps_iter: 2 +# The following arguments are classifier-specific, and are documented in each +# of the corresponding modules. + +# In order to run multiple sets of parameters, use multiple values in the +# following lists, and set hps_type to None. 
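[Editor's note] To make the comment above concrete, here is a minimal illustrative excerpt (not part of the diff) of how several parameter values can be listed at once. The random_forest keys mirror the block that follows in the patch; the exact values and the hps_type placement are assumptions for illustration only.

```yaml
# Hypothetical config.yml excerpt: list several values per parameter and
# disable the hyper-parameter search so every combination is benchmarked.
Classification:
  hps_type: None          # use the listed values directly instead of randomized_search

random_forest:
  n_estimators: [25, 50, 100]   # three candidate forest sizes...
  max_depth: [3, 6]             # ...crossed with two depths -> 6 runs
```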
+ ##################################### # The Monoview Classifier arguments # ##################################### + random_forest: n_estimators: [25] max_depth: [3] diff --git a/config_files/config_test.yml b/config_files/config_test.yml index e98dc442c64126c478dc9d38794a98f4449bef35..d675a0606b3e8376651c8bb58fa280ca4b1ef1fc 100644 --- a/config_files/config_test.yml +++ b/config_files/config_test.yml @@ -4,12 +4,12 @@ Base : name: ["control_vs_malade"] label: "_" type: ".hdf5" - views: + views: ["300nm", "350nm"] pathf: "../data/" nice: 0 random_state: 42 nb_cores: 1 - full: False + full: True debug: True add_noise: False noise_std: 0.0 @@ -22,10 +22,10 @@ Classification: nb_folds: 5 nb_class: 2 classes: - type: ["multiview"] - algos_monoview: ["all"] - algos_multiview: ["lp_norm_mkl",] - stats_iter: 5 + type: ["monoview",] + algos_monoview: ["decision_tree"] + algos_multiview: ["all"] + stats_iter: 4 metrics: ["accuracy_score", "f1_score"] metric_princ: "f1_score" hps_type: "randomized_search" diff --git a/data/Plausible.hdf5 b/data/Plausible.hdf5 deleted file mode 100644 index 4f10a2ad8f524e8692771be0ab2f3f3709f37c16..0000000000000000000000000000000000000000 Binary files a/data/Plausible.hdf5 and /dev/null differ diff --git a/data/Plausible0.hdf5 b/data/Plausible0.hdf5 deleted file mode 100644 index c7e0dd9d3a42182c5879b66d3ac225656171d2e0..0000000000000000000000000000000000000000 Binary files a/data/Plausible0.hdf5 and /dev/null differ diff --git a/data/Plausible1.hdf5 b/data/Plausible1.hdf5 deleted file mode 100644 index c7e0dd9d3a42182c5879b66d3ac225656171d2e0..0000000000000000000000000000000000000000 Binary files a/data/Plausible1.hdf5 and /dev/null differ diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py index 7dae037afa71cd77c1010cd54c1970107068346f..88ae2c8717d873d46a2ceb85f1144abcc0adb5c5 100644 --- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py +++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py @@ -57,15 +57,13 @@ def init_benchmark(cl_type, monoview_algos, multiview_algos, args): Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. """ benchmark = {"monoview": {}, "multiview": {}} - all_multiview_packages = [name for _, name, isPackage - in pkgutil.iter_modules( - ['./mono_multi_view_classifiers/multiview_classifiers/']) if isPackage] + if "monoview" in cl_type: if monoview_algos == ['all']: benchmark["monoview"] = [name for _, name, isPackage in pkgutil.iter_modules([ - "./mono_multi_view_classifiers/monoview_classifiers"]) + "./mono_multi_view_classifiers/monoview_classifiers"]) if not isPackage] else: @@ -82,34 +80,6 @@ def init_benchmark(cl_type, monoview_algos, multiview_algos, args): return benchmark -# def gen_views_dictionnary(dataset_var, views): -# r"""Used to generate a dictionary mapping a view name (key) to it's index in the dataset (value). -# -# Parameters -# ---------- -# dataset_var : `h5py` dataset file -# The full dataset on which the benchmark will be done -# views : List of strings -# Names of the selected views on which the banchmark will be done -# -# Returns -# ------- -# viewDictionary : Dictionary -# Dictionary mapping the view names totheir indexin the full dataset. 
-# """ -# datasets_names = dataset_var.get_view_dict().keys() -# views_dictionary = {} -# for dataset_name in datasets_names: -# if dataset_name[:4] == "View": -# view_name = dataset_var.get(dataset_name).attrs["name"] -# if type(view_name) == bytes: -# view_name = view_name.decode("utf-8") -# if view_name in views: -# views_dictionary[view_name] = int(dataset_name[4:]) -# -# return views_dictionary - - def init_argument_dictionaries(benchmark, views_dictionary, nb_class, init_kwargs): argument_dictionaries = {"monoview": [], "multiview": []} @@ -263,6 +233,17 @@ def get_path_dict(multiview_classifier_args): def is_dict_in(dictionary): + """ + Returns True if any of the dictionary's values is itself a dictionary. + + Parameters + ---------- + dictionary + + Returns + ------- + + """ paths = [] for key, value in dictionary.items(): if isinstance(value, dict): @@ -271,6 +252,24 @@ def is_dict_in(dictionary): def gen_multiple_kwargs_combinations(cl_kwrags): + """ + Generates all the possible combinations of the asked args + + Parameters + ---------- + cl_kwrags : dict + The arguments, with one at least having multiple values + + Returns + ------- + kwargs_combination : list + The list of all the combinations of arguments + + reduced_kwargs_combination : list + The reduced names and values of the arguments that will be used in the naming + process of the different classifiers + + """ values = list(cl_kwrags.values()) listed_values = [[_] if type(_) is not list else _ for _ in values] values_cartesian_prod = [_ for _ in itertools.product(*listed_values)] @@ -292,6 +291,39 @@ def gen_multiple_args_dictionnaries(nb_class, kwargs_init, classifier, view_name=None, view_index=None, views_dictionary=None, framework="monoview"): + """ + Used in the case of multiple arguments asked in the config file. + Will combine the arguments to explore all the possibilities. + + Parameters + ---------- + nb_class : int + The number of classes in the dataset + + kwargs_init : dict + The arguments given in the config file + + classifier : str + The name of the classifier for which multiple arguments have been asked + + view_name : str + The name of the view in consideration. + + view_index : int + The index of the view in consideration + + views_dictionary : dict + The dictionary of all the views' indices and their names + + framework : str + Either monoview or multiview + + Returns + ------- + args_dictionaries : list + The list of all the possible combinations of asked arguments + + """ if framework=="multiview": classifier_config = get_path_dict(kwargs_init[classifier]) else: @@ -322,12 +354,12 @@ def init_kwargs(args, classifiers_names, framework="monoview"): ---------- args : parsed args objects All the args passed by the user. - classifiers-names : list of strings + classifiers_names : list of strings List of the benchmarks's monoview classifiers names. Returns ------- - monoviewKWARGS : Dictionary of dictionaries + kwargs : Dictionary Dictionary resuming all the specific arguments for the benchmark, one dictionary for each classifier. For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`""" @@ -351,7 +383,25 @@ def init_kwargs(args, classifiers_names, framework="monoview"): def init_kwargs_func(args, benchmark): - monoview_kwargs = init_kwargs(args, benchmark["monoview"]) + """ + Dispatches the kwargs initialization to monoview and multiview and creates + the kwargs variable + + Parameters + ---------- + args : parsed args objects + All the args passed by the user. 
+ + benchmark : dict + The names of the mono- and multi-view classifiers to run in the benchmark + + Returns + ------- + + kwargs : dict + The arguments for each mono- and multiview algorithm + """ + monoview_kwargs = init_kwargs(args, benchmark["monoview"], framework="monoview") multiview_kwargs = init_kwargs(args, benchmark["multiview"], framework="multiview") kwargs = {"monoview":monoview_kwargs, "multiview":multiview_kwargs} return kwargs @@ -373,31 +423,45 @@ def init_kwargs_func(args, benchmark): # return multiview_kwargs -def init_multiview_arguments(args, benchmark, views, views_indices, - argument_dictionaries, random_state, directory, - results_monoview, classification_indices): - """Used to add each monoview exeperience args to the list of monoview experiences args""" - logging.debug("Start:\t Initializing multiview classifiers arguments") - multiview_arguments = [] - if "multiview" in benchmark: - for multiview_algo_name in benchmark["multiview"]: - mutliview_module = getattr(multiview_classifiers, - multiview_algo_name) - - multiview_arguments += mutliview_module.getArgs(args, benchmark, - views, views_indices, - random_state, - directory, - results_monoview, - classification_indices) - argument_dictionaries["multiview"] = multiview_arguments - logging.debug("Start:\t Initializing multiview classifiers arguments") - return argument_dictionaries +# def init_multiview_arguments(args, benchmark, views, views_indices, +# argument_dictionaries, random_state, directory, +# results_monoview, classification_indices): +# """Used to add each monoview exeperience args to the list of monoview experiences args""" +# logging.debug("Start:\t Initializing multiview classifiers arguments") +# multiview_arguments = [] +# if "multiview" in benchmark: +# for multiview_algo_name in benchmark["multiview"]: +# mutliview_module = getattr(multiview_classifiers, +# multiview_algo_name) +# +# multiview_arguments += mutliview_module.getArgs(args, benchmark, +# views, views_indices, +# random_state, +# directory, +# results_monoview, +# classification_indices) +# argument_dictionaries["multiview"] = multiview_arguments +# logging.debug("Start:\t Initializing multiview classifiers arguments") +# return argument_dictionaries def arange_metrics(metrics, metric_princ): """Used to get the metrics list in the right order so that - the first one is the principal metric specified in args""" + the first one is the principal metric specified in args + + Parameters + ---------- + metrics : list of lists + The metrics that will be used in the benchmark + + metric_princ : str + The name of the metric that needs to be used for the hyper-parameter + optimization process + + Returns + ------- + metrics : list of lists + The metrics list, but arranged so the first one is the principal one.""" if [metric_princ] in metrics: metric_index = metrics.index([metric_princ]) first_metric = metrics[0] @@ -410,6 +474,31 @@ def arange_metrics(metrics, metric_princ): def benchmark_init(directory, classification_indices, labels, labels_dictionary, k_folds): + """ + Initializes the benchmark by saving the indices of the train + examples and the cross validation folds. 
+ + Parameters + ---------- + directory : str + The benchmark's result directory + + classification_indices : numpy array + The indices of the examples for the train/test split + + labels : numpy array + The labels of the dataset + + labels_dictionary : dict + The dictionary with labels as keys and their names as values + + k_folds : sklearn.model_selection.Folds object + The folds for the cross validation process + + Returns + ------- + + """ logging.debug("Start:\t Benchmark initialization") if not os.path.exists(os.path.dirname(directory + "train_labels.csv")): try: @@ -448,8 +537,7 @@ def exec_one_benchmark(core_index=-1, labels_dictionary=None, directory=None, benchmark=None, views=None, views_indices=None, flag=None, labels=None, exec_monoview_multicore=exec_monoview_multicore, - exec_multiview_multicore=exec_multiview_multicore, - init_multiview_arguments=init_multiview_arguments): + exec_multiview_multicore=exec_multiview_multicore,): """Used to run a benchmark using one core. ExecMonoview_multicore, initMultiviewArguments and exec_multiview_multicore args are only used for tests""" @@ -469,14 +557,6 @@ def exec_one_benchmark(core_index=-1, labels_dictionary=None, directory=None, for argument in argument_dictionaries["Monoview"]] logging.debug("Done:\t monoview benchmark") - logging.debug("Start:\t multiview arguments initialization") - # argument_dictionaries = initMultiviewArguments(args, benchmark, views, - # views_indices, - # argument_dictionaries, - # random_state, directory, - # resultsMonoview, - # classification_indices) - logging.debug("Done:\t multiview arguments initialization") logging.debug("Start:\t multiview benchmark") results_multiview = [ @@ -501,8 +581,7 @@ def exec_one_benchmark_multicore(nb_cores=-1, labels_dictionary=None, benchmark=None, views=None, views_indices=None, flag=None, labels=None, exec_monoview_multicore=exec_monoview_multicore, - exec_multiview_multicore=exec_multiview_multicore, - init_multiview_arguments=init_multiview_arguments): + exec_multiview_multicore=exec_multiview_multicore,): """Used to run a benchmark using multiple cores. 
ExecMonoview_multicore, initMultiviewArguments and exec_multiview_multicore args are only used for tests""" @@ -568,13 +647,11 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, hyper_param_search=None, metrics=None, argument_dictionaries=None, benchmark=None, views=None, views_indices=None, - flag=None, labels=None, - exec_monoview_multicore=exec_monoview_multicore, - exec_multiview_multicore=exec_multiview_multicore, - init_multiview_arguments=init_multiview_arguments): + flag=None, labels=None,): results_monoview, labels_names = benchmark_init(directory, classification_indices, labels, labels_dictionary, k_folds) + logging.getLogger('matplotlib.font_manager').disabled = True logging.debug("Start:\t monoview benchmark") for arguments in argument_dictionaries["monoview"]: X = dataset_var.get_v(arguments["view_index"]) @@ -696,7 +773,8 @@ def exec_benchmark(nb_cores, stats_iter, nb_multiclass, directory, labels_dictionary, nb_examples, - nb_labels) + nb_labels, + dataset_var.example_ids) logging.debug("Done:\t Analyzing predictions") delete(benchmark_arguments_dictionaries, nb_cores, dataset_var) return results_mean_stds diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py index cc637d2757c1b04aae97e7b634ab852d04fdecb2..875e3763ab77ea4ea771e6653aa80e2242b87ef2 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py @@ -16,7 +16,7 @@ from . import monoview_utils from .analyze_result import execute # Import own modules from .. import monoview_classifiers -from ..utils.dataset import get_value, extract_subset, Dataset +from ..utils.dataset import extract_subset, Dataset from ..utils import hyper_parameter_search # Author-Info @@ -97,15 +97,19 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, logging.debug("Start:\t Predicting") y_train_pred = classifier.predict(X_train) y_test_pred = classifier.predict(X_test) - full_labels_pred = np.zeros(Y.shape, dtype=int) - 100 + + #Filling the full prediction in the right order + full_pred = np.zeros(Y.shape, dtype=int) - 100 for trainIndex, index in enumerate(classificationIndices[0]): - full_labels_pred[index] = y_train_pred[trainIndex] + full_pred[index] = y_train_pred[trainIndex] for testIndex, index in enumerate(classificationIndices[1]): - full_labels_pred[index] = y_test_pred[testIndex] + full_pred[index] = y_test_pred[testIndex] + if X_test_multiclass != []: y_test_multiclass_pred = classifier.predict(X_test_multiclass) else: y_test_multiclass_pred = [] + logging.debug("Done:\t Predicting") t_end = time.time() - t_start @@ -124,7 +128,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, logging.debug("Done:\t Getting results") logging.debug("Start:\t Saving preds") - saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, + saveResults(stringAnalysis, outputFileName, full_pred, y_train_pred, y_train, imagesAnalysis, y_test) logging.info("Done:\t Saving results") @@ -132,7 +136,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices, if testFoldsPreds is None: testFoldsPreds = y_train_pred return monoview_utils.MonoviewResult(viewIndex, classifier_name, feat, metricsScores, - full_labels_pred, clKWARGS, + full_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds) # 
return viewIndex, [CL_type, feat, metricsScores, full_labels_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds] diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost_graalpy.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost_graalpy.py deleted file mode 100644 index 3ea726756552ef3a00f3feb4944e8301c73b28be..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost_graalpy.py +++ /dev/null @@ -1,277 +0,0 @@ -import logging - -import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.utils.validation import check_is_fitted - -from ..metrics import zero_one_loss -from ..monoview.additions.BoostUtils import StumpsClassifiersGenerator, \ - BaseBoost -from ..monoview.monoview_utils import CustomRandint, \ - BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero - -classifier_class_name = "AdaboostGraalpy" - -class AdaBoostGP(BaseEstimator, ClassifierMixin, BaseBoost): - """Scikit-Learn compatible AdaBoost classifier. Original code by Pascal Germain, adapted by Jean-Francis Roy. - - - Parameters - ---------- - - n_iterations : int, optional - The number of iterations of the algorithm. Defaults to 200. - - iterations_to_collect_as_hyperparameters : list - Iteration numbers to collect while learning, that will be converted as hyperparameter values at evaluation time. - Defaults to None. - classifiers_generator : Transformer, optional - A transformer to convert input samples in voters' outputs. Default: Decision stumps transformer, with 10 stumps - per attributes. - callback_function : function, optional - A function to call at each iteration that is supplied learning information. Defaults to None. - - n_stumps : int ( default : 10) - - self_complemented : boolean (default : True - - Attributes - ---------- - n_iterations : int, optional - The number of iterations of the algorithm. Defaults to 200. - iterations_to_collect_as_hyperparameters : list - Iteration numbers to collect while learning, that will be converted as hyperparameter values at evaluation time. - Defaults to None. - classifiers_generator : Transformer, optional - A transformer to convert input samples in voters' outputs. Default: Decision stumps transformer, with 10 stumps - per attributes. - callback_function : function, optional - A function to call at each iteration that is supplied learning information. Defaults to None. - - """ - - def __init__(self, n_iterations=200, - iterations_to_collect_as_hyperparameters=True, - classifiers_generator=None, callback_function=None, - n_stumps=10, self_complemented=True): - - self.n_iterations = n_iterations - self.n_stumps = n_stumps - self.iterations_to_collect_as_hyperparameters = iterations_to_collect_as_hyperparameters - self.estimators_generator = classifiers_generator - self.callback_function = callback_function - self.self_complemented = self_complemented - - def fit(self, X, y): - """Fits the algorithm on training data. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - The input data. - y : ndarray of shape (n_samples, ) - The input labels. - - Returns - ------- - self - - """ - y_neg = change_label_to_minus(y) - - if self.estimators_generator is None: - self.estimators_generator = StumpsClassifiersGenerator( - n_stumps_per_attribute=self.n_stumps, - self_complemented=self.self_complemented) - - # Step 1: We fit the classifiers generator and get its classification matrix. 
- self.estimators_generator.fit(X, y_neg) - # hint: This is equivalent to construct a new X - classification_matrix = self._binary_classification_matrix(X) - - n_samples, n_voters = classification_matrix.shape - # logging.debug("n_voters = {}".format(n_voters)) - - # Step 2: We initialize the weights on the samples and the weak classifiers. - sample_weights = np.ones(n_samples) / n_samples - alpha_weights = np.zeros(n_voters) - self.losses = [] - - # Step 3: We loop for each iteration. - self.collected_weight_vectors_ = [] - for t in range(self.n_iterations): - - # Step 4: We find the classifier that maximizes the success, weighted by the sample weights. - classifier_successes = np.dot(classification_matrix.T, - sample_weights * y_neg) - - best_voter_index = np.argmax(classifier_successes) - success = classifier_successes[best_voter_index] - - if success >= 1.0: - logging.info("AdaBoost stopped : perfect classifier found!") - self.weights_ = np.zeros(n_voters) - self.weights_[best_voter_index] = 1.0 - return self - - # Step 5: We calculate the alpha_t parameter and update the alpha weights. - alpha = 0.5 * np.log((1.0 + success) / (1.0 - success)) - alpha_weights[best_voter_index] += alpha - - # logging.debug("{} : {}".format(t, str(alpha))) - - # Step 6: We update the sample weights. - sample_weights *= np.exp( - -1 * alpha * y_neg * classification_matrix[:, best_voter_index]) - - normalization_constant = sample_weights.sum() - sample_weights = sample_weights / normalization_constant - - # We collect iteration information for later evaluation. - if self.iterations_to_collect_as_hyperparameters: - weights = alpha_weights / np.sum(alpha_weights) - self.collected_weight_vectors_.append(weights.copy()) - - loss = zero_one_loss.score(y_neg, np.sign(np.sum( - np.multiply(classification_matrix, - alpha_weights / np.sum(alpha_weights)), axis=1))) - self.losses.append(loss) - - if self.callback_function is not None: - self.callback_function(t, alpha_weights, normalization_constant, - self.estimators_generator, self.weights_) - - self.weights_ = alpha_weights / np.sum(alpha_weights) - self.losses = np.array(self.losses) - self.learner_info_ = { - 'n_nonzero_weights': np.sum(self.weights_ > 1e-12)} - - return self - - def predict(self, X): - """Predict inputs using the fit classifier. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - The data to classify. - - Returns - ------- - predictions : ndarray of shape (n_samples, ) - The estimated labels. - - """ - check_is_fitted(self, 'weights_') - classification_matrix = self._binary_classification_matrix(X) - - if self.iterations_to_collect_as_hyperparameters: - self.test_preds = [] - for weight_vector in self.collected_weight_vectors_: - preds = np.sum(np.multiply(classification_matrix, - weight_vector), axis=1) - self.test_preds.append(change_label_to_zero(np.sign(preds))) - self.test_preds = np.array(self.test_preds) - margins = np.squeeze( - np.asarray(np.dot(classification_matrix, self.weights_))) - return change_label_to_zero( - np.array([int(x) for x in np.sign(margins)])) - - -class AdaboostGraalpy(AdaBoostGP, BaseMonoviewClassifier): - """AdaboostGraalpy - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. 
- - n_iterations : in number of iterations (default : 200) - - n_stumps : int (default 1) - - kwargs : others arguments - - - Attributes - ---------- - param_names : - - distribs : - - weird_strings : - - n_stumps : - - nbCores : - - """ - def __init__(self, random_state=None, n_iterations=200, n_stumps=1, - **kwargs): - - super(AdaboostGraalpy, self).__init__( - n_iterations=n_iterations, - n_stumps=n_stumps - ) - self.param_names = ["n_iterations", "n_stumps", "random_state"] - self.distribs = [CustomRandint(low=1, high=500), [n_stumps], - [random_state]] - self.classed_params = [] - self.weird_strings = {} - self.n_stumps = n_stumps - if "nbCores" not in kwargs: - self.nbCores = 1 - else: - self.nbCores = kwargs["nbCores"] - - # def canProbas(self): - # """ - # Used to know if the classifier can return label probabilities - # - # Returns - # ------- - # True in any case - # """ - # return True - - def getInterpret(self, directory, y_test): - """ - - Parameters - ---------- - directory : - - y_test : - - Returns - ------- - retur string of interpret - """ - np.savetxt(directory + "train_metrics.csv", self.losses, delimiter=',') - np.savetxt(directory + "y_test_step.csv", self.test_preds, - delimiter=',') - step_metrics = [] - for step_index in range(self.test_preds.shape[0] - 1): - step_metrics.append(zero_one_loss.score(y_test, - self.test_preds[step_index, - :])) - step_metrics = np.array(step_metrics) - np.savetxt(directory + "step_test_metrics.csv", step_metrics, - delimiter=',') - return "" - - -# def formatCmdArgs(args): -# """Used to format kwargs for the parsed args""" -# kwargsDict = {"n_iterations": args.AdG_n_iter, -# "n_stumps": args.AdG_stumps, } -# return kwargsDict - - -def paramsToSet(nIter, random_state): - """Used for weighted linear early fusion to generate random search sets""" - paramsSet = [] - for _ in range(nIter): - paramsSet.append({"n_iterations": random_state.randint(1, 500), }) - return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/cq_boost.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/cq_boost.py deleted file mode 100644 index fc9b44ed7d608d61b084d1915a6ee6084dbea05a..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/cq_boost.py +++ /dev/null @@ -1,76 +0,0 @@ -import numpy as np - -from ..monoview.additions.BoostUtils import getInterpretBase -from ..monoview.additions.CQBoostUtils import ColumnGenerationClassifier -from ..monoview.monoview_utils import CustomUniform, CustomRandint, \ - BaseMonoviewClassifier - -classifier_class_name = "CQBoost" - -class CQBoost(ColumnGenerationClassifier, BaseMonoviewClassifier): - - def __init__(self, random_state=None, mu=0.01, epsilon=1e-06, n_stumps=1, - n_max_iterations=None, estimators_generator="Stumps", - max_depth=1, **kwargs): - super(CQBoost, self).__init__( - random_state=random_state, - mu=mu, - epsilon=epsilon, - estimators_generator=estimators_generator, - n_max_iterations=n_max_iterations, - max_depth=max_depth - ) - self.param_names = ["mu", "epsilon", "n_stumps", "random_state", - "n_max_iterations", "estimators_generator", - "max_depth"] - self.distribs = [CustomUniform(loc=0.5, state=1.0, multiplier="e-"), - CustomRandint(low=1, high=15, multiplier="e-"), - [n_stumps], [random_state], [n_max_iterations], - ["Stumps", "Trees"], CustomRandint(low=1, high=5)] - self.classed_params = [] - self.weird_strings = {} - self.n_stumps = n_stumps - if "nbCores" not in kwargs: - 
self.nbCores = 1 - else: - self.nbCores = kwargs["nbCores"] - - # def canProbas(self): - # """Used to know if the classifier can return label probabilities""" - # return False - - def getInterpret(self, directory, y_test): - np.savetxt(directory + "train_metrics.csv", self.train_metrics, - delimiter=',') - np.savetxt(directory + "c_bounds.csv", self.c_bounds, - delimiter=',') - np.savetxt(directory + "y_test_step.csv", self.step_decisions, - delimiter=',') - step_metrics = [] - for step_index in range(self.step_decisions.shape[1] - 1): - step_metrics.append(self.plotted_metric.score(y_test, - self.step_decisions[:, - step_index])) - step_metrics = np.array(step_metrics) - np.savetxt(directory + "step_test_metrics.csv", step_metrics, - delimiter=',') - return getInterpretBase(self, directory, "CQBoost", self.weights_, - y_test) - - -# def formatCmdArgs(args): -# """Used to format kwargs for the parsed args""" -# kwargsDict = {"mu": args.CQB_mu, -# "epsilon": args.CQB_epsilon, -# "n_stumps": args.CQB_stumps, -# "n_max_iterations": args.CQB_n_iter} -# return kwargsDict - - -def paramsToSet(nIter, randomState): - """Used for weighted linear early fusion to generate random search sets""" - paramsSet = [] - for _ in range(nIter): - paramsSet.append({"mu": 10 ** -randomState.uniform(0.5, 1.5), - "epsilon": 10 ** -randomState.randint(1, 15)}) - return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq.py deleted file mode 100644 index ec0bd7e7c56b46720afd2e759cec7a65957d6acd..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq.py +++ /dev/null @@ -1,652 +0,0 @@ -from ..monoview.monoview_utils import CustomUniform, BaseMonoviewClassifier - -#### Algorithm code #### - -# -*- coding:utf-8 -*- -""" MinCq learning algorithm - -Related papers: -[1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011) -[2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2014) - -http://graal.ift.ulaval.ca/majorityvote/ -""" -__author__ = 'Jean-Francis Roy' -import time -import logging -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.metrics.pairwise import rbf_kernel, linear_kernel, \ - polynomial_kernel -# from qp import QP -from ..monoview.additions.BoostUtils import ConvexProgram as QP - - -classifier_class_name = "MinCQ" - -# from majority_vote import MajorityVote -# from voter import StumpsVotersGenerator, KernelVotersGenerator - -class MinCqLearner(BaseEstimator, ClassifierMixin): - """ - MinCq algorithm learner. See [1, 2] - - Parameters - ---------- - mu : float - The fixed value of the first moment of the margin. - - voters_type : string, optional (default='kernel') - Specifies the type of voters. - It must be one of 'kernel', 'stumps' or 'manual'. If 'manual' is specified, the voters have to be manually set - using the "voters" parameter of the fit function. - - n_stumps_per_attribute : int, optional (default=10) - Specifies the amount of decision stumps per attribute. - It is only significant with 'stumps' voters_type. - - kernel : string, optional (default='rbf') - Specifies the kernel type to be used in the algorithm. - It must be one of 'linear', 'poly', 'rbf'. - - degree : int, optional (default=3) - Degree of the polynomial kernel function ('poly'). - Ignored by all other kernels. 
- - gamma : float, optional (default=0.0) - Kernel coefficient for 'rbf' and 'poly'. - If gamma is 0.0 then 1/n_features will be used instead. - """ - - def __init__(self, mu, voters_type, n_stumps_per_attribute=10, kernel='rbf', - degree=3, gamma=0.0, self_complemented=True): - assert 0 < mu <= 1, "MinCqLearner: mu parameter must be in (0, 1]" - self.mu = mu - self.voters_type = voters_type - self.n_stumps_per_attribute = n_stumps_per_attribute - self.kernel = kernel - self.degree = degree - self.gamma = gamma - self.log = False - self.self_complemented = self_complemented - - self.majority_vote = None - self.qp = None - - def fit(self, X, y, voters=None): - """ Learn a majority vote weights using MinCq. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Training data - - y_reworked : ndarray, shape=(n_samples,), optional - Training labels - - voters : shape=(n_voters,), optional - A priori generated voters - """ - # Preparation of the majority vote, using a voter generator that depends on class attributes - if (np.unique(y) != [-1, 1]).any(): - y_reworked = np.copy(y) - y_reworked[np.where(y_reworked == 0)] = -1 - else: - y_reworked = y - - assert self.voters_type in ['stumps', 'kernel', - 'manual'], "MinCqLearner: voters_type must be 'stumps', 'kernel' or 'manual'" - - if self.voters_type == 'manual': - if voters is None: - logging.error( - "Manually set voters is True, but no voters have been set.") - return self - - else: - voters_generator = None - - if self.voters_type == 'stumps': - assert self.n_stumps_per_attribute >= 1, 'MinCqLearner: n_stumps_per_attribute must be positive' - voters_generator = StumpsVotersGenerator( - self.n_stumps_per_attribute) - - elif self.voters_type == 'kernel': - assert self.kernel in ['linear', 'poly', - 'rbf'], "MinCqLearner: kernel must be 'linear', 'poly' or 'rbf'" - - gamma = self.gamma - if gamma == 0.0: - gamma = 1.0 / np.shape(X)[1] - - if self.kernel == 'linear': - voters_generator = KernelVotersGenerator(linear_kernel) - elif self.kernel == 'poly': - voters_generator = KernelVotersGenerator(polynomial_kernel, - degree=self.degree, - gamma=gamma) - elif self.kernel == 'rbf': - voters_generator = KernelVotersGenerator(rbf_kernel, - gamma=gamma) - - voters = voters_generator.generate(X, y_reworked, - self_complemented=self.self_complemented) - - if self.log: - logging.info("MinCq training started...") - logging.info("Training dataset shape: {}".format(str(np.shape(X)))) - logging.info("Number of voters: {}".format(len(voters))) - self.majority_vote = MajorityVote(voters) - n_base_voters = len(self.majority_vote.weights) - - # Preparation and resolution of the quadratic program - - if self.log: - logging.info("Preparing QP...") - self._prepare_qp(X, y_reworked) - beg = time.time() - try: - if self.log: - logging.info("Solving QP...") - solver_weights = self.qp.solve() - - # Conversion of the weights of the n first voters to weights on the implicit 2n voters. - # See Section 7.1 of [2] for an explanation. 
- self.majority_vote.weights = np.array( - [2 * q - 1.0 / n_base_voters for q in solver_weights]) - if self.log: - logging.info( - "First moment of the margin on the training set: {:.4f}".format( - np.mean(y_reworked * self.majority_vote.margin(X)))) - - except Exception as e: - logging.error( - "{}: Error while solving the quadratic program: {}.".format( - str(self), str(e))) - self.majority_vote = None - self.cbound_train = self.majority_vote.cbound_value(X, y_reworked) - end=time.time() - self.train_time=end-beg - return self - - def predict(self, X, save_data=True): - """ Using previously learned majority vote weights, predict the labels of new data points. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Samples to predict - - Returns - ------- - predictions : ndarray, shape=(n_samples,) - The predicted labels - """ - if self.log: - logging.info("Predicting...") - if self.majority_vote is None: - logging.error( - "{}: Error while predicting: MinCq has not been fit or fitting has failed. Will output invalid labels".format( - str(self))) - return np.zeros((len(X),)) - if save_data: - self.x_test = X - - vote = self.majority_vote.vote(X) - vote[np.where(vote == -1)] = 0 - return vote - - def predict_proba(self, X): - """ Using previously learned majority vote weights, predict the labels of new data points with a confidence - level. The confidence level is the margin of the majority vote. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Samples to predict - - Returns - ------- - predictions : ndarray, shape=(n_samples,) - The predicted labels - """ - probabilities = np.zeros((np.shape(X)[0], 2)) - - # The margin is between -1 and 1, we rescale it to be between 0 and 1. - margins = self.majority_vote.margin(X) - margins += 1 - margins /= 2 - - # Then, the conficence for class +1 is set to the margin, and confidence for class -1 is set to 1 - margin. - probabilities[:, 1] = margins - probabilities[:, 0] = 1 - margins - return probabilities - - def _prepare_qp(self, X, y): - """ Prepare MinCq's quadratic program. See Program 1 of [2] for more details on its content. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Training data - - y : ndarray, shape=(n_samples,) - Training labels - """ - - self.qp = QP() - - n_features = len(self.majority_vote.voters) - n_examples = len(X) - classification_matrix = self.majority_vote.classification_matrix(X) - - # Objective function. - self.qp.quadratic_func = 2.0 / n_examples * classification_matrix.T.dot( - classification_matrix) - self.qp.linear_func = np.matrix( - np.matrix(-1.0 * np.mean(self.qp.quadratic_func / 2.0, axis=1))).T - - # First moment of the margin fixed to mu. - a_matrix = 2.0 / n_examples * y.T.dot(classification_matrix) - self.qp.add_equality_constraints(a_matrix, - self.mu + 1.0 / 2 * np.mean(a_matrix)) - - # Lower and upper bounds on the variables - self.qp.add_lower_bound(0.0) - self.qp.add_upper_bound(1.0 / n_features) - - -class MajorityVote(object): - """ A Majority Vote of real-valued functions. - - Parameters - ---------- - voters : ndarray of Voter instances - The voters of the majority vote. Each voter must take an example as an input, and output a real value in [-1,1]. - - weights : ndarray, optional (default: uniform distribution) - The weights associated to each voter. 
- """ - - def __init__(self, voters, weights=None): - self._voters = np.array(voters) - - if weights is not None: - assert (len(voters) == len(weights)) - self._weights = np.array(weights) - else: - self._weights = np.array([1.0 / len(voters)] * len(voters)) - - def vote(self, X): - """ Returns the vote of the Majority Vote on a list of samples. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Input data to classify. - - Returns - ------- - votes : ndarray, shape=(n_samples,), where each value is either -1 or 1 - The vote of the majority vote for each sample. - """ - margins = self.margin(X) - return np.array([int(x) for x in np.sign(margins)]) - - def margin(self, X): - """ Returns the margin of the Majority Vote on a list of samples. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Input data on which to calculate the margin. - - Returns - ------- - margins : ndarray, shape=(n_samples,), where each value is either -1 or 1 - The margin of the majority vote for each sample. - """ - classification_matrix = self.classification_matrix(X) - return np.squeeze( - np.asarray(np.dot(classification_matrix, self.weights))) - - def classification_matrix(self, X): - """ Returns the classification matrix of the majority vote. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Input data to classify - - Returns - ------- - classification_matrix : ndrray, shape=(n_samples, n_voters) - A matrix that contains the value output by each voter, for each sample. - - """ - return np.matrix([v.vote(X) for v in self._voters]).T - - @property - def weights(self): - return self._weights - - @weights.setter - def weights(self, weights): - self._weights = np.array(weights) - - @property - def voters(self): - return self._voters - - @voters.setter - def voters(self, voters): - self._voters = np.array(voters) - - def cbound_value(self, X, y): - """ Returns the value of the C-bound, evaluated on given examples. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_feature) - Input data - y : ndarray, shape=(n_samples, ) - Input labels, where each label is either -1 or 1. - """ - assert np.all(np.in1d(y, [-1, - 1])), 'cbound_value: labels should be either -1 or 1' - - classification_matrix = self.classification_matrix(X) - first_moment = float( - 1.0 / len(y) * classification_matrix.dot(self.weights).dot(y)) - second_moment = float(1.0 / len(y) * self.weights.T.dot( - classification_matrix.T.dot(classification_matrix)).dot( - self.weights)) - - return 1 - (first_moment ** 2 / second_moment) - - -# -*- coding:utf-8 -*- -__author__ = "Jean-Francis Roy" - -import numpy as np - - -class Voter(object): - """ Base class for a voter (function X -> [-1, 1]), where X is an array of samples - """ - - def __init__(self): - pass - - def vote(self, X): - """ Returns the output of the voter, on a sample list X - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Input data to classify - - Returns - ------- - votes : ndarray, shape=(n_samples,) - The result the the voter function, for each sample - """ - raise NotImplementedError("Voter.vote: Not implemented.") - - -class BinaryKernelVoter(Voter): - """ A Binary Kernel Voter, which outputs the value of a kernel function whose first example is fixed a priori. 
- The sign of the output depends on the label (-1 or 1) of the sample on which the kernel voter is based - - Parameters - ---------- - x : ndarray, shape=(n_features,) - The base sample's description vector - - y : int, -1 or 1 - The label of the base sample. Determines if the voter thinks "negative" or "positive" - - kernel_function : function - The kernel function takes two samples and returns a similarity value. If the kernel has parameters, they should - be set using kwargs parameter - - kwargs : keyword arguments (optional) - Additional parameters for the kernel function - """ - - def __init__(self, x, y, kernel_function, **kwargs): - assert (y in {-1, 1}) - super(BinaryKernelVoter, self).__init__() - self._x = x - self._y = y - self._kernel_function = kernel_function - self._kernel_kwargs = kwargs - - def vote(self, X): - base_point_array = np.array([self._x]) - votes = self._y * self._kernel_function(base_point_array, X, - **self._kernel_kwargs) - votes = np.squeeze(np.asarray(votes)) - - return votes - - -class DecisionStumpVoter(Voter): - """ - Generic Attribute Threshold Binary Classifier - - Parameters - ---------- - attribute_index : int - The attribute to consider for the classification - - threshold : float - The threshold value for classification rule - - direction : int (-1 or 1) - Used to reverse classification decision - - Attributes - ---------- - - attribute_index : - threshold : - direction : - """ - - def __init__(self, attribute_index, threshold, direction=1): - super(DecisionStumpVoter, self).__init__() - self.attribute_index = attribute_index - self.threshold = threshold - self.direction = direction - - def vote(self, points): - return [((point[ - self.attribute_index] > self.threshold) * 2 - 1) * self.direction - for point in points] - - -class VotersGenerator(object): - """ Base class to create a set of voters using training samples - """ - - def generate(self, X, y=None, self_complemented=False): - """ Generates the voters using samples. - - Parameters - ---------- - X : ndarray, shape=(n_samples, n_features) - Input data on which to base the voters - - y : ndarray, shape=(n_samples,), optional - Input labels, usually determines the decision polarity of each voter - - self_complemented : bool - Determines if complement voters should be generated or not - - Returns - ------- - voters : ndarray - An array of voters - """ - raise NotImplementedError("VotersGenerator.generate: not implemented") - - -class StumpsVotersGenerator(VotersGenerator): - """ Decision Stumps Voters generator. - - Parameters - ---------- - n_stumps_per_attribute : int, (default=10) - Determines how many decision stumps will be created for each attribute. - """ - - def __init__(self, n_stumps_per_attribute=10): - self._n_stumps_per_attribute = n_stumps_per_attribute - - def _find_extremums(self, X, i): - mini = np.Infinity - maxi = -np.Infinity - for x in X: - if x[i] < mini: - mini = x[i] - if x[i] > maxi: - maxi = x[i] - return mini, maxi - - def generate(self, X, y=None, self_complemented=False, - only_complements=False): - """ - - Parameters - ---------- - X - y - self_complemented - only_complements - - Returns - ------- - - """ - voters = [] - if len(X) != 0: - for i in range(len(X[0])): - t = self._find_extremums(X, i) - inter = (t[1] - t[0]) / (self._n_stumps_per_attribute + 1) - - if inter != 0: - # If inter is zero, the attribute is useless as it has a constant value. We do not add stumps for - # this attribute. 
- for x in range(self._n_stumps_per_attribute): - - if not only_complements: - voters.append( - DecisionStumpVoter(i, t[0] + inter * (x + 1), - 1)) - - if self_complemented or only_complements: - voters.append( - DecisionStumpVoter(i, t[0] + inter * (x + 1), - -1)) - - return np.array(voters) - - -class KernelVotersGenerator(VotersGenerator): - """ Utility function to create binary kernel voters for each (x, y) sample. - - Parameters - ---------- - kernel_function : function - The kernel function takes two samples and returns a similarity value. If the kernel has parameters, they should - be set using kwargs parameter - - kwargs : keyword arguments (optional) - Additional parameters for the kernel function - """ - - def __init__(self, kernel_function, **kwargs): - self._kernel_function = kernel_function - self._kernel_kwargs = kwargs - - def generate(self, X, y=None, self_complemented=False, - only_complements=False): - if y is None: - y = np.array([1] * len(X)) - - voters = [] - - for point, label in zip(X, y): - if not only_complements: - voters.append( - BinaryKernelVoter(point, label, self._kernel_function, - **self._kernel_kwargs)) - - if self_complemented or only_complements: - voters.append( - BinaryKernelVoter(point, -1 * label, self._kernel_function, - **self._kernel_kwargs)) - - return np.array(voters) - - -class MinCQ(MinCqLearner, BaseMonoviewClassifier): - - def __init__(self, random_state=None, mu=0.01, self_complemented=True, - n_stumps_per_attribute=10, **kwargs): - super(MinCQ, self).__init__(mu=mu, - voters_type='stumps', - n_stumps_per_attribute=n_stumps_per_attribute, - self_complemented=self_complemented - ) - self.param_names = ["mu", "n_stumps_per_attribute", "random_state"] - self.distribs = [CustomUniform(loc=0.5, state=2.0, multiplier="e-"), - [n_stumps_per_attribute], [random_state]] - self.random_state = random_state - self.classed_params = [] - self.weird_strings = {} - if "nbCores" not in kwargs: - self.nbCores = 1 - else: - self.nbCores = kwargs["nbCores"] - - # def canProbas(self): - # """Used to know if the classifier can return label probabilities""" - # return True - - def set_params(self, **params): - self.mu = params["mu"] - self.random_state = params["random_state"] - self.n_stumps_per_attribute = params["n_stumps_per_attribute"] - return self - - def get_params(self, deep=True): - return {"random_state": self.random_state, "mu": self.mu, - "n_stumps_per_attribute": self.n_stumps_per_attribute} - - def getInterpret(self, directory, y_test): - interpret_string = "Train C_bound value : " + str(self.cbound_train) - y_rework = np.copy(y_test) - y_rework[np.where(y_rework == 0)] = -1 - interpret_string += "\n Test c_bound value : " + str( - self.majority_vote.cbound_value(self.x_test, y_rework)) - np.savetxt(directory+"times.csv", np.array([self.train_time, 0])) - return interpret_string - - def get_name_for_fusion(self): - return "MCQ" - -# -# def formatCmdArgs(args): -# """Used to format kwargs for the parsed args""" -# kwargsDict = {"mu": args.MCQ_mu, -# "n_stumps_per_attribute": args.MCQ_stumps} -# return kwargsDict - - -def paramsToSet(nIter, randomState): - """Used for weighted linear early fusion to generate random search sets""" - paramsSet = [] - for _ in range(nIter): - paramsSet.append({}) - return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy.py deleted file mode 100644 index 
8355dffc1a47dda9290a6cd57bbede64890d3454..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy.py +++ /dev/null @@ -1,141 +0,0 @@ -import numpy as np - -from ..monoview.additions.BoostUtils import StumpsClassifiersGenerator -from ..monoview.additions.MinCQUtils import RegularizedBinaryMinCqClassifier -from ..monoview.monoview_utils import BaseMonoviewClassifier, CustomUniform - - -classifier_class_name = "MinCQGraalpy" - -class MinCQGraalpy(RegularizedBinaryMinCqClassifier, BaseMonoviewClassifier): - """ - MinCQGraalpy extend of ``RegularizedBinaryMinCqClassifier `` - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. - - mu : float, (default: 0.01) - - self_complemented : bool (default : True) - - n_stumps_per_attribute : (default: =1 - - kwargs : others arguments - - - Attributes - ---------- - param_names - - distribs - - n_stumps_per_attribute - - classed_params - - weird_strings - - nbCores : number of cores - - """ - def __init__(self, random_state=None, mu=0.01, self_complemented=True, - n_stumps_per_attribute=1, **kwargs): - super(MinCQGraalpy, self).__init__(mu=mu, - estimators_generator=StumpsClassifiersGenerator( - n_stumps_per_attribute=n_stumps_per_attribute, - self_complemented=self_complemented), - ) - self.param_names = ["mu", "n_stumps_per_attribute", "random_state"] - self.distribs = [CustomUniform(loc=0.05, state=2.0, multiplier="e-"), - [n_stumps_per_attribute], [random_state]] - self.n_stumps_per_attribute = n_stumps_per_attribute - self.classed_params = [] - self.weird_strings = {} - self.random_state = random_state - if "nbCores" not in kwargs: - self.nbCores = 1 - else: - self.nbCores = kwargs["nbCores"] - - # def canProbas(self): - # """ - # Used to know if the classifier can return label probabilities - # Returns - # ------- - # False - # """ - # return False - - def set_params(self, **params): - """ - set parameter 'self.mu', 'self.random_state - 'self.n_stumps_per_attribute - - Parameters - ---------- - params - - Returns - ------- - self : object - Returns self. 
- """ - self.mu = params["mu"] - self.random_state = params["random_state"] - self.n_stumps_per_attribute = params["n_stumps_per_attribute"] - return self - - def get_params(self, deep=True): - """ - - Parameters - ---------- - deep : bool (default : true) not used - - Returns - ------- - dictianary with "random_state", "mu", "n_stumps_per_attribute" - """ - return {"random_state": self.random_state, "mu": self.mu, - "n_stumps_per_attribute": self.n_stumps_per_attribute} - - def getInterpret(self, directory, y_test): - """ - - Parameters - ---------- - directory - y_test - - Returns - ------- - string of interpret_string - """ - interpret_string = "Cbound on train :" + str(self.train_cbound) - np.savetxt(directory + "times.csv", np.array([self.train_time, 0])) - # interpret_string += "Train C_bound value : "+str(self.cbound_train) - # y_rework = np.copy(y_test) - # y_rework[np.where(y_rework==0)] = -1 - # interpret_string += "\n Test c_bound value : "+str(self.majority_vote.cbound_value(self.x_test, y_rework)) - return interpret_string - - def get_name_for_fusion(self): - return "MCG" - - -# def formatCmdArgs(args): -# """Used to format kwargs for the parsed args""" -# kwargsDict = {"mu": args.MCG_mu, -# "n_stumps_per_attribute": args.MCG_stumps} -# return kwargsDict - - -def paramsToSet(nIter, random_state): - """Used for weighted linear early fusion to generate random search sets""" - paramsSet = [] - for _ in range(nIter): - paramsSet.append({}) - return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy_tree.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy_tree.py deleted file mode 100644 index ac7a409d82e0b0698f1af913b4c1f2f41b9114d6..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/min_cq_graalpy_tree.py +++ /dev/null @@ -1,152 +0,0 @@ -import numpy as np - -from ..monoview.additions.BoostUtils import TreeClassifiersGenerator -from ..monoview.additions.MinCQUtils import RegularizedBinaryMinCqClassifier -from ..monoview.monoview_utils import BaseMonoviewClassifier, CustomUniform - -classifier_class_name = "MinCQGraalpyTree" - -class MinCQGraalpyTree(RegularizedBinaryMinCqClassifier, - BaseMonoviewClassifier): - """ - - Parameters - ---------- - random_state : - - mu : (default : 0.01) - - self_complemented : ( default : True) - - n_stumps_per_attribute : int ( default : 1) - max_depth : - - kwargs : others parameters - - - Attributes - ---------- - param_name : - - distribs : - - classed_params : - - n_stumps_per_attribute : int - - weird_strings : - - max_depth : - - random_state : - - nbCores : - """ - def __init__(self, random_state=None, mu=0.01, self_complemented=True, - n_stumps_per_attribute=1, max_depth=2, **kwargs): - - super(MinCQGraalpyTree, self).__init__(mu=mu, - estimators_generator=TreeClassifiersGenerator( - n_trees=n_stumps_per_attribute, - max_depth=max_depth, - self_complemented=self_complemented), - ) - self.param_names = ["mu", "n_stumps_per_attribute", "random_state", - "max_depth"] - self.distribs = [CustomUniform(loc=0.05, state=2.0, multiplier="e-"), - [n_stumps_per_attribute], [random_state], [max_depth]] - self.n_stumps_per_attribute = n_stumps_per_attribute - self.classed_params = [] - self.weird_strings = {} - self.max_depth = max_depth - self.random_state = random_state - if "nbCores" not in kwargs: - self.nbCores = 1 - else: - self.nbCores = kwargs["nbCores"] - - # def canProbas(self): - # """ - # 
Used to know if the classifier can return label probabilities - # - # Returns - # ------- - # True - # """ - # return True - - def set_params(self, **params): - """ - set parameter in the input dictionary - - Parameters - ---------- - params : dict parameter to set - - Returns - ------- - self : object - Returns self. - """ - self.mu = params["mu"] - self.random_state = params["random_state"] - self.n_stumps_per_attribute = params["n_stumps_per_attribute"] - self.max_depth = params["max_depth"] - return self - - def get_params(self, deep=True): - """ - get parameter - - Parameters - ---------- - deep : (boolean (default : True) not used - - Returns - ------- - dictionary of parameter as key and its values - """ - return {"random_state": self.random_state, "mu": self.mu, - "n_stumps_per_attribute": self.n_stumps_per_attribute, - "max_depth": self.max_depth} - - def getInterpret(self, directory, y_test): - """ - - Parameters - ---------- - directory : - - y_test : - - - Returns - ------- - string for interpretation interpret_string - """ - interpret_string = "Cbound on train :" + str(self.train_cbound) - np.savetxt(directory + "times.csv", np.array([self.train_time, 0])) - # interpret_string += "Train C_bound value : "+str(self.cbound_train) - # y_rework = np.copy(y_test) - # y_rework[np.where(y_rework==0)] = -1 - # interpret_string += "\n Test c_bound value : "+str(self.majority_vote.cbound_value(self.x_test, y_rework)) - return interpret_string - - def get_name_for_fusion(self): - return "MCG" - - -# def formatCmdArgs(args): -# """Used to format kwargs for the parsed args""" -# kwargsDict = {"mu": args.MCGT_mu, -# "n_stumps_per_attribute": args.MCGT_trees, -# "max_depth": args.MCGT_max_depth} -# return kwargsDict - - -def paramsToSet(nIter, randomState): - """Used for weighted linear early fusion to generate random search sets""" - paramsSet = [] - for _ in range(nIter): - paramsSet.append({}) - return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm.py deleted file mode 100644 index eb829fb97321b974951aa0802661050f3af59c54..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm.py +++ /dev/null @@ -1,125 +0,0 @@ -from pyscm.scm import SetCoveringMachineClassifier as scm - -from ..monoview.monoview_utils import CustomRandint, CustomUniform, \ - BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -# class DecisionStumpSCMNew(scm, BaseEstimator, ClassifierMixin): -# """docstring for SCM -# A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like -# CV, gridsearch, and so on ...""" -# -# def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): -# super(DecisionStumpSCMNew, self).__init__(model_type=model_type, max_rules=max_rules, p=p, random_state=random_state) -# # self.model_type = model_type -# # self.p = p -# # self.max_rules = max_rules -# # self.random_state = random_state -# # self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) -# -# # def fit(self, X, y): -# # print(self.clf.model_type) -# # self.clf.fit(X=X, y=y) -# # -# # def predict(self, X): -# # return self.clf.predict(X) -# # -# # def set_params(self, **params): -# # for key, value in iteritems(params): -# # if key 
== 'p': -# # self.p = value -# # if key == 'model_type': -# # self.model_type = value -# # if key == 'max_rules': -# # self.max_rules = value -# -# # def get_stats(self): -# # return {"Binary_attributes": self.clf.model_.rules} - - -classifier_class_name = "SCM" - -class SCM(scm, BaseMonoviewClassifier): - """ - SCM Classifier - Parameters - ---------- - random_state (default : None) - model_type : string (default: "conjunction") - max_rules : int number maximum of rules (default : 10) - p : float value(default : 0.1 ) - - kwarg : others arguments - - Attributes - ---------- - param_names - - distribs - - classed_params - - weird_strings - - """ - - def __init__(self, random_state=None, model_type="conjunction", - max_rules=10, p=0.1, **kwargs): - """ - - Parameters - ---------- - random_state - model_type - max_rules - p - kwargs - """ - super(SCM, self).__init__( - random_state=random_state, - model_type=model_type, - max_rules=max_rules, - p=p - ) - self.param_names = ["model_type", "max_rules", "p", "random_state"] - self.distribs = [["conjunction", "disjunction"], - CustomRandint(low=1, high=15), - CustomUniform(loc=0, state=1), [random_state]] - self.classed_params = [] - self.weird_strings = {} - - # def canProbas(self): - # """ - # Used to know if the classifier can return label probabilities - # - # Returns - # ------- - # return False in any case - # """ - # return False - - def getInterpret(self, directory, y_test): - interpretString = "Model used : " + str(self.model_) - return interpretString - - -# def formatCmdArgs(args): -# """Used to format kwargs for the parsed args""" -# kwargsDict = {"model_type": args.SCM_model_type, -# "p": args.SCM_p, -# "max_rules": args.SCM_max_rules} -# return kwargsDict - - -def paramsToSet(nIter, random_state): - paramsSet = [] - for _ in range(nIter): - paramsSet.append( - {"model_type": random_state.choice(["conjunction", "disjunction"]), - "max_rules": random_state.randint(1, 15), - "p": random_state.random_sample()}) - return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm_pregen.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm_pregen.py deleted file mode 100644 index 4b7ea990f2f5fd0b3d09acc14952e98770509fd7..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/scm_pregen.py +++ /dev/null @@ -1,203 +0,0 @@ -import os - -import numpy as np -from pyscm.scm import SetCoveringMachineClassifier as scm - -from ..monoview.additions.PregenUtils import PregenClassifier -from ..monoview.monoview_utils import CustomRandint, CustomUniform, \ - BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "SCMPregen" - -class SCMPregen(BaseMonoviewClassifier, PregenClassifier, scm): - """ - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number generator to use when - shuffling the data. 
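The deleted SCM wrapper above follows the platform's usual monoview pattern: the estimator exposes `param_names` together with matching `distribs` so a randomized hyper-parameter search can sample configurations. Below is a minimal sketch of that idea with plain scikit-learn, assuming `scipy.stats.randint`/`uniform` as stand-ins for `CustomRandint`/`CustomUniform` and a `DecisionTreeClassifier` as a stand-in for the pyscm estimator so the snippet runs without pyscm installed.

import numpy as np
from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(42)
X, y = rng.rand(60, 5), rng.randint(0, 2, 60)

# scipy's randint/uniform play the roles of CustomRandint(low=1, high=15)
# and CustomUniform(loc=0, state=1) in the wrappers above.
param_distributions = {"max_depth": randint(1, 15),
                       "min_samples_split": uniform(0.1, 0.8)}
search = RandomizedSearchCV(DecisionTreeClassifier(random_state=42),
                            param_distributions, n_iter=5, cv=3,
                            random_state=42)
search.fit(X, y)
print(search.best_params_)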
- - model_type - max_rules - p - n_stumps - self_complemented - estimators_generator - max_depth - kwargs - - Attributes - ---------- - param_names - - distribs - classed_params - weird_strings - self_complemented - n_stumps - estimators_generator - max_depth - """ - def __init__(self, random_state=None, model_type="conjunction", - max_rules=10, p=0.1, n_stumps=10, self_complemented=True, - estimators_generator="Stumps", max_depth=1, **kwargs): - super(SCMPregen, self).__init__( - random_state=random_state, - model_type=model_type, - max_rules=max_rules, - p=p - ) - self.param_names = ["model_type", "max_rules", "p", "n_stumps", - "random_state", "estimators_generator", "max_depth"] - self.distribs = [["conjunction", "disjunction"], - CustomRandint(low=1, high=15), - CustomUniform(loc=0, state=1), [n_stumps], - [random_state], ["Stumps", "Tree"], - CustomRandint(low=1, high=5)] - self.classed_params = [] - self.weird_strings = {} - self.self_complemented = self_complemented - self.n_stumps = n_stumps - self.estimators_generator = estimators_generator - self.max_depth=1 - - def get_params(self, deep=True): - """ - - Parameters - ---------- - deep : boolean (default : True) not used - - Returns - ------- - parameters dictionary - """ - params = super(SCMPregen, self).get_params(deep) - params["estimators_generator"] = self.estimators_generator - params["max_depth"] = self.max_depth - params["n_stumps"] = self.n_stumps - return params - - def fit(self, X, y, tiebreaker=None, iteration_callback=None, - **fit_params): - """ - fit function - - Parameters - ---------- - X {array-like, sparse matrix}, shape (n_samples, n_features) - For kernel="precomputed", the expected shape of X is - (n_samples_test, n_samples_train). - - y : { array-like, shape (n_samples,) - Target values class labels in classification - - tiebreaker - - iteration_callback : (default : None) - - fit_params : others parameters - - Returns - ------- - self : object - Returns self. - """ - pregen_X, _ = self.pregen_voters(X, y) - list_files = os.listdir(".") - a = int(self.random_state.randint(0, 10000)) - if "pregen_x" + str(a) + ".csv" in list_files: - a = int(np.random.randint(0, 10000)) - file_name = "pregen_x" + str(a) + ".csv" - while file_name in list_files: - a = int(np.random.randint(0, 10000)) - file_name = "pregen_x" + str(a) + ".csv" - else: - file_name = "pregen_x" + str(a) + ".csv" - np.savetxt(file_name, pregen_X, delimiter=',') - place_holder = np.genfromtxt(file_name, delimiter=',') - os.remove(file_name) - super(SCMPregen, self).fit(place_holder, y, tiebreaker=tiebreaker, - iteration_callback=iteration_callback, - **fit_params) - return self - - def predict(self, X): - """ - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training vectors, where n_samples is the number of samples - and n_features is the number of features. - For kernel="precomputed", the expected shape of X is - (n_samples, n_samples). 
- - Returns - ------- - y_pred : array, shape (n_samples,) - """ - pregen_X, _ = self.pregen_voters(X) - list_files = os.listdir(".") - a = int(self.random_state.randint(0, 10000)) - if "pregen_x" + str(a) + ".csv" in list_files: - a = int(np.random.randint(0, 10000)) - file_name = "pregen_x" + str(a) + ".csv" - while file_name in list_files: - a = int(np.random.randint(0, 10000)) - file_name = "pregen_x" + str(a) + ".csv" - else: - file_name = "pregen_x" + str(a) + ".csv" - np.savetxt(file_name, pregen_X, delimiter=',') - place_holder = np.genfromtxt(file_name, delimiter=',') - os.remove(file_name) - return self.classes_[self.model_.predict(place_holder)] - - # def canProbas(self): - # """ - # Used to know if the classifier can return label probabilities - # Returns - # ------- - # False in any case - # """ - # - # return False - - def getInterpret(self, directory, y_test): - """ - - Parameters - ---------- - directory - y_test - - Returns - ------- - interpret_string string of interpretation - """ - interpret_string = "Model used : " + str(self.model_) - return interpret_string - - -# def formatCmdArgs(args): -# """Used to format kwargs for the parsed args""" -# kwargsDict = {"model_type": args.SCP_model_type, -# "p": args.SCP_p, -# "max_rules": args.SCP_max_rules, -# "n_stumps": args.SCP_stumps} -# return kwargsDict - - -def paramsToSet(nIter, randomState): - paramsSet = [] - for _ in range(nIter): - paramsSet.append( - {"model_type": randomState.choice(["conjunction", "disjunction"]), - "max_rules": randomState.randint(1, 15), - "p": randomState.random_sample()}) - return paramsSet diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py index 4d77b7fd460fe3295a72d5384c9a1eca2894269e..b4a0e3d74a2097cd3e36d34e0740d1db10989cf2 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py @@ -37,12 +37,14 @@ class SGD(SGDClassifier, BaseMonoviewClassifier): """ def __init__(self, random_state=None, loss='hinge', - penalty='l2', alpha=0.0001, **kwargs): + penalty='l2', alpha=0.0001, max_iter=5, tol=None, **kwargs): super(SGD, self).__init__( loss=loss, penalty=penalty, alpha=alpha, + max_iter=5, + tol=None, random_state=random_state ) self.param_names = ["loss", "penalty", "alpha", "random_state"] diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py index 90637f5d70b256275e0cad083701c3f748b2a422..aa305849e6903b42bf63eb9e7b440ec3a20f85c6 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/analyze_results.py @@ -66,14 +66,8 @@ def getTotalMetricScores(metric, trainLabels, testLabels, validationIndices, enumerate(metric[1])) else: metricKWARGS = {} - try: - trainScore = metricModule.score(labels[learningIndices], trainLabels, + trainScore = metricModule.score(labels[learningIndices], trainLabels, **metricKWARGS) - except: - print(labels[learningIndices]) - print(trainLabels) - import pdb; - pdb.set_trace() testScore = metricModule.score(labels[validationIndices], testLabels, **metricKWARGS) return [trainScore, testScore] diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py 
b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py index 85bf7742c98b56261f1c5faf0e756b5e9bedc7d6..b5020c21c1d641beb0ad28690e07398648c883b8 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py @@ -11,7 +11,6 @@ from .multiview_utils import MultiviewResult from . import analyze_results from .. import multiview_classifiers from ..utils import hyper_parameter_search -from ..utils.dataset import get_shape # Author-Info __author__ = "Baptiste Bauvin" diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py index 8006f46e71ba3c90d4f5626d765045750cfe13bf..4c5e34719f0692260492bd4b1b95524a1d756bb5 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py @@ -16,11 +16,14 @@ class MultiviewResult(object): self.y_test_multiclass_pred = test_labels_multiclass def get_classifier_name(self): - multiview_classifier_module = getattr(multiview_classifiers, - self.classifier_name) - multiview_classifier = getattr(multiview_classifier_module, - multiview_classifier_module.classifier_class_name)(42) - return multiview_classifier.short_name + try: + multiview_classifier_module = getattr(multiview_classifiers, + self.classifier_name) + multiview_classifier = getattr(multiview_classifier_module, + multiview_classifier_module.classifier_class_name)(42) + return multiview_classifier.short_name + except: + return self.classifier_name def get_names(classed_list): diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/__init__.py deleted file mode 100644 index dc8665a06cb54657c49364482cfdcdbc046ca244..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import fat_late_fusion, analyze_results \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/analyze_results.py deleted file mode 100644 index 6e58780dc111ceec257df0ee15b489adf174077e..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/analyze_results.py +++ /dev/null @@ -1,21 +0,0 @@ -from ...multiview import analyze_results - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def execute(classifier, trainLabels, - testLabels, DATASET, - classificationKWARGS, classification_indices, - labels_dictionary, views, nbCores, times, - name, KFolds, - hyper_param_search, nIter, metrics, - views_indices, randomState, labels, classifierModule): - return analyze_results.execute(classifier, trainLabels, - testLabels, DATASET, - classificationKWARGS, classificationIndices, - labels_dictionary, views, nbCores, times, - name, KFolds, - hyper_param_search, nIter, metrics, - views_indices, randomState, labels, classifierModule) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/fat_late_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/fat_late_fusion.py deleted file mode 100644 index b93e79a4fc5713eb9adc9e363be949eac89e35f6..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_late_fusion/fat_late_fusion.py +++ /dev/null @@ -1,82 +0,0 @@ -import numpy as np - -from ...utils.multiclass import isBiclass, genMulticlassMonoviewDecision - - -def genName(config): - return "fat_late_fusion" - - -def getBenchmark(benchmark, args=None): - benchmark["multiview"]["fat_late_fusion"] = ["take_everything"] - return benchmark - - -def getArgs(args, benchmark, views, views_indices, randomState, directory, resultsMonoview, classificationIndices): - argumentsList = [] - multiclass_preds = [monoviewResult.y_test_multiclass_pred for monoviewResult in resultsMonoview] - if isBiclass(multiclass_preds): - monoviewDecisions = np.array([monoviewResult.full_labels_pred for monoviewResult in resultsMonoview]) - else: - monoviewDecisions = np.array([genMulticlassMonoviewDecision(monoviewResult, classificationIndices) for monoviewResult in resultsMonoview]) - if len(args.FLF_weights) == 0: - weights = [1.0 for _ in range(monoviewDecisions.shape[0])] - else: - weights = args.FLF_weights - arguments = {"CL_type": "fat_late_fusion", - "views": views, - "NB_VIEW": len(resultsMonoview), - "views_indices": range(len(resultsMonoview)), - "NB_CLASS": len(args.CL_classes), - "LABELS_NAMES": args.CL_classes, - "FatLateFusionKWARGS": { - "monoviewDecisions": monoviewDecisions, - "weights": weights - } - } - argumentsList.append(arguments) - return argumentsList - - -def genParamsSets(classificationKWARGS, randomState, nIter=1): - """Used to generate parameters sets for the random hyper parameters optimization function""" - nbMonoviewClassifiers = len(classificationKWARGS["monoviewDecisions"]) - weights = [randomState.random_sample(nbMonoviewClassifiers) for _ in range(nIter)] - nomralizedWeights = [[weightVector/np.sum(weightVector)] for weightVector in weights] - return nomralizedWeights - - -class FatLateFusionClass: - - def 
__init__(self, randomState, NB_CORES=1, **kwargs): - if kwargs["weights"] == []: - self.weights = [1.0/len(["monoviewDecisions"]) for _ in range(len(["monoviewDecisions"]))] - else: - self.weights = np.array(kwargs["weights"])/np.sum(np.array(kwargs["weights"])) - self.monoviewDecisions = kwargs["monoviewDecisions"] - - def setParams(self, paramsSet): - self.weights = paramsSet[0] - - def fit_hdf5(self, DATASET, labels, trainIndices=None, views_indices=None, metric=["f1_score", None]): - pass - - def predict_hdf5(self, DATASET, usedIndices=None, views_indices=None): - if usedIndices is None: - usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) - votes = np.zeros((len(usedIndices), DATASET.get("Metadata").attrs["nbClass"]), dtype=float) - for usedIndex, exampleIndex in enumerate(usedIndices): - for monoviewDecisionIndex, monoviewDecision in enumerate(self.monoviewDecisions): - votes[usedIndex, monoviewDecision[exampleIndex]] += self.weights[monoviewDecisionIndex] - predictedLabels = np.argmax(votes, axis=1) - return predictedLabels - - def predict_probas_hdf5(self, DATASET, usedIndices=None): - pass - - def getConfigString(self, classificationKWARGS): - return "weights : "+", ".join(map(str, list(self.weights))) - - def getSpecificAnalysis(self, classificationKWARGS): - stringAnalysis = '' - return stringAnalysis diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/__init__.py deleted file mode 100644 index fce28aa3a7727ea6998ab5f0f2e2b61f31ada922..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import fat_scm_late_fusion, analyze_results \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/analyze_results.py deleted file mode 100644 index d5fcd8a976689cd4aeac84bdbc9a9a03c3b95224..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/analyze_results.py +++ /dev/null @@ -1,21 +0,0 @@ -from ...multiview import analyze_results - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def execute(classifier, trainLabels, - testLabels, DATASET, - classificationKWARGS, classification_indices, - labels_dictionary, views, nbCores, times, - name, KFolds, - hyper_param_search, nIter, metrics, - views_indices, random_state, labels, classifierModule): - return analyze_results.execute(classifier, trainLabels, - testLabels, DATASET, - classificationKWARGS, classification_indices, - labels_dictionary, views, nbCores, times, - name, KFolds, - hyper_param_search, nIter, metrics, - views_indices, random_state, labels, classifierModule) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/fat_scm_late_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/fat_scm_late_fusion.py deleted file mode 100644 index 34d3e982fed33d263447ce8a6e745b426f9b4768..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/fat_scm_late_fusion/fat_scm_late_fusion.py +++ /dev/null @@ -1,132 +0,0 @@ -import numpy as np -from pyscm.scm import SetCoveringMachineClassifier as scm -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.externals.six import iteritems - - -from ...utils.multiclass import isBiclass, genMulticlassMonoviewDecision - -def genName(config): - return "fat_scm_late_fusion" - - -def getBenchmark(benchmark, args=None): - benchmark["multiview"]["fat_scm_late_fusion"] = ["take_everything"] - return benchmark - - - -def getArgs(args, benchmark, views, views_indices, random_state, directory, resultsMonoview, classificationIndices): - argumentsList = [] - multiclass_preds = [monoviewResult.y_test_multiclass_pred for monoviewResult in resultsMonoview] - if isBiclass(multiclass_preds): - monoviewDecisions = np.array([monoviewResult.full_labels_pred for monoviewResult in resultsMonoview]) - else: - monoviewDecisions = np.array([genMulticlassMonoviewDecision(monoviewResult, classification_indices) for monoviewResult in resultsMonoview]) - monoviewDecisions = np.transpose(monoviewDecisions) - #monoviewDecisions = np.transpose(np.array([monoviewResult[1][3] for monoviewResult in resultsMonoview])) - arguments = {"CL_type": "fat_scm_late_fusion", - "views": ["all"], - "NB_VIEW": len(resultsMonoview), - "views_indices": range(len(resultsMonoview)), - "NB_CLASS": len(args.CL_classes), - "LABELS_NAMES": args.CL_classes, - "FatSCMLateFusionKWARGS": { - "monoviewDecisions": monoviewDecisions, - "p": args.FSCMLF_p, - "max_attributes": args.FSCMLF_max_attributes, - "model":args.FSCMLF_model, - } - } - argumentsList.append(arguments) - return argumentsList - - -def genParamsSets(classificationKWARGS, random_state, nIter=1): - """Used to generate parameters sets for the random hyper parameters optimization 
function""" - paramsSets = [] - for _ in range(nIter): - max_attributes = random_state.randint(1, 20) - p = random_state.random_sample() - model = random_state.choice(["conjunction", "disjunction"]) - paramsSets.append([p, max_attributes, model]) - - return paramsSets - - -class FatSCMLateFusionClass: - - def __init__(self, random_state, NB_CORES=1, **kwargs): - if kwargs["p"]: - self.p = kwargs["p"] - else: - self.p = 0.5 - if kwargs["max_attributes"]: - self.max_attributes = kwargs["max_attributes"] - else: - self.max_attributes = 5 - if kwargs["model"]: - self.model = kwargs["model"] - else: - self.model = "conjunction" - self.monoviewDecisions = kwargs["monoviewDecisions"] - self.random_state = random_state - - def setParams(self, paramsSet): - self.p = paramsSet[0] - self.max_attributes = paramsSet[1] - self.model = paramsSet[2] - - def fit_hdf5(self, DATASET, labels, trainIndices=None, views_indices=None, metric=["f1_score", None]): - features = self.monoviewDecisions[trainIndices] - self.SCMClassifier = DecisionStumpSCMNew(p=self.p, max_rules=self.max_attributes, model_type=self.model, - random_state=self.random_state) - self.SCMClassifier.fit(features, labels[trainIndices].astype(int)) - - def predict_hdf5(self, DATASET, usedIndices=None, views_indices=None): - if usedIndices is None: - usedIndices = range(DATASET.get("Metadata").attrs["datasetLength"]) - predictedLabels = self.SCMClassifier.predict(self.monoviewDecisions[usedIndices]) - return predictedLabels - - def predict_probas_hdf5(self, DATASET, usedIndices=None): - pass - - def getConfigString(self, classificationKWARGS): - return "p : "+str(self.p)+", max_aributes : "+str(self.max_attributes)+", model : "+self.model - - def getSpecificAnalysis(self, classificationKWARGS): - stringAnalysis = 'Rules used : ' + str(self.SCMClassifier.clf.model_) - return stringAnalysis - - -class DecisionStumpSCMNew(BaseEstimator, ClassifierMixin): - """docstring for SCM - A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like - CV, gridsearch, and so on ...""" - - def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): - super(DecisionStumpSCMNew, self).__init__() - self.model_type = model_type - self.p = p - self.max_rules = max_rules - self.random_state = random_state - - def fit(self, X, y): - self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) - self.clf.fit(X=X, y=y) - - def predict(self, X): - return self.clf.predict(X) - - def set_params(self, **params): - for key, value in iteritems(params): - if key == 'p': - self.p = value - if key == 'model_type': - self.model_type = value - if key == 'max_rules': - self.max_rules = value - - def get_stats(self): - return {"Binary_attributes": self.clf.model_.rules} diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py deleted file mode 100644 index 508d2a94d6c78d86cea917e2ae9164fcec4a8d49..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/mumbo.py +++ /dev/null @@ -1,41 +0,0 @@ -from sklearn.tree import DecisionTreeClassifier - - -from multimodalboost.mumbo import MumboClassifier -from ..multiview.multiview_utils import BaseMultiviewClassifier, \ - get_examples_views_indices -from ..utils.hyper_parameter_search import CustomRandint - -classifier_class_name = "Mumbo" - 
-class Mumbo(BaseMultiviewClassifier, MumboClassifier): - - def __init__(self, base_estimator=None, - n_estimators=50, - random_state=None, - best_view_mode="edge"): - super().__init__(random_state) - super(BaseMultiviewClassifier, self).__init__(base_estimator=base_estimator, - n_estimators=n_estimators, - random_state=random_state, - best_view_mode=best_view_mode) - self.param_names = ["base_estimator", "n_estimators", "random_state", "best_view_mode"] - self.distribs = [[DecisionTreeClassifier(max_depth=1)], - CustomRandint(5,200), [random_state], ["edge", "error"]] - - def fit(self, X, y, train_indices=None, view_indices=None): - train_indices, view_indices = get_examples_views_indices(X, - train_indices, - view_indices) - numpy_X, view_limits = X.to_numpy_array(example_indices=train_indices, - view_indices=view_indices) - return super(Mumbo, self).fit(numpy_X, y[train_indices], - view_limits) - - def predict(self, X, example_indices=None, view_indices=None): - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - numpy_X, view_limits = X.to_numpy_array(example_indices=example_indices, - view_indices=view_indices) - return super(Mumbo, self).predict(numpy_X) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/__init__.py deleted file mode 100644 index d6773304b2c117c67cdf8399b4840a4e54f76f03..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . import analyze_results, pseudo_cq_fusion \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/analyze_results.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/analyze_results.py deleted file mode 100644 index 3823e68753d996524dd83c3475fb0fac8ee435e8..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/analyze_results.py +++ /dev/null @@ -1,21 +0,0 @@ -from ...multiview import analyze_results - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def execute(classifier, trainLabels, - testLabels, DATASET, - classificationKWARGS, classificationIndices, - labels_dictionary, views, nbCores, times, - name, KFolds, - hyper_param_search, nIter, metrics, - views_indices, randomState, labels, classifierModule): - return analyze_results.execute(classifier, trainLabels, - testLabels, DATASET, - classificationKWARGS, classificationIndices, - labels_dictionary, views, nbCores, times, - name, KFolds, - hyper_param_search, nIter, metrics, - views_indices, randomState, labels, classifierModule) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/pseudo_cq_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/pseudo_cq_fusion.py deleted file mode 100644 index bfd219d329c368594f6eab0a466c7eb5a4d3d358..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/pseudo_cq_fusion/pseudo_cq_fusion.py +++ /dev/null @@ -1,41 +0,0 @@ -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions import \ - 
diversity_utils -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.difficulty_fusion_old import difficulty -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.double_fault_fusion_old import doubleFault - - -def genName(config): - return "pseudo_cq_fusion" - - -def getBenchmark(benchmark, args=None): - benchmark["multiview"]["pseudo_cq_fusion"] = ["take_everything"] - return benchmark - - -def pseudoCQ(difficulty, doubleFlaut): - return difficulty/float(doubleFlaut) - - -def getArgs(args, benchmark, views, views_indices, randomState, directory, resultsMonoview, classificationIndices): - return diversity_utils.getArgs(args, benchmark, views, - views_indices, randomState, directory, - resultsMonoview, classificationIndices, - [doubleFault, difficulty], "pseudo_cq_fusion") - - -def genParamsSets(classificationKWARGS, randomState, nIter=1): - return diversity_utils.genParamsSets(classificationKWARGS, randomState, nIter=nIter) - - - -class PseudoCQFusionClass(diversity_utils.DiversityFusionClass): - - def __init__(self, randomState, NB_CORES=1, **kwargs): - diversity_utils.DiversityFusionClass.__init__(self, randomState, NB_CORES=1, **kwargs) - - def getSpecificAnalysis(self, classificationKWARGS): - - stringAnalysis = "Classifiers used for each view : " + ', '.join(self.classifiers_names) +\ - ', with a pseudo CQ of ' + str(self.div_measure) - return stringAnalysis \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/scm_late_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/scm_late_fusion.py deleted file mode 100644 index a8ec6bb2063760101b5be106141f9245843527fc..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/scm_late_fusion.py +++ /dev/null @@ -1,125 +0,0 @@ -import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.externals.six import iteritems -import itertools -from pyscm.scm import SetCoveringMachineClassifier as scm - - -from ..multiview_classifiers.additions.late_fusion_utils import \ - LateFusionClassifier -from ..multiview.multiview_utils import get_examples_views_indices -from ..monoview.monoview_utils import CustomRandint, CustomUniform - -classifier_class_name = "SCMLateFusionClassifier" - - -class DecisionStumpSCMNew(BaseEstimator, ClassifierMixin): - """docstring for SCM - A hands on class of SCM using decision stump, built with sklearn format in order to use sklearn function on SCM like - CV, gridsearch, and so on ...""" - - def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42): - super(DecisionStumpSCMNew, self).__init__() - self.model_type = model_type - self.p = p - self.max_rules = max_rules - self.random_state = random_state - - def fit(self, X, y): - self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state) - self.clf.fit(X=X, y=y) - - def predict(self, X): - return self.clf.predict(X) - - def set_params(self, **params): - for key, value in iteritems(params): - if key == 'p': - self.p = value - if key == 'model_type': - self.model_type = value - if key == 'max_rules': - self.max_rules = value - - def get_stats(self): - return {"Binary_attributes": self.clf.model_.rules} - - -class SCMLateFusionClassifier(LateFusionClassifier): - def __init__(self, random_state=None, classifier_names=None, - classifier_configs=None, nb_cores=1, - p=1, max_rules=5, order=1, 
model_type="conjunction", weights=None): - self.need_probas=False - super(SCMLateFusionClassifier, self).__init__(random_state=random_state, - classifier_names=classifier_names, - classifier_configs=classifier_configs, - nb_cores=nb_cores - ) - self.scm_classifier = None - self.p = p - self.max_rules = max_rules - self.order = order - self.model_type = model_type - self.param_names+=["model_type", "max_rules", "p", "order"] - self.distribs+=[["conjunction", "disjunction"], - CustomRandint(low=1, high=15), - CustomUniform(loc=0, state=1), [1,2,3]] - - def fit(self, X, y, train_indices=None, view_indices=None): - super(SCMLateFusionClassifier, self).fit(X, y, - train_indices=train_indices, - view_indices=view_indices) - self.scm_fusion_fit(X, y, train_indices=train_indices, view_indices=view_indices) - return self - - def predict(self, X, example_indices=None, view_indices=None): - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - monoview_decisions = np.zeros((len(example_indices), X.nb_view), - dtype=int) - for index, view_index in enumerate(view_indices): - monoview_decision = self.monoview_estimators[index].predict( - X.get_v(view_index, example_indices)) - monoview_decisions[:, index] = monoview_decision - features = self.generate_interactions(monoview_decisions) - predicted_labels = self.scm_classifier.predict(features) - return predicted_labels - - def scm_fusion_fit(self, X, y, train_indices=None, view_indices=None): - train_indices, view_indices = get_examples_views_indices(X, train_indices, view_indices) - - self.scm_classifier = DecisionStumpSCMNew(p=self.p, max_rules=self.max_rules, model_type=self.model_type, - random_state=self.random_state) - monoview_decisions = np.zeros((len(train_indices), X.nb_view), dtype=int) - for index, view_index in enumerate(view_indices): - monoview_decisions[:, index] = self.monoview_estimators[index].predict( - X.get_v(view_index, train_indices)) - features = self.generate_interactions(monoview_decisions) - features = np.array([np.array([feat for feat in feature]) - for feature in features]) - self.scm_classifier.fit(features, y[train_indices].astype(int)) - - def generate_interactions(self, monoview_decisions): - if self.order is None: - self.order = monoview_decisions.shape[1] - if self.order == 1: - return monoview_decisions - else: - genrated_intercations = [monoview_decisions[:, i] - for i in range(monoview_decisions.shape[1])] - for order_index in range(self.order - 1): - combins = itertools.combinations(range(monoview_decisions.shape[1]), - order_index + 2) - for combin in combins: - generated_decision = monoview_decisions[:, combin[0]] - for index in range(len(combin) - 1): - if self.model_type == "disjunction": - generated_decision = np.logical_and(generated_decision, - monoview_decisions[:, combin[index + 1]]) - else: - generated_decision = np.logical_or(generated_decision, - monoview_decisions[:, combin[index + 1]]) - genrated_intercations.append(generated_decision) - return np.transpose(np.array(genrated_intercations)) - diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py index e63ebbb63b35e5d69baa113f68889ee9ca389ce4..159623e4dea06e3014fa96a13d2b588ca828c981 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py +++ 
b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py @@ -84,7 +84,7 @@ class WeightedLinearEarlyFusion(BaseMultiviewClassifier, BaseFusionClassifier): example_indices, self.view_indices = get_examples_views_indices(dataset, example_indices, view_indices) - if self.view_weights is None or self.view_weights=="None": + if self.view_weights is None: self.view_weights = np.ones(len(self.view_indices), dtype=float) else: self.view_weights = np.array(self.view_weights) diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis.py index a94f8e1ed9a8fb838a5ea897eb7ba4f540abb73b..f93d72a633ba4dd9e074a18438f95972cd598600 100644 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis.py +++ b/multiview_platform/mono_multi_view_classifiers/result_analysis.py @@ -61,6 +61,200 @@ def plot_results_noise(directory, noise_results, metric_to_plot, name, width=0.1 df.to_csv(directory+name+"_noise_analysis.csv") +def plot_metric_scores(train_scores, test_scores, names, nb_results, metric_name, + file_name, + tag="", train_STDs=None, test_STDs=None): + r"""Used to plot and save the score barplot for a specific metric. + + Parameters + ---------- + train_scores : list or np.array of floats + The scores of each classifier on the training set. + test_scores : list or np.array of floats + The scores of each classifier on the testing set. + names : list or np.array of strs + The names of all the classifiers. + nb_results: int + The number of classifiers to plot. + metric_name : str + The plotted metric's name + file_name : str + The name of the file where the figure will be saved. + tag : str + Some text to personalize the title, must start with a whitespace. + train_STDs : np.array of floats or None + The array containing the standard deviations for the averaged scores on the training set. + test_STDs : np.array of floats or None + The array containing the standard deviations for the averaged scores on the testing set. 
+ + Returns + ------- + """ + + figKW, barWidth = get_fig_size(nb_results) + + names, train_scores, test_scores, train_STDs, test_STDs = sort_by_test_score( + train_scores, test_scores, names, + train_STDs, test_STDs) + + f, ax = plt.subplots(nrows=1, ncols=1, **figKW) + ax.set_title(metric_name + "\n" + tag + " scores for each classifier") + + rects = ax.bar(range(nb_results), test_scores, barWidth, color="0.1", + yerr=test_STDs) + rect2 = ax.bar(np.arange(nb_results) + barWidth, train_scores, barWidth, + color="0.8", yerr=train_STDs) + autolabel(rects, ax, set=1, std=test_STDs) + autolabel(rect2, ax, set=2, std=train_STDs) + ax.legend((rects[0], rect2[0]), ('Test', 'Train')) + ax.set_ylim(-0.1, 1.1) + ax.set_xticks(np.arange(nb_results) + barWidth/2) + ax.set_xticklabels(names, rotation="vertical") + + try: + plt.tight_layout() + except: + pass + f.savefig(file_name + '.png', transparent=True) + plt.close() + import pandas as pd + if train_STDs is None: + dataframe = pd.DataFrame(np.transpose(np.concatenate(( + train_scores.reshape((train_scores.shape[0], 1)), + test_scores.reshape((train_scores.shape[0], 1))), axis=1)), + columns=names) + else: + dataframe = pd.DataFrame(np.transpose(np.concatenate(( + train_scores.reshape((train_scores.shape[0], 1)), + train_STDs.reshape((train_scores.shape[0], 1)), + test_scores.reshape((train_scores.shape[0], 1)), + test_STDs.reshape((train_scores.shape[0], 1))), axis=1)), + columns=names) + dataframe.to_csv(file_name + ".csv") + + +def plot_2d(data, classifiers_names, nbClassifiers, nbExamples, + fileName, minSize=10, + width_denominator=2.0, height_denominator=20.0, stats_iter=1, + use_plotly=True, example_ids=None): + r"""Used to generate a 2D plot of the errors. + + Parameters + ---------- + data : np.array of shape `(nbClassifiers, nbExamples)` + A matrix with zeros where the classifier failed to classifiy the example, ones where it classified it well + and -100 if the example was not classified. + classifiers_names : list of str + The names of the classifiers. + nbClassifiers : int + The number of classifiers. + nbExamples : int + The number of examples. + nbCopies : int + The number of times the data is copied (classifier wise) in order for the figure to be more readable + fileName : str + The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) + minSize : int, optinal, default: 10 + The minimum width and height of the figure. + width_denominator : float, optional, default: 1.0 + To obtain the image width, the number of classifiers will be divided by this number. + height_denominator : float, optional, default: 1.0 + To obtain the image width, the number of examples will be divided by this number. + stats_iter : int, optional, default: 1 + The number of statistical iterations realized. + + Returns + ------- + """ + fig, ax = plt.subplots(nrows=1, ncols=1,) + cmap, norm = iterCmap(stats_iter) + cax = plt.imshow(data, cmap=cmap, norm=norm, + aspect='auto') + plt.title('Errors depending on the classifier') + ticks = np.arange(0, nbClassifiers, 1) + labels = classifiers_names + plt.xticks(ticks, labels, rotation="vertical") + cbar = fig.colorbar(cax, ticks=[-100 * stats_iter / 2, 0, stats_iter]) + cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right']) + + fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight", transparent=True) + plt.close() + ### The following part is used to generate an interactive graph. 
+ if use_plotly: + import plotly + hover_text = [["Failed "+ str(stats_iter-data[i,j])+" time(s)" + for j in range(data.shape[1])] + for i in range(data.shape[0]) ] + fig = plotly.graph_objs.Figure(data=plotly.graph_objs.Heatmap( + x=list(classifiers_names), + y=[_ for _ in example_ids], + z=data, + text=hover_text, + hoverinfo=["y", "x", "text"], + colorscale="Greys", + colorbar=dict(tickvals=[0, stats_iter], + ticktext=["Always Wrong", "Always Right"]), + reversescale=True)) + fig.update_layout( + xaxis={"showgrid": False, "showticklabels": False, "ticks": ''}, + yaxis={"showgrid": False, "showticklabels": False, "ticks": ''}) + plotly.offline.plot(fig, filename=fileName + "error_analysis_2D.html", auto_open=False) + del fig + + +def plot_errors_bar(error_on_examples, nbClassifiers, nbExamples, fileName): + r"""Used to generate a barplot of the muber of classifiers that failed to classify each examples + + Parameters + ---------- + error_on_examples : np.array of shape `(nbExamples,)` + An array counting how many classifiers failed to classifiy each examples. + classifiers_names : list of str + The names of the classifiers. + nbClassifiers : int + The number of classifiers. + nbExamples : int + The number of examples. + fileName : str + The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) + + Returns + ------- + """ + fig, ax = plt.subplots() + x = np.arange(nbExamples) + plt.bar(x, error_on_examples) + plt.ylim([0, nbClassifiers]) + plt.title("Number of classifiers that failed to classify each example") + fig.savefig(fileName + "error_analysis_bar.png", transparent=True) + plt.close() + + +def iterCmap(statsIter): + r"""Used to generate a colormap that will have a tick for each iteration : the whiter the better. + + Parameters + ---------- + statsIter : int + The number of statistical iterations. + + Returns + ------- + cmap : matplotlib.colors.ListedColorMap object + The colormap. + norm : matplotlib.colors.BoundaryNorm object + The bounds for the colormap. + """ + cmapList = ["red", "0.0"] + [str(float((i + 1)) / statsIter) for i in + range(statsIter)] + cmap = mpl.colors.ListedColormap(cmapList) + bounds = [-100 * statsIter - 0.5, -0.5] + for i in range(statsIter): + bounds.append(i + 0.5) + bounds.append(statsIter + 0.5) + norm = mpl.colors.BoundaryNorm(bounds, cmap.N) + return cmap, norm + def autolabel(rects, ax, set=1, std=None): r"""Used to print the score below the bars. @@ -97,6 +291,34 @@ def autolabel(rects, ax, set=1, std=None): "%.2f" % height, weight=weight, ha='center', va='bottom', size="small") +def get_fig_size(nb_results, min_size=15, multiplier=1.0, bar_width=0.35): + r"""Used to get the image size to save the figure and the bar width, depending on the number of scores to plot. + + Parameters + ---------- + nb_results : int + The number of couple of bar to plot. + min_size : int + The minimum size of the image, if there are few classifiers to plot. + multiplier : float + The ratio between the image size and the number of classifiers. + bar_width : float + The width of the bars in the figure. Mainly here to centralize bar_width. + + Returns + ------- + fig_kwargs : dict of arguments + The argument restraining the size of the figure, usable directly in the `subplots` function of + `matplotlib.pyplot`. + bar_width : float + The width of the bars in the figure. Mainly here to centralize bar_width. 
+ """ + size = nb_results * multiplier + if size < min_size: + size = min_size + fig_kwargs = {"figsize": (size, size / 3)} + return fig_kwargs, bar_width + def get_metrics_scores_biclass(metrics, results): r"""Used to extract metrics scores in case of biclass classification @@ -106,7 +328,7 @@ def get_metrics_scores_biclass(metrics, results): metrics : list of lists The metrics names with configuration metrics[i][0] = name of metric i results : list of MonoviewResult and MultiviewResults objects - A list containing all the resluts for all the monoview experimentations. + A list containing all the results for all the monoview experimentations. Returns ------- @@ -117,25 +339,28 @@ def get_metrics_scores_biclass(metrics, results): -`metricScores[metric_name]["train_scores"]` is a list of all the available classifiers scores on the train set, -`metricScores[metric_name]["test_scores"]` is a list of all the available classifiers scores on the test set. """ - metrics_scores = {} + classifier_names=[] + classifier_names = [classifierResult.get_classifier_name() + for classifierResult in results + if classifierResult.get_classifier_name() + not in classifier_names ] + metrics_scores = dict((metric[0], pd.DataFrame(data=np.zeros((2, + len(classifier_names))), + index=["train", "test"], + columns=classifier_names)) + for metric in metrics) for metric in metrics: - classifiers_names = [] - train_scores = [] - test_scores = [] - for classifierResult in results: - train_scores.append(classifierResult.metrics_scores[metric[0]][0]) - test_scores.append(classifierResult.metrics_scores[metric[0]][1]) - classifiers_names.append(classifierResult.get_classifier_name()) + metrics_scores[metric[0]].loc["train", classifierResult.get_classifier_name()] = classifierResult.metrics_scores[metric[0]][0] + metrics_scores[metric[0]].loc[ + "test", classifierResult.get_classifier_name()] = \ + classifierResult.metrics_scores[metric[0]][1] - metrics_scores[metric[0]] = {"classifiers_names": classifiers_names, - "train_scores": train_scores, - "test_scores": test_scores} return metrics_scores -def getExampleErrorsBiclass(groud_truth, results): +def get_example_errors_biclass(groud_truth, results): r"""Used to get for each classifier and each example whether the classifier has misclassified the example or not. Parameters @@ -154,46 +379,15 @@ def getExampleErrorsBiclass(groud_truth, results): """ example_errors = {} - for classifierResult in results: - error_on_examples = np.equal(classifierResult.full_labels_pred, + for classifier_result in results: + error_on_examples = np.equal(classifier_result.full_labels_pred, groud_truth).astype(int) - unseenExamples = np.where(groud_truth == -100)[0] - error_on_examples[unseenExamples] = -100 - example_errors[classifierResult.get_classifier_name()] = { - "error_on_examples": error_on_examples} - + unseen_examples = np.where(groud_truth == -100)[0] + error_on_examples[unseen_examples] = -100 + example_errors[classifier_result.get_classifier_name()] = error_on_examples return example_errors -def get_fig_size(nb_results, min_size=15, multiplier=1.0, bar_width=0.35): - r"""Used to get the image size to save the figure and the bar width, depending on the number of scores to plot. - - Parameters - ---------- - nb_results : int - The number of couple of bar to plot. - min_size : int - The minimum size of the image, if there are few classifiers to plot. - multiplier : float - The ratio between the image size and the number of classifiers. 
- bar_width : float - The width of the bars in the figure. Mainly here to centralize bar_width. - - Returns - ------- - fig_kwargs : dict of arguments - The argument restraining the size of the figure, usable directly in the `subplots` function of - `matplotlib.pyplot`. - bar_width : float - The width of the bars in the figure. Mainly here to centralize bar_width. - """ - size = nb_results * multiplier - if size < min_size: - size = min_size - fig_kwargs = {"figsize": (size, size / 3)} - return fig_kwargs, bar_width - - def sort_by_test_score(train_scores, test_scores, names, train_STDs=None, test_STDs=None): r"""Used to sort the results (names and both scores) in descending test score order. @@ -239,77 +433,7 @@ def sort_by_test_score(train_scores, test_scores, names, train_STDs=None, return sorted_names, sorted_train_scores, sorted_test_scores, sorted_train_STDs, sorted_test_STDs -def plotMetricScores(train_scores, test_scores, names, nb_results, metric_name, - file_name, - tag="", train_STDs=None, test_STDs=None): - r"""Used to plot and save the score barplot for a specific metric. - - Parameters - ---------- - train_scores : list or np.array of floats - The scores of each classifier on the training set. - test_scores : list or np.array of floats - The scores of each classifier on the testing set. - names : list or np.array of strs - The names of all the classifiers. - nb_results: int - The number of classifiers to plot. - metric_name : str - The plotted metric's name - file_name : str - The name of the file where the figure will be saved. - tag : str - Some text to personalize the title, must start with a whitespace. - train_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the training set. - test_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the testing set. 
- - Returns - ------- - """ - - figKW, barWidth = get_fig_size(nb_results) - - names, train_scores, test_scores, train_STDs, test_STDs = sort_by_test_score( - train_scores, test_scores, names, - train_STDs, test_STDs) - - f, ax = plt.subplots(nrows=1, ncols=1, **figKW) - ax.set_title(metric_name + "\n" + tag + " scores for each classifier") - - rects = ax.bar(range(nb_results), test_scores, barWidth, color="0.1", - yerr=test_STDs) - rect2 = ax.bar(np.arange(nb_results) + barWidth, train_scores, barWidth, - color="0.8", yerr=train_STDs) - autolabel(rects, ax, set=1, std=test_STDs) - autolabel(rect2, ax, set=2, std=train_STDs) - print("nb_results", nb_results) - ax.legend((rects[0], rect2[0]), ('Test', 'Train')) - ax.set_ylim(-0.1, 1.1) - ax.set_xticks(np.arange(nb_results) + barWidth) - ax.set_xticklabels(names, rotation="vertical") - try: - plt.tight_layout() - except: - pass - f.savefig(file_name + '.png', transparent=True) - plt.close() - import pandas as pd - if train_STDs is None: - dataframe = pd.DataFrame(np.transpose(np.concatenate(( - train_scores.reshape((train_scores.shape[0], 1)), - test_scores.reshape((train_scores.shape[0], 1))), axis=1)), - columns=names) - else: - dataframe = pd.DataFrame(np.transpose(np.concatenate(( - train_scores.reshape((train_scores.shape[0], 1)), - train_STDs.reshape((train_scores.shape[0], 1)), - test_scores.reshape((train_scores.shape[0], 1)), - test_STDs.reshape((train_scores.shape[0], 1))), axis=1)), - columns=names) - dataframe.to_csv(file_name + ".csv") def publishMetricsGraphs(metrics_scores, directory, database_name, labels_names): @@ -332,134 +456,40 @@ def publishMetricsGraphs(metrics_scores, directory, database_name, labels_names) results """ results=[] - for metric_name, metric_scores in metrics_scores.items(): + for metric_name, metric_dataframe in metrics_scores.items(): logging.debug( "Start:\t Biclass score graph generation for " + metric_name) - - nb_results = len(metric_scores["test_scores"]) - file_name = directory + time.strftime( - "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join( - labels_names) + "-" + metric_name - - plotMetricScores(np.array(metric_scores["train_scores"]), - np.array(metric_scores["test_scores"]), - np.array(metric_scores["classifiers_names"]), nb_results, - metric_name, file_name, - tag=" " + " vs ".join(labels_names)) - - logging.debug( - "Done:\t Biclass score graph generation for " + metric_name) - results+=[[classifiers_name, metric_name, testMean, testSTD] - for classifiers_name, testMean, testSTD in zip(np.array(metric_scores["classifiers_names"]), - np.array(metric_scores["test_scores"]), - np.zeros(len(np.array(metric_scores["test_scores"]))))] + train_scores, test_scores, classifier_names, \ + file_name, nb_results,results = init_plot(results, metric_name, + metric_dataframe, directory, + database_name, labels_names) + + plot_metric_scores(train_scores, test_scores, classifier_names, + nb_results, metric_name, file_name, + tag=" "+" vs ".join(labels_names)) + logging.debug("Done:\t Biclass score graph generation for "+metric_name) return results -def iterCmap(statsIter): - r"""Used to generate a colormap that will have a tick for each iteration : the whiter the better. - Parameters - ---------- - statsIter : int - The number of statistical iterations. - - Returns - ------- - cmap : matplotlib.colors.ListedColorMap object - The colormap. - norm : matplotlib.colors.BoundaryNorm object - The bounds for the colormap. 
- """ - cmapList = ["red", "0.0"] + [str(float((i + 1)) / statsIter) for i in - range(statsIter)] - cmap = mpl.colors.ListedColormap(cmapList) - bounds = [-100 * statsIter - 0.5, -0.5] - for i in range(statsIter): - bounds.append(i + 0.5) - bounds.append(statsIter + 0.5) - norm = mpl.colors.BoundaryNorm(bounds, cmap.N) - return cmap, norm +def init_plot(results, metric_name, metric_dataframe, + directory, database_name, labels_names): + train = np.array(metric_dataframe.loc["train"]) + test = np.array(metric_dataframe.loc["test"]) + classifier_names = np.array(metric_dataframe.columns) -def publish2Dplot(data, classifiers_names, nbClassifiers, nbExamples, nbCopies, - fileName, minSize=10, - width_denominator=2.0, height_denominator=20.0, stats_iter=1): - r"""Used to generate a 2D plot of the errors. - - Parameters - ---------- - data : np.array of shape `(nbClassifiers, nbExamples)` - A matrix with zeros where the classifier failed to classifiy the example, ones where it classified it well - and -100 if the example was not classified. - classifiers_names : list of str - The names of the classifiers. - nbClassifiers : int - The number of classifiers. - nbExamples : int - The number of examples. - nbCopies : int - The number of times the data is copied (classifier wise) in order for the figure to be more readable - fileName : str - The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) - minSize : int, optinal, default: 10 - The minimum width and height of the figure. - width_denominator : float, optional, default: 1.0 - To obtain the image width, the number of classifiers will be divided by this number. - height_denominator : float, optional, default: 1.0 - To obtain the image width, the number of examples will be divided by this number. - stats_iter : int, optional, default: 1 - The number of statistical iterations realized. + nb_results = metric_dataframe.shape[1] - Returns - ------- - """ - figWidth = max(nbClassifiers / width_denominator, minSize) - figHeight = max(nbExamples / height_denominator, minSize) - figKW = {"figsize": (figWidth, figHeight)} - fig, ax = plt.subplots(nrows=1, ncols=1, **figKW) - cmap, norm = iterCmap(stats_iter) - cax = plt.imshow(data, interpolation='none', cmap=cmap, norm=norm, - aspect='auto') - plt.title('Errors depending on the classifier') - ticks = np.arange(nbCopies / 2 - 0.5, nbClassifiers * nbCopies, nbCopies) - labels = classifiers_names - plt.xticks(ticks, labels, rotation="vertical") - cbar = fig.colorbar(cax, ticks=[-100 * stats_iter / 2, 0, stats_iter]) - cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right']) - fig.tight_layout() - fig.savefig(fileName + "error_analysis_2D.png", bbox_inches="tight", transparent=True) - plt.close() + file_name = directory + time.strftime( + "%Y_%m_%d-%H_%M_%S") + "-" + database_name + "-" + "_vs_".join( + labels_names) + "-" + metric_name + results += [[classifiers_name, metric_name, testMean, testSTD] + for classifiers_name, testMean, testSTD in + zip(classifier_names, test, np.zeros(len(test)))] + return train, test, classifier_names, file_name, nb_results, results -def publishErrorsBarPlot(error_on_examples, nbClassifiers, nbExamples, fileName): - r"""Used to generate a barplot of the muber of classifiers that failed to classify each examples - - Parameters - ---------- - error_on_examples : np.array of shape `(nbExamples,)` - An array counting how many classifiers failed to classifiy each examples. 
- classifiers_names : list of str - The names of the classifiers. - nbClassifiers : int - The number of classifiers. - nbExamples : int - The number of examples. - fileName : str - The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) - - Returns - ------- - """ - fig, ax = plt.subplots() - x = np.arange(nbExamples) - plt.bar(x, error_on_examples) - plt.ylim([0, nbClassifiers]) - plt.title("Number of classifiers that failed to classify each example") - fig.savefig(fileName + "error_analysis_bar.png", transparent=True) - plt.close() - - -def gen_error_data(example_errors, base_file_name, nbCopies=2): +def gen_error_data(example_errors): r"""Used to format the error data in order to plot it efficiently. The data is saves in a `.csv` file. Parameters @@ -493,42 +523,38 @@ def gen_error_data(example_errors, base_file_name, nbCopies=2): error_on_examples : np.array of shape `(nbExamples,)` An array counting how many classifiers failed to classifiy each examples. """ - nbClassifiers = len(example_errors) - nbExamples = len(list(example_errors.values())[0]["error_on_examples"]) - classifiers_names = example_errors.keys() + nb_classifiers = len(example_errors) + nb_examples = len(list(example_errors.values())[0]) + classifiers_names = list(example_errors.keys()) - data = np.zeros((nbExamples, nbClassifiers * nbCopies)) - temp_data = np.zeros((nbExamples, nbClassifiers)) + data_2d = np.zeros((nb_examples, nb_classifiers)) for classifierIndex, (classifier_name, error_on_examples) in enumerate( example_errors.items()): - for iter_index in range(nbCopies): - data[:, classifierIndex * nbCopies + iter_index] = error_on_examples[ - "error_on_examples"] - temp_data[:, classifierIndex] = error_on_examples["error_on_examples"] - error_on_examples = -1 * np.sum(data, axis=1) / nbCopies + nbClassifiers - - np.savetxt(base_file_name + "2D_plot_data.csv", data, delimiter=",") - np.savetxt(base_file_name + "bar_plot_data.csv", temp_data, delimiter=",") + data_2d[:, classifierIndex] = error_on_examples + error_on_examples = -1 * np.sum(data_2d, axis=1) / nb_classifiers - return nbClassifiers, nbExamples, nbCopies, classifiers_names, data, error_on_examples + return nb_classifiers, nb_examples, classifiers_names, data_2d, error_on_examples -def publishExampleErrors(example_errors, directory, databaseName, labels_names): +def publishExampleErrors(example_errors, directory, databaseName, labels_names, example_ids): logging.debug("Start:\t Biclass Label analysis figure generation") base_file_name = directory + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + "_vs_".join( labels_names) + "-" - nbClassifiers, nbExamples, nCopies, classifiers_names, data, error_on_examples = gen_error_data( - example_errors, - base_file_name) + nb_classifiers, nb_examples, classifiers_names, \ + data_2d, error_on_examples = gen_error_data(example_errors) + + np.savetxt(base_file_name + "2D_plot_data.csv", data_2d, delimiter=",") + np.savetxt(base_file_name + "bar_plot_data.csv", error_on_examples, + delimiter=",") - publish2Dplot(data, classifiers_names, nbClassifiers, nbExamples, nCopies, - base_file_name) + plot_2d(data_2d, classifiers_names, nb_classifiers, nb_examples, + base_file_name, example_ids=example_ids) - publishErrorsBarPlot(error_on_examples, nbClassifiers, nbExamples, - base_file_name) + plot_errors_bar(error_on_examples, nb_classifiers, nb_examples, + base_file_name) logging.debug("Done:\t Biclass Label analysis figures generation") @@ -554,7 +580,7 @@ 
def get_arguments(benchmark_argument_dictionaries, flag): return benchmarkArgumentDictionary -def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metrics): +def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metrics, example_ids): r"""Used to extract and format the results of the different biclass experimentations performed. Parameters @@ -581,7 +607,7 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric label combination, regrouping the scores for each metrics and the information useful to plot errors on examples. """ logging.debug("Srart:\t Analzing all biclass resuls") - biclass_results = [{} for _ in range(stats_iter)] + biclass_results = {} for flag, result in results: iteridex, [classifierPositive, classifierNegative] = flag @@ -589,7 +615,7 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric arguments = get_arguments(benchmark_argument_dictionaries, flag) metrics_scores = get_metrics_scores_biclass(metrics, result) - example_errors = getExampleErrorsBiclass(arguments["labels"], result) + example_errors = get_example_errors_biclass(arguments["labels"], result) directory = arguments["directory"] @@ -600,12 +626,15 @@ def analyze_biclass(results, benchmark_argument_dictionaries, stats_iter, metric results = publishMetricsGraphs(metrics_scores, directory, database_name, labels_names) publishExampleErrors(example_errors, directory, database_name, - labels_names) - - biclass_results[iteridex][ - str(classifierPositive) + str(classifierNegative)] = { - "metrics_scores": metrics_scores, - "example_errors": example_errors} + labels_names, example_ids) + if not str(classifierPositive) + str(classifierNegative) in biclass_results: + biclass_results[str(classifierPositive) + str(classifierNegative)] = {} + biclass_results[str(classifierPositive) + str(classifierNegative)][ + "metrics_scores"] = [i for i in range(stats_iter)] + biclass_results[str(classifierPositive) + str(classifierNegative)][ + "example_errors"] = [i for i in range(stats_iter)] + biclass_results[str(classifierPositive) + str(classifierNegative)]["metrics_scores"][iteridex] = metrics_scores + biclass_results[str(classifierPositive) + str(classifierNegative)]["example_errors"][iteridex] = example_errors logging.debug("Done:\t Analzing all biclass resuls") return results, biclass_results @@ -683,10 +712,10 @@ def publishMulticlassScores(multiclass_results, metrics, stats_iter, direcories, nbResults = classifiers_names.shape[0] fileName = directory + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + metric[ - 0] + ".png" + 0] - plotMetricScores(train_scores, validationScores, classifiers_names, - nbResults, metric[0], fileName, tag=" multiclass") + plot_metric_scores(train_scores, validationScores, classifiers_names, + nbResults, metric[0], fileName, tag=" multiclass") logging.debug( "Done:\t Multiclass score graph generation for " + metric[0]) @@ -695,7 +724,7 @@ def publishMulticlassScores(multiclass_results, metrics, stats_iter, direcories, def publishMulticlassExmapleErrors(multiclass_results, directories, - databaseName): + databaseName, example_ids): for iter_index, multiclassResult in enumerate(multiclass_results): directory = directories[iter_index] logging.debug("Start:\t Multiclass Label analysis figure generation") @@ -707,18 +736,18 @@ def publishMulticlassExmapleErrors(multiclass_results, directories, multiclassResult, base_file_name) - publish2Dplot(data, classifiers_names, nbClassifiers, 
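To make the reworked accumulator concrete, here is a sketch (hypothetical values) of the structure analyze_biclass now builds: one entry per label combination, holding one slot per statistical iteration; this is what format_previous_results consumes further down. The patch pre-fills the slots with placeholder integers; None is used here only for readability.

import numpy as np
import pandas as pd

stats_iter = 2
biclass_results = {}
label_key = "01"  # str(classifierPositive) + str(classifierNegative)

if label_key not in biclass_results:
    biclass_results[label_key] = {
        "metrics_scores": [None for _ in range(stats_iter)],
        "example_errors": [None for _ in range(stats_iter)]}

for iter_index in range(stats_iter):
    biclass_results[label_key]["metrics_scores"][iter_index] = {
        "accuracy_score": pd.DataFrame(data=[[0.9, 0.8], [0.85, 0.75]],
                                       index=["train", "test"],
                                       columns=["dt-1", "mv"])}
    biclass_results[label_key]["example_errors"][iter_index] = {
        "dt-1": np.array([1, 0, 1]),
        "mv": np.array([0, 1, 1])}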
nbExamples, - nCopies, base_file_name) + plot_2d(data, classifiers_names, nbClassifiers, nbExamples, + nCopies, base_file_name, example_ids=example_ids) - publishErrorsBarPlot(error_on_examples, nbClassifiers, nbExamples, - base_file_name) + plot_errors_bar(error_on_examples, nbClassifiers, nbExamples, + base_file_name) logging.debug("Done:\t Multiclass Label analysis figure generation") def analyzeMulticlass(results, stats_iter, benchmark_argument_dictionaries, nb_examples, nb_labels, multiclass_labels, - metrics, classification_indices, directories): + metrics, classification_indices, directories, example_ids): """Used to transform one versus one results in multiclass results and to publish it""" multiclass_results = [{} for _ in range(stats_iter)] @@ -770,7 +799,7 @@ def analyzeMulticlass(results, stats_iter, benchmark_argument_dictionaries, benchmark_argument_dictionaries[0]["args"]["Base"]["name"]) publishMulticlassExmapleErrors(multiclass_results, directories, benchmark_argument_dictionaries[0][ - "args"].name) + "args"].name, example_ids) return results, multiclass_results @@ -779,69 +808,79 @@ def numpy_mean_and_std(scores_array): def publish_iter_biclass_metrics_scores(iter_results, directory, labels_dictionary, - classifiers_dict, data_base_name, stats_iter, + data_base_name, stats_iter, min_size=10): results=[] - for labelsCombination, iterResult in iter_results.items(): - currentDirectory = directory + labels_dictionary[ - int(labelsCombination[0])] + "-vs-" + labels_dictionary[ - int(labelsCombination[1])] + "/" - if not os.path.exists(os.path.dirname(currentDirectory + "a")): + for labels_combination, iter_result in iter_results.items(): + current_directory = directory + labels_dictionary[ + int(labels_combination[0])] + "-vs-" + labels_dictionary[ + int(labels_combination[1])] + "/" + if not os.path.exists(os.path.dirname(current_directory + "a")): try: - os.makedirs(os.path.dirname(currentDirectory + "a")) + os.makedirs(os.path.dirname(current_directory + "a")) except OSError as exc: if exc.errno != errno.EEXIST: raise - for metricName, scores in iterResult["metrics_scores"].items(): - trainMeans, trainSTDs = numpy_mean_and_std(scores["train_scores"]) - testMeans, testSTDs = numpy_mean_and_std(scores["test_scores"]) - - names = np.array([name for name in classifiers_dict.keys()]) - fileName = currentDirectory + time.strftime( + for metric_name, scores in iter_result.items(): + train = np.array(scores["mean"].loc["train"]) + test = np.array(scores["mean"].loc["test"]) + names = np.array(scores["mean"].columns) + train_std = np.array(scores["std"].loc["train"]) + test_std = np.array(scores["std"].loc["test"]) + # trainMeans, trainSTDs = numpy_mean_and_std(scores["train_scores"]) + # testMeans, testSTDs = numpy_mean_and_std(scores["test_scores"]) + + # names = np.array([name for name in classifiers_dict.keys()]) + fileName = current_directory + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" + data_base_name + "-Mean_on_" + str( - stats_iter) + "_iter-" + metricName + ".png" + stats_iter) + "_iter-" + metric_name + ".png" nbResults = names.shape[0] - plotMetricScores(trainMeans, testMeans, names, nbResults, - metricName, fileName, tag=" averaged", - train_STDs=trainSTDs, test_STDs=testSTDs) - results+=[[classifiersName, metricName, testMean, testSTD] for classifiersName, testMean, testSTD in zip(names, testMeans, testSTDs)] + plot_metric_scores(train, test, names, nbResults, + metric_name, fileName, tag=" averaged", + train_STDs=train_std, test_STDs=test_std) + 
results+=[[classifier_name, metric_name, test_mean, test_std] for classifier_name, test_mean, test_std in zip(names, test, test_std)] return results -def gen_error_dat_glob(combi_results, stats_iter, base_file_name): - nbExamples = combi_results["error_on_examples"].shape[1] - nbClassifiers = combi_results["error_on_examples"].shape[0] - data = np.transpose(combi_results["error_on_examples"]) - error_on_examples = -1 * np.sum(data, axis=1) + (nbClassifiers * stats_iter) - np.savetxt(base_file_name + "clf_errors.csv", data, delimiter=",") - np.savetxt(base_file_name + "example_errors.csv", error_on_examples, - delimiter=",") - return nbExamples, nbClassifiers, data, error_on_examples +def gen_error_data_glob(combi_results, stats_iter): + nb_examples = next(iter(combi_results.values())).shape[0] + nb_classifiers = len(combi_results) + data = np.zeros((nb_examples, nb_classifiers), dtype=int) + classifier_names = [] + for clf_index, (classifier_name, error_data) in enumerate(combi_results.items()): + data[:, clf_index] = error_data + classifier_names.append(classifier_name) + error_on_examples = -1 * np.sum(data, axis=1) + (nb_classifiers * stats_iter) + return nb_examples, nb_classifiers, data, error_on_examples, classifier_names -def publish_iter_biclass_example_errors(iter_results, directory, labels_dictionary, - classifiers_dict, stats_iter, min_size=10): - for labelsCombination, combiResults in iter_results.items(): +def publish_iter_biclass_example_errors(iter_results, directory, + labels_dictionary, stats_iter, + example_ids): + for labels_combination, combi_results in iter_results.items(): base_file_name = directory + labels_dictionary[ - int(labelsCombination[0])] + "-vs-" + \ + int(labels_combination[0])] + "-vs-" + \ labels_dictionary[ - int(labelsCombination[1])] + "/" + time.strftime( + int(labels_combination[1])] + "/" + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" - classifiers_names = [classifier_name for classifier_name in - classifiers_dict.values()] + logging.debug( "Start:\t Global biclass label analysis figure generation") - nbExamples, nbClassifiers, data, error_on_examples = gen_error_dat_glob( - combiResults, stats_iter, base_file_name) + nbExamples, nbClassifiers, data, \ + error_on_examples, classifier_names = gen_error_data_glob(combi_results, + stats_iter) - publish2Dplot(data, classifiers_names, nbClassifiers, nbExamples, 1, - base_file_name, stats_iter=stats_iter) + np.savetxt(base_file_name + "clf_errors.csv", data, delimiter=",") + np.savetxt(base_file_name + "example_errors.csv", error_on_examples, + delimiter=",") - publishErrorsBarPlot(error_on_examples, nbClassifiers * stats_iter, - nbExamples, base_file_name) + plot_2d(data, classifier_names, nbClassifiers, nbExamples, + base_file_name, stats_iter=stats_iter, example_ids=example_ids) + plot_errors_bar(error_on_examples, nbClassifiers * stats_iter, + nbExamples, base_file_name) logging.debug( "Done:\t Global biclass label analysis figures generation") @@ -861,28 +900,28 @@ def publish_iter_multiclass_metrics_scores(iter_multiclass_results, classifiers_ "%Y_%m_%d-%H_%M_%S") + "-" + data_base_name + "-Mean_on_" + str( stats_iter) + "_iter-" + metric_name + ".png" - plotMetricScores(trainMeans, testMeans, classifiers_names, nb_results, - metric_name, file_name, tag=" averaged multiclass", - train_STDs=trainSTDs, test_STDs=testSTDs) + plot_metric_scores(trainMeans, testMeans, classifiers_names, nb_results, + metric_name, file_name, tag=" averaged multiclass", + train_STDs=trainSTDs, test_STDs=testSTDs) 
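A toy walk-through (invented values) of the new gen_error_data_glob: each classifier maps to the element-wise sum of its per-iteration error vectors, so an entry counts in how many iterations that example was well classified, and the final per-example score ranges from 0 (always right for every classifier) to nb_classifiers * stats_iter (always wrong).

import numpy as np

stats_iter = 2
# Per-classifier error counts summed over the iterations (toy values in [0, 2]).
combi_results = {"dt-1": np.array([2, 1, 0, 2]),
                 "mv":   np.array([1, 2, 0, 2])}

nb_examples = next(iter(combi_results.values())).shape[0]
nb_classifiers = len(combi_results)

data = np.zeros((nb_examples, nb_classifiers), dtype=int)
classifier_names = []
for clf_index, (name, summed_errors) in enumerate(combi_results.items()):
    data[:, clf_index] = summed_errors
    classifier_names.append(name)

# 0 = never misclassified by anyone, nb_classifiers * stats_iter = always misclassified.
error_on_examples = -1 * np.sum(data, axis=1) + nb_classifiers * stats_iter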
results+=[[classifiers_name, metric_name,testMean, testSTD] for classifiers_name, testMean, testSTD in zip(classifiers_names, testMeans, testSTDs)] return results def publish_iter_multiclass_example_errors(iter_multiclass_results, directory, - classifiers_names, stats_iter, min_size=10): + classifiers_names, stats_iter, example_ids, min_size=10): logging.debug( "Start:\t Global multiclass label analysis figures generation") base_file_name = directory + time.strftime("%Y_%m_%d-%H_%M_%S") + "-" - nb_examples, nb_classifiers, data, error_on_examples = gen_error_dat_glob( + nb_examples, nb_classifiers, data, error_on_examples = gen_error_data_glob( iter_multiclass_results, stats_iter, base_file_name) - publish2Dplot(data, classifiers_names, nb_classifiers, nb_examples, 1, - base_file_name, stats_iter=stats_iter) + plot_2d(data, classifiers_names, nb_classifiers, nb_examples, 1, + base_file_name, stats_iter=stats_iter, example_ids=example_ids) - publishErrorsBarPlot(error_on_examples, nb_classifiers * stats_iter, nb_examples, - base_file_name) + plot_errors_bar(error_on_examples, nb_classifiers * stats_iter, nb_examples, + base_file_name) logging.debug("Done:\t Global multiclass label analysis figures generation") @@ -891,8 +930,7 @@ def gen_classifiers_dict(results, metrics): classifiers_dict = dict((classifier_name, classifierIndex) for classifierIndex, classifier_name in enumerate( - results[0][list(results[0].keys())[0]]["metrics_scores"][metrics[0][0]][ - "classifiers_names"])) + list(results[list(results.keys())[0]]["metrics_scores"][0][metrics[0][0]].columns))) return classifiers_dict, len(classifiers_dict) @@ -920,52 +958,74 @@ def add_new_metric(iter_biclass_results, metric, labels_combination, nb_classifi return iter_biclass_results -def analyzebiclass_iter(biclass_results, metrics, stats_iter, directory, - labels_dictionary, data_base_name, nb_examples): - """Used to format the results in order to plot the mean results on the iterations""" - iter_biclass_results = {} - classifiers_dict, nb_classifiers = gen_classifiers_dict(biclass_results, - metrics) +def format_previous_results(biclass_results): + """ + Formats each statistical iteration's result into a mean/std analysis for + the metrics and adds the errors of each statistical iteration. - for iter_index, biclass_result in enumerate(biclass_results): - for labelsComination, results in biclass_result.items(): - for metric in metrics: + Parameters + ---------- + biclass_results : The raw results, for each statistical iteration i contains + - biclass_results[i]["metrics_scores"] is a dictionary with a pd.dataframe + for each metrics + - biclass_results[i]["example_errors"], a dicaitonary with a np.array + for each classifier. 
+ + Returns + ------- + metrics_analysis : The mean and std dataframes for each metrics + + error_analysis : A dictionary containing the added errors + arrays for each classifier - iter_biclass_results = add_new_labels_combination( - iter_biclass_results, labelsComination, nb_classifiers, - nb_examples) - iter_biclass_results = add_new_metric(iter_biclass_results, metric, - labelsComination, - nb_classifiers, stats_iter) - - metric_results = results["metrics_scores"][metric[0]] - for classifier_name, trainScore, testScore in zip( - metric_results["classifiers_names"], - metric_results["train_scores"], - metric_results["test_scores"], ): - iter_biclass_results[labelsComination]["metrics_scores"][ - metric[0]]["train_scores"][ - classifiers_dict[classifier_name], iter_index] = trainScore - iter_biclass_results[labelsComination]["metrics_scores"][ - metric[0]]["test_scores"][ - classifiers_dict[classifier_name], iter_index] = testScore - for classifier_name, error_on_example in results[ - "example_errors"].items(): - iter_biclass_results[labelsComination]["error_on_examples"][ - classifiers_dict[classifier_name], :] += error_on_example[ - "error_on_examples"] - - results = publish_iter_biclass_metrics_scores( - iter_biclass_results, directory, - labels_dictionary, classifiers_dict, - data_base_name, stats_iter) - publish_iter_biclass_example_errors(iter_biclass_results, directory, - labels_dictionary, classifiers_dict, - stats_iter) + """ + metrics_analysis = dict((key, {}) for key in biclass_results.keys()) + error_analysis = dict((key, {}) for key in biclass_results.keys()) + for label_combination, biclass_result in biclass_results.items(): + + concat_dict = {} + for iter_index, metrics_score in enumerate( + biclass_result["metrics_scores"]): + for metric_name, dataframe in metrics_score.items(): + if metric_name not in concat_dict: + concat_dict[metric_name] = dataframe + else: + concat_dict[metric_name] = pd.concat( + [concat_dict[metric_name], dataframe]) + + for metric_name, dataframe in concat_dict.items(): + metrics_analysis[label_combination][metric_name] = {} + metrics_analysis[label_combination][metric_name][ + "mean"] = dataframe.groupby(dataframe.index).mean() + metrics_analysis[label_combination][metric_name][ + "std"] = dataframe.groupby(dataframe.index).std(ddof=0) + + added_example_errors = {} + for example_errors in biclass_result["example_errors"]: + for classifier_name, errors in example_errors.items(): + if classifier_name not in added_example_errors: + added_example_errors[classifier_name] = errors + else: + added_example_errors[classifier_name] += errors + error_analysis[label_combination] = added_example_errors + return metrics_analysis, error_analysis + + +def analyzebiclass_iter(biclass_results, stats_iter, directory, + labels_dictionary, data_base_name, example_ids): + """Used to format the results in order to plot the mean results on the iterations""" + metrics_analysis, error_analysis = format_previous_results(biclass_results) + + results = publish_iter_biclass_metrics_scores(metrics_analysis, + directory, labels_dictionary, + data_base_name, stats_iter) + publish_iter_biclass_example_errors(error_analysis, directory, + labels_dictionary, + stats_iter, example_ids) return results def analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics, - data_base_name, nb_examples): + data_base_name, nb_examples, example_ids): """Used to mean the multiclass results on the iterations executed with different random states""" logging.debug("Start:\t Getting 
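The heart of the new format_previous_results is plain pandas: the per-iteration DataFrames of a metric are concatenated and grouped by their "train"/"test" index to obtain mean and standard deviation, while example errors are summed element-wise. A self-contained sketch with invented scores:

import numpy as np
import pandas as pd

# Two statistical iterations of one metric, for one label combination.
iter_1 = pd.DataFrame(data=[[0.90, 0.80], [0.85, 0.75]],
                      index=["train", "test"], columns=["dt-1", "mv"])
iter_2 = pd.DataFrame(data=[[0.92, 0.78], [0.83, 0.77]],
                      index=["train", "test"], columns=["dt-1", "mv"])

concatenated = pd.concat([iter_1, iter_2])
mean_df = concatenated.groupby(concatenated.index).mean()
std_df = concatenated.groupby(concatenated.index).std(ddof=0)

# Example errors are added iteration after iteration.
errors_iter_1 = {"dt-1": np.array([1, 0, 1]), "mv": np.array([0, 1, 1])}
errors_iter_2 = {"dt-1": np.array([1, 1, 0]), "mv": np.array([1, 1, 1])}
added_errors = {name: errors_iter_1[name] + errors_iter_2[name]
                for name in errors_iter_1}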
mean results for multiclass classification") @@ -1002,19 +1062,19 @@ def analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics, iter_multiclass_results, classifiers_names, data_base_name, directory, stats_iter) publish_iter_multiclass_example_errors(iter_multiclass_results, directory, - classifiers_names, stats_iter) + classifiers_names, stats_iter, example_ids) return results def get_results(results, stats_iter, nb_multiclass, benchmark_argument_dictionaries, multiclass_labels, metrics, classification_indices, directories, directory, labels_dictionary, - nb_examples, nb_labels): + nb_examples, nb_labels, example_ids): """Used to analyze the results of the previous benchmarks""" data_base_name = benchmark_argument_dictionaries[0]["args"]["Base"]["name"] results_means_std, biclass_results = analyze_biclass(results, benchmark_argument_dictionaries, - stats_iter, metrics) + stats_iter, metrics, example_ids) if nb_multiclass > 1: results_means_std, multiclass_results = analyzeMulticlass(results, stats_iter, @@ -1022,12 +1082,12 @@ def get_results(results, stats_iter, nb_multiclass, benchmark_argument_dictionar nb_examples, nb_labels, multiclass_labels, metrics, classification_indices, - directories) + directories, example_ids) if stats_iter > 1: results_means_std = analyzebiclass_iter( - biclass_results, metrics, stats_iter, directory, - labels_dictionary, data_base_name, nb_examples) + biclass_results, stats_iter, directory, + labels_dictionary, data_base_name, example_ids) if nb_multiclass > 1: results_means_std = analyze_iter_multiclass(multiclass_results, directory, stats_iter, - metrics, data_base_name, nb_examples) + metrics, data_base_name, nb_examples, example_ids) return results_means_std diff --git a/multiview_platform/mono_multi_view_classifiers/utils/configuration.py b/multiview_platform/mono_multi_view_classifiers/utils/configuration.py index a492aff70e93e2a0a27e3c3576c8344562194c58..f297dcf09deebab08b29573a45344fbd7e40a822 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/configuration.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/configuration.py @@ -1,11 +1,20 @@ -import builtins -from distutils.util import strtobool as tobool import yaml -import os def get_the_args(path_to_config_file="../config_files/config.yml"): - """This is the main function for extracting the args for a '.yml' file""" + """ + The function for extracting the args for a '.yml' file. 
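For reference, a hypothetical call to the simplified get_the_args above (the config path is illustrative and must point to an existing file):

from multiview_platform.mono_multi_view_classifiers.utils.configuration import get_the_args

yaml_config = get_the_args("../config_files/config_test.yml")
# The parsed YAML is a plain nested dict, e.g. the dataset name(s) to benchmark:
dataset_names = yaml_config["Base"]["name"]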
+ + Parameters + ---------- + path_to_config_file : str, path to the yml file containing the configuration + + Returns + ------- + yaml_config : dict, the dictionary conaining the configuration for the + benchmark + + """ with open(path_to_config_file, 'r') as stream: yaml_config = yaml.safe_load(stream) return yaml_config diff --git a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py index 6c40d787545f5a155763571d180db58085040ea5..85666b66617bd20d054b0c0d32a486e12dd88412 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py @@ -66,7 +66,8 @@ class Dataset(): def __init__(self, views=None, labels=None, are_sparse=False, file_name="dataset.hdf5", view_names=None, path="", - hdf5_file=None, labels_names=None, is_temp=False): + hdf5_file=None, labels_names=None, is_temp=False, + example_ids=None): self.is_temp = False if hdf5_file is not None: self.dataset=hdf5_file @@ -104,6 +105,13 @@ class Dataset(): meta_data_grp.attrs["datasetLength"] = len(labels) dataset_file.close() self.update_hdf5_dataset(os.path.join(path, file_name)) + if example_ids is not None: + example_ids = [example_id if not is_just_number(example_id) + else "ID_"+example_id for example_id in example_ids] + self.example_ids = example_ids + else: + self.example_ids = ["ID_"+str(i) + for i in range(labels.shape[0])] def rm(self): """ @@ -144,8 +152,15 @@ class Dataset(): ------- """ - self.nb_view = self.dataset.get("Metadata").attrs["nbView"] + self.nb_view = self.dataset["Metadata"].attrs["nbView"] self.view_dict = self.get_view_dict() + if "example_ids" in self.dataset["Metadata"].keys(): + self.example_ids = [example_id.decode() + if not is_just_number(example_id.decode()) + else "ID_"+example_id.decode() + for example_id in self.dataset["Metadata"]["example_ids"]] + else: + self.example_ids = [str(i) for i in range(self.dataset["Labels"].shape[0])] def get_nb_examples(self): """ @@ -154,65 +169,100 @@ class Dataset(): ------- """ - return self.dataset.get("Metadata").attrs["datasetLength"] + return self.dataset["Metadata"].attrs["datasetLength"] def get_view_dict(self): + """ + Returns the dictionary with view indices as keys and the corresponding + names as values + """ view_dict = {} for view_index in range(self.nb_view): - view_dict[self.dataset.get("View" + str(view_index)).attrs["name"]] = view_index + view_dict[self.dataset["View" + str(view_index)].attrs["name"]] = view_index return view_dict def get_label_names(self, decode=True, example_indices=None): + """ + Used to get the list of the label names for the give set of examples + + Parameters + ---------- + decode : bool + If True, will decode the label names before lsiting them + + example_indices : numpy.ndarray + The array containig the indices of the needed examples + + Returns + ------- + + """ example_indices = self.init_example_indces(example_indices) selected_labels = self.get_labels(example_indices) if decode: return [label_name.decode("utf-8") - for label, label_name in enumerate(self.dataset.get("Labels").attrs["names"]) + for label, label_name in enumerate(self.dataset["Labels"].attrs["names"]) if label in selected_labels] else: return [label_name - for label, label_name in enumerate(self.dataset.get("Labels").attrs["names"]) + for label, label_name in enumerate(self.dataset["Labels"].attrs["names"]) if label in selected_labels] def init_example_indces(self, example_indices=None): + 
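As a side note on the new example_ids handling above: purely numeric identifiers get an "ID_" prefix, and defaults are generated when none are supplied. A toy sketch using the is_just_number helper introduced later in this diff:

def is_just_number(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

raw_ids = ["patient_a", "42", "0.5", "ctrl_7"]  # invented identifiers
example_ids = [example_id if not is_just_number(example_id)
               else "ID_" + example_id for example_id in raw_ids]
# -> ['patient_a', 'ID_42', 'ID_0.5', 'ctrl_7']

# With no ids given, defaults are derived from the example index.
nb_examples = 4
default_ids = ["ID_" + str(i) for i in range(nb_examples)]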
"""If no example indices are provided, selects all the examples.""" if example_indices is None: return range(self.get_nb_examples()) else: return example_indices def get_v(self, view_index, example_indices=None): + """ + Selects the view to extract + Parameters + ---------- + view_index : int + The index of the view to extract + example_indices : numpy.ndarray + The array containing the indices of the examples to extract. + + Returns + ------- + A numpy.ndarray containing the view data for the needed examples + """ example_indices = self.init_example_indces(example_indices) if type(example_indices) is int: - return self.dataset.get("View" + str(view_index))[example_indices, :] + return self.dataset["View" + str(view_index)][example_indices, :] else: example_indices = np.array(example_indices) sorted_indices = np.argsort(example_indices) example_indices = example_indices[sorted_indices] - if not self.dataset.get("View" + str(view_index)).attrs["sparse"]: - return self.dataset.get("View" + str(view_index))[example_indices, :][ + if not self.dataset["View" + str(view_index)].attrs["sparse"]: + return self.dataset["View" + str(view_index)][()][example_indices, :][ np.argsort(sorted_indices), :] else: sparse_mat = sparse.csr_matrix( - (self.dataset.get("View" + str(view_index)).get("data").value, - self.dataset.get("View" + str(view_index)).get("indices").value, - self.dataset.get("View" + str(view_index)).get("indptr").value), - shape=self.dataset.get("View" + str(view_index)).attrs["shape"])[ + (self.dataset["View" + str(view_index)]["data"][()], + self.dataset["View" + str(view_index)]["indices"][()], + self.dataset["View" + str(view_index)]["indptr"][()]), + shape=self.dataset["View" + str(view_index)].attrs["shape"])[ example_indices, :][ np.argsort(sorted_indices), :] return sparse_mat - def get_shape(self, example_indices=None): - return self.get_v(0,example_indices=example_indices).shape + def get_shape(self, view_index=0, example_indices=None): + """Gets the shape of the needed view""" + return self.get_v(view_index,example_indices=example_indices).shape def get_nb_class(self, example_indices=None): + """Gets the number of class of the dataset""" example_indices = self.init_example_indces(example_indices) - return len(np.unique(self.dataset.get("Labels").value[example_indices])) + return len(np.unique(self.dataset["Labels"][()][example_indices])) def get_labels(self, example_indices=None): example_indices = self.init_example_indces(example_indices) - return self.dataset.get("Labels").value[example_indices] + return self.dataset["Labels"][()][example_indices] def copy_view(self, target_dataset=None, source_view_name=None, target_view_index=None, example_indices=None): @@ -220,7 +270,7 @@ class Dataset(): new_d_set = target_dataset.create_dataset("View"+str(target_view_index), data=self.get_v(self.view_dict[source_view_name], example_indices=example_indices)) - for key, value in self.dataset.get("View"+str(self.view_dict[source_view_name])).attrs.items(): + for key, value in self.dataset["View"+str(self.view_dict[source_view_name])].attrs.items(): new_d_set.attrs[key] = value def init_view_names(self, view_names=None): @@ -240,15 +290,23 @@ class Dataset(): dataset_file_path = os.path.join(path,self.get_name()+"_temp_filter.hdf5") new_dataset_file = h5py.File(dataset_file_path,"w") self.dataset.copy("Metadata", new_dataset_file) - new_dataset_file.get("Metadata").attrs["datasetLength"] = len(example_indices) - new_dataset_file.get("Metadata").attrs["nbClass"] = np.unique(labels) + if 
"example_ids" in self.dataset["Metadata"].keys(): + ex_ids = new_dataset_file["Metadata"]["example_ids"] + ex_ids = np.array([self.example_ids[example_indices]]).astype(np.dtype("S10")) + else: + new_dataset_file["Metadata"].create_dataset("example_ids", + (len(self.example_ids), ), + data=np.array(self.example_ids).astype(np.dtype("S10")), + dtype=np.dtype("S10")) + new_dataset_file["Metadata"].attrs["datasetLength"] = len(example_indices) + new_dataset_file["Metadata"].attrs["nbClass"] = np.unique(labels) new_dataset_file.create_dataset("Labels", data=labels) - new_dataset_file.get("Labels").attrs["names"] = [label_name.encode() + new_dataset_file["Labels"].attrs["names"] = [label_name.encode() if not isinstance(label_name, bytes) else label_name for label_name in label_names] view_names = self.init_view_names(view_names) - new_dataset_file.get("Metadata").attrs["nbView"] = len(view_names) + new_dataset_file["Metadata"].attrs["nbView"] = len(view_names) for new_index, view_name in enumerate(view_names): self.copy_view(target_dataset=new_dataset_file, source_view_name=view_name, @@ -271,18 +329,18 @@ class Dataset(): self.copy_view(target_dataset=noisy_dataset, source_view_name=self.get_view_name(view_index), target_view_index=view_index) - for view_index in range(noisy_dataset.get("Metadata").attrs["nbView"]): + for view_index in range(noisy_dataset["Metadata"].attrs["nbView"]): view_key = "View" + str(view_index) - view_dset = noisy_dataset.get(view_key) + view_dset = noisy_dataset[view_key] try: view_limits = self.dataset[ - "Metadata/View" + str(view_index) + "_limits"].value + "Metadata/View" + str(view_index) + "_limits"][()] except: import pdb;pdb.set_trace() view_ranges = view_limits[:, 1] - view_limits[:, 0] - normal_dist = random_state.normal(0, noise_std, view_dset.value.shape) + normal_dist = random_state.normal(0, noise_std, view_dset[()].shape) noise = normal_dist * view_ranges - noised_data = view_dset.value + noise + noised_data = view_dset[()] + noise noised_data = np.where(noised_data < view_limits[:, 0], view_limits[:, 0], noised_data) noised_data = np.where(noised_data > view_limits[:, 1], @@ -389,9 +447,12 @@ class Dataset(): return selected_label_names - - - +def is_just_number(string): + try: + float(string) + return True + except ValueError: + return False def datasets_already_exist(pathF, name, nbCores): """Used to check if it's necessary to copy datasets""" @@ -402,51 +463,6 @@ def datasets_already_exist(pathF, name, nbCores): pathF + name + str(coreIndex) + ".hdf5") return allDatasetExist -# def get_v(dataset, view_index, used_indices=None): -# # """Used to extract a view as a numpy array or a sparse mat from the HDF5 dataset""" -# # if used_indices is None: -# # used_indices = range(dataset.get("Metadata").attrs["datasetLength"]) -# # if type(used_indices) is int: -# # return dataset.get("View" + str(view_index))[used_indices, :] -# # else: -# # used_indices = np.array(used_indices) -# # sorted_indices = np.argsort(used_indices) -# # used_indices = used_indices[sorted_indices] -# # -# # if not dataset.get("View" + str(view_index)).attrs["sparse"]: -# # return dataset.get("View" + str(view_index))[used_indices, :][ -# # np.argsort(sorted_indices), :] -# # else: -# # sparse_mat = sparse.csr_matrix( -# # (dataset.get("View" + str(view_index)).get("data").value, -# # dataset.get("View" + str(view_index)).get("indices").value, -# # dataset.get("View" + str(view_index)).get("indptr").value), -# # shape=dataset.get("View" + str(view_index)).attrs["shape"])[ -# # 
used_indices, :][ -# # np.argsort(sorted_indices), :] -# # -# # return sparse_mat - - -def get_shape(dataset, view_index): - """Used to get the dataset shape even if it's sparse""" - if not dataset.get("View" + str(view_index)).attrs["sparse"]: - return dataset.get("View" + str(view_index)).shape - else: - return dataset.get("View" + str(view_index)).attrs["shape"] - - -def get_value(dataset): - """Used to get the value of a view in the HDF5 dataset even if it sparse""" - if not dataset.attrs["sparse"]: - return dataset.value - else: - sparse_mat = sparse.csr_matrix((dataset.get("data").value, - dataset.get("indices").value, - dataset.get("indptr").value), - shape=dataset.attrs["shape"]) - return sparse_mat - def extract_subset(matrix, used_indices): """Used to extract a subset of a matrix even if it's sparse""" @@ -555,10 +571,3 @@ def input_(timeout=15): return sys.stdin.readline().strip() else: return "y" - -def get_monoview_shared(path, name, view_name, labels_names, classification_indices): - """ATM is not used with shared memory, but soon :)""" - hdf5_dataset_file = h5py.File(path + name + ".hdf5", "w") - X = hdf5_dataset_file.get(view_name).value - y = hdf5_dataset_file.get("Labels").value - return X, y diff --git a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py index a3f2e1d1d480a3bac9f12ac83931549741d4a757..11e7bd3bfcf459cade4825f8cb749344620d01dc 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py @@ -4,10 +4,8 @@ import logging import h5py import numpy as np -from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.validation import check_array -from ..utils.dataset import Dataset, copy_hdf5 +from ..utils.dataset import Dataset # Author-Info __author__ = "Baptiste Bauvin" @@ -40,12 +38,13 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, nb_features=10): """Used to generate a plausible dataset to test the algorithms""" - if not os.path.exists(os.path.dirname(path + "Plausible.hdf5")): + if not os.path.exists(os.path.dirname(path + "plausible.hdf5")): try: - os.makedirs(os.path.dirname(path + "Plausible.hdf5")) + os.makedirs(os.path.dirname(path + "plausible.hdf5")) except OSError as exc: if exc.errno != errno.EEXIST: raise + example_ids = ["exmaple_id_"+str(i) for i in range(nb_examples)] views = [] view_names = [] are_sparse = [] @@ -64,6 +63,8 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, fake_zero_indices = random_state.randint(int(nb_examples / 2), nb_examples, int(nb_examples / 12)) + for index in np.concatenate((fake_one_indices, fake_zero_indices)): + example_ids[index]+="noised" view_data[fake_one_indices] = np.ones( (len(fake_one_indices), nb_features)) @@ -74,12 +75,14 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, view_names.append("ViewNumber" + str(view_index)) are_sparse.append(False) + + dataset = Dataset(views=views, labels=labels, labels_names=label_names, view_names=view_names, - are_sparse=are_sparse, file_name="Plausible.hdf5", - path=path) + are_sparse=are_sparse, file_name="plausible.hdf5", + path=path, example_ids=example_ids) labels_dictionary = {0: "No", 1: "Yes"} - return dataset, labels_dictionary, "Plausible" + return dataset, labels_dictionary, "plausible" elif nb_class >= 3: firstBound = int(nb_examples / 3) rest = nb_examples - 2 * int(nb_examples / 3) @@ 
-115,10 +118,10 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, dataset = Dataset(views=views, labels=labels, labels_names=label_names, view_names=view_names, are_sparse=are_sparse, - file_name="Plausible.hdf5", - path=path) + file_name="plausible.hdf5", + path=path, example_ids=example_ids) labels_dictionary = {0: "No", 1: "Yes", 2: "Maybe"} - return dataset, labels_dictionary, "Plausible" + return dataset, labels_dictionary, "plausible" class DatasetError(Exception): diff --git a/multiview_platform/mono_multi_view_classifiers/utils/parameters.py b/multiview_platform/mono_multi_view_classifiers/utils/parameters.py deleted file mode 100644 index 2b61691f20124cb20fd7872aa8c44f5757397f02..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/parameters.py +++ /dev/null @@ -1,145 +0,0 @@ -import numpy as np - - -class Parameter_pdata(object): - class __Parameter_pdata: - nbr_i = 0 - # option de renormalisation des donnees - # la séparation se faisant à une permutation pret et à un facteur de - # renormalisation pret, on peut choisir de normaliser les données au debut - # de l'algo et/ou à chaque iteration de l'algo et/ou à la fin de l'algo - # on normalise A ou S - _data_norm = {'FlagInit': True, 'FlagIter': False, 'FlagEnd': False} - # % on normalise suivant les colonnes (1) 'dim' (norme des colonnes à 1) ou les - # 'dim'% lignes (2) (norme des lignes à 1) - _Norm = {'p': 1, 'dim': 1, 'x': 'A'} - _list_mode = ['real', 'simul'] - _list_x = ['A', 'S'] - - def __init__(self): - self._Norm['p'] = 1 - self._Norm['dim'] = 1 - self._Norm['x'] = self._list_x[0] - self.mode = self._list_mode[1] - self.sigma = 20000 - self.dim = 1 - if self.nbr_i > 0: - raise ValueError("Instance of class Parameter_pdata can be only one") - self.nbr_i += 1 - - def __str__(self): - return repr(self) - - instance = None - - # def __init__(self, arg): - # if not Parameter_pdata.instance: - # Parameter_pdata.instance = Parameter_pdata.__Parameter_pdata(arg) - # else: - # Parameter_pdata.instance.val = arg - - def __new__(cls): # _new_ est toujours une méthode de classe - if not Parameter_pdata.instance: - Parameter_pdata.instance = Parameter_pdata.__Parameter_pdata() - return Parameter_pdata.instance - - def __getattr__(self, attr): - return getattr(self.instance, attr) - - # def __setattr__(self, attr, val): - # return setattr(self.instance, attr, val) - - def __setattr__(self, name): - return setattr(self.instance, name) - - -class Parameter_palgo(object): - class __Parameter_palgo: - - nbr_i = 0 - _list_algo = ['BCVMFB', 'PALS', 'STALS', 'LSfro', 'LSkl'] - _stop = {'DifA': False, 'DifS': False, - 'ObjFct': True, 'threshold': np.finfo(float).eps} - _pfwt = {'w': 'db6', 'family_pfwt': 'db', - 'level': 10, 'K': 4, - 'Ls': 3000, 'L1': 3000, 'L2': 3000} - # _wavelette_type = ['db', 'db6'] - # 'LS' pour Lee et Seung - # 'Lips' pour la constante de Lipschitz - # 'PALM' pas de preconditionnement - _list_precond = ['LS', 'Lips', 'PALM'] - - def __init__(self): - self.flagWave = False - self.val = None - algo_value = self._list_algo[1] - self._algo = algo_value - self.gamma = 0.99 - self.inf = np.inf - self.eps = np.finfo(float).eps - self.niter = 1000 - self.eta_inf = 'eps' - self.eta_sup = 'inf' - self.alpha_A = 0.0 - self.p_A = 1 - self.p_S = 1 - self.alpha_S = 0.0 - # self.level = 10 - self.alpha_S_eval = False - self.stopThreshold = 10e-5, - self.precond = 'LS' # 'LS' pour Lee et Seung - self.F = None - self.Fstar = None - self.verbose = False - - if 
self.nbr_i > 0: - raise ValueError("Instance of class Parameter_pdata can be only one") - self.nbr_i += 1 - - def __str__(self): - return repr(self) + repr(self.val) - - @property - def algo(self): - return self._algo - - @algo.setter - def algo(self, algo_value): - if algo_value not in self._list_algo: - raise NameError("parameter algo must be in %s" % self._list_algo) - else: - self._algo = algo_value - - instance = None - - # def __init__(self, arg): - # if not Parameter_pdata.instance: - # Parameter_pdata.instance = Parameter_pdata.__Parameter_pdata(arg) - # else: - # Parameter_pdata.instance.val = arg - - def __new__(cls): # _new_ est toujours une méthode de classe - if not Parameter_palgo.instance: - Parameter_palgo.instance = Parameter_palgo.__Parameter_palgo() - return Parameter_palgo.instance - - def __getattr__(self, attr): - return getattr(self.instance, attr) - - # def __setattr__(self, attr, val): - # return setattr(self.instance, attr, val) - - def __setattr__(self, name): - return setattr(self.instance, name) - - -if __name__ == '__main__': - a = Parameter_pdata() - a = Parameter_pdata() - b = Parameter_pdata() - b.val = 6 - b.x = 8 - a.x = 10 - param = Parameter_palgo() - algo = param._list_algo[3] - param.algo = algo diff --git a/multiview_platform/tests/test_ExecClassif.py b/multiview_platform/tests/test_ExecClassif.py index abbcd77f933e6c9f49dc74213388551bbe85e61d..ad86757828f53a732ded8785cbf0f199bbbdbc9d 100644 --- a/multiview_platform/tests/test_ExecClassif.py +++ b/multiview_platform/tests/test_ExecClassif.py @@ -219,7 +219,7 @@ def fakeBenchmarkExec_monocore(dataset_var=1, a=4, args=1): def fakegetResults(results, stats_iter, nb_multiclass, benchmark_arguments_dictionaries, multi_class_labels, metrics, classification_indices, directories, directory, - labels_dictionary, nb_examples, nb_labels): + labels_dictionary, nb_examples, nb_labels, example_ids): return 3 @@ -368,8 +368,7 @@ class Test_execOneBenchmark(unittest.TestCase): 1, 2, 1, 1, 2, 1, 21]), exec_monoview_multicore=fakeExecMono, - exec_multiview_multicore=fakeExecMulti, - init_multiview_arguments=fakeInitMulti) + exec_multiview_multicore=fakeExecMulti,) cls.assertEqual(flag, None) cls.assertEqual(results , @@ -428,8 +427,7 @@ class Test_execOneBenchmark_multicore(unittest.TestCase): flag=None, labels=np.array([0, 1, 2, 3, 4, 2, 2, 12, 1, 2, 1, 1, 2, 1, 21]), exec_monoview_multicore=fakeExecMono, - exec_multiview_multicore=fakeExecMulti, - init_multiview_arguments=fakeInitMulti) + exec_multiview_multicore=fakeExecMulti,) cls.assertEqual(flag, None) cls.assertEqual(results , diff --git a/multiview_platform/tests/test_ResultAnalysis.py b/multiview_platform/tests/test_ResultAnalysis.py index bc739072790e9730058d7a9f916f66d512e7c31f..bcf63fc7644acae02f6466a4198079bde42bc0af 100644 --- a/multiview_platform/tests/test_ResultAnalysis.py +++ b/multiview_platform/tests/test_ResultAnalysis.py @@ -1,56 +1,267 @@ -# import unittest -# import numpy as np -# -# from ..mono_multi_view_classifiers import ResultAnalysis -# -# -# class Test_getMetricsScoresBiclass(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.metrics = [["accuracy_score"]] -# cls.monoViewResults = [["", ["chicken_is_heaven", ["View0"], {"accuracy_score": [0.5,0.7]}]]] -# cls.multiviewResults = [["Mumbo", {"":""}, {"accuracy_score":[0.6,0.8]}]] -# -# def test_simple(cls): -# res = ResultAnalysis.getMetricsScoresBiclass(cls.metrics, cls.monoViewResults, cls.multiviewResults) -# cls.assertIn("accuracy_score",res) -# 
cls.assertEqual(type(res["accuracy_score"]), dict) -# cls.assertEqual(res["accuracy_score"]["classifiers_names"], ["chicken_is_heaven-View0", "Mumbo"]) -# cls.assertEqual(res["accuracy_score"]["train_scores"], [0.5, 0.6]) -# cls.assertEqual(res["accuracy_score"]["test_scores"], [0.7, 0.8]) -# -# def test_only_multiview(cls): -# cls.monoViewResults = [] -# res = ResultAnalysis.getMetricsScoresBiclass(cls.metrics, cls.monoViewResults, cls.multiviewResults) -# cls.assertIn("accuracy_score",res) -# cls.assertEqual(type(res["accuracy_score"]), dict) -# cls.assertEqual(res["accuracy_score"]["classifiers_names"], ["Mumbo"]) -# cls.assertEqual(res["accuracy_score"]["train_scores"], [0.6]) -# cls.assertEqual(res["accuracy_score"]["test_scores"], [0.8]) -# -# def test_only_monoview(cls): -# cls.multiviewResults = [] -# res = ResultAnalysis.getMetricsScoresBiclass(cls.metrics, cls.monoViewResults, cls.multiviewResults) -# cls.assertIn("accuracy_score",res) -# cls.assertEqual(type(res["accuracy_score"]), dict) -# cls.assertEqual(res["accuracy_score"]["classifiers_names"], ["chicken_is_heaven-View0"]) -# cls.assertEqual(res["accuracy_score"]["train_scores"], [0.5]) -# cls.assertEqual(res["accuracy_score"]["test_scores"], [0.7]) -# -# -# class Test_getExampleErrorsBiclass(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.usedBenchmarkArgumentDictionary = {"labels": np.array([0,1,1,-100,-100,0,1,1,-100])} -# cls.monoViewResults = [["", ["chicken_is_heaven", ["View0"], {}, np.array([1,1,1,-100,-100,0,1,1,-100])]]] -# cls.multiviewResults = [["Mumbo", {"":""}, {}, np.array([0,0,1,-100,-100,0,1,1,-100])]] -# -# def test_simple(cls): -# res = ResultAnalysis.getExampleErrorsBiclass(cls.usedBenchmarkArgumentDictionary, cls.monoViewResults, -# cls.multiviewResults) -# cls.assertIn("chicken_is_heaven-View0", res) -# cls.assertIn("Mumbo", res) -# np.testing.assert_array_equal(res["Mumbo"], np.array([1,0,1,-100,-100,1,1,1,-100])) -# np.testing.assert_array_equal(res["chicken_is_heaven-View0"], np.array([0,1,1,-100,-100,1,1,1,-100])) +import unittest +import numpy as np +import pandas as pd +import time + +from ..mono_multi_view_classifiers import result_analysis +from ..mono_multi_view_classifiers.multiview.multiview_utils import MultiviewResult +from ..mono_multi_view_classifiers.monoview.monoview_utils import MonoviewResult + + +class Test_get_arguments(unittest.TestCase): + + def setUp(self): + self.benchamrk_argument_dictionaries = [{"flag":"good_flag", "valid":True}, + {"flag":"bad_flag", "valid":False}] + + def test_benchmark_wanted(self): + argument_dict = result_analysis.get_arguments(self.benchamrk_argument_dictionaries, "good_flag") + self.assertTrue(argument_dict["valid"]) + + +class Test_get_metrics_scores_biclass(unittest.TestCase): + + + def test_simple(self): + metrics = [["accuracy_score"], ["f1_score"]] + results = [MonoviewResult(0, + "ada", + "0", + {"accuracy_score":[0.9, 0.95], + "f1_score":[0.91, 0.96]} + , "", "", "", "")] + metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, + results) + self.assertIsInstance(metrics_scores, dict) + self.assertIsInstance(metrics_scores["accuracy_score"], pd.DataFrame) + np.testing.assert_array_equal(np.array(metrics_scores["accuracy_score"].loc["train"]), np.array([0.9])) + np.testing.assert_array_equal( + np.array(metrics_scores["accuracy_score"].loc["test"]), + np.array([0.95])) + np.testing.assert_array_equal( + np.array(metrics_scores["f1_score"].loc["train"]), + np.array([0.91])) + np.testing.assert_array_equal( + 
np.array(metrics_scores["f1_score"].loc["test"]), + np.array([0.96])) + np.testing.assert_array_equal(np.array(metrics_scores["f1_score"].columns), + np.array(["ada-0"])) + + def multiple_monoview_classifiers(self): + metrics = [["accuracy_score"], ["f1_score"]] + results = [MonoviewResult(0, + "ada", + "0", + {"accuracy_score": [0.9, 0.95], + "f1_score": [0.91, 0.96]} + , "", "", "", ""), + MonoviewResult(0, + "dt", + "1", + {"accuracy_score": [0.8, 0.85], + "f1_score": [0.81, 0.86]} + , "", "", "", "") + ] + metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, + results) + self.assertIsInstance(metrics_scores, dict) + self.assertIsInstance(metrics_scores["accuracy_score"], pd.DataFrame) + np.testing.assert_array_equal( + np.array(metrics_scores["accuracy_score"].loc["train"]), + np.array([0.9, 0.8])) + np.testing.assert_array_equal( + np.array(metrics_scores["accuracy_score"].loc["test"]), + np.array([0.95, 0.85])) + np.testing.assert_array_equal( + np.array(metrics_scores["f1_score"].loc["train"]), + np.array([0.91, 0.81])) + np.testing.assert_array_equal( + np.array(metrics_scores["f1_score"].loc["test"]), + np.array([0.96, 0.86])) + np.testing.assert_array_equal( + np.array(metrics_scores["f1_score"].columns), + np.array(["ada-0", "dt-1"])) + + def mutiview_result(self): + metrics = [["accuracy_score"], ["f1_score"]] + results = [MultiviewResult("mv", "", {"accuracy_score": [0.7, 0.75], + "f1_score": [0.71, 0.76]}, "", ""), + MonoviewResult(0, + "dt", + "1", + {"accuracy_score": [0.8, 0.85], + "f1_score": [0.81, 0.86]} + , "", "", "", "") + ] + metrics_scores = result_analysis.get_metrics_scores_biclass(metrics, + results) + self.assertIsInstance(metrics_scores, dict) + self.assertIsInstance(metrics_scores["accuracy_score"], pd.DataFrame) + np.testing.assert_array_equal( + np.array(metrics_scores["accuracy_score"].loc["train"]), + np.array([0.7, 0.8])) + np.testing.assert_array_equal( + np.array(metrics_scores["accuracy_score"].loc["test"]), + np.array([0.75, 0.85])) + np.testing.assert_array_equal( + np.array(metrics_scores["f1_score"].loc["train"]), + np.array([0.71, 0.81])) + np.testing.assert_array_equal( + np.array(metrics_scores["f1_score"].loc["test"]), + np.array([0.76, 0.86])) + np.testing.assert_array_equal( + np.array(metrics_scores["f1_score"].columns), + np.array(["mv", "dt-1"])) + +class Test_get_example_errors_biclass(unittest.TestCase): + + def test_simple(self): + ground_truth = np.array([0,1,0,1,0,1,0,1, -100]) + results = [MultiviewResult("mv", "", {"accuracy_score": [0.7, 0.75], + "f1_score": [0.71, 0.76]}, + np.array([0,0,0,0,1,1,1,1,1]), + ""), + MonoviewResult(0, + "dt", + "1", + {"accuracy_score": [0.8, 0.85], + "f1_score": [0.81, 0.86]} + , np.array([0,0,1,1,0,0,1,1,0]), "", "", "") + ] + example_errors = result_analysis.get_example_errors_biclass(ground_truth, + results) + self.assertIsInstance(example_errors, dict) + np.testing.assert_array_equal(example_errors["mv"], + np.array([1,0,1,0,0,1,0,1,-100])) + np.testing.assert_array_equal(example_errors["dt-1"], + np.array([1, 0, 0, 1, 1, 0, 0, 1,-100])) + + +class Test_init_plot(unittest.TestCase): + + def test_simple(self): + results = [] + metric_name = "acc" + data = np.random.RandomState(42).uniform(0,1,(2,2)) + metric_dataframe = pd.DataFrame(index=["train", "test"], columns=["dt-1", "mv"], data=data) + directory = "dir" + database_name = 'db' + labels_names = ['lb1', "lb2"] + train, test, classifier_names, \ + file_name, nb_results, results = result_analysis.init_plot(results, + 
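A toy reconstruction (not the library code) of the error encoding that Test_get_example_errors_biclass above expects: 1 where the prediction matches the ground truth, 0 where it does not, and the -100 marker kept for examples that were not used.

import numpy as np

ground_truth = np.array([0, 1, 0, 1, 0, 1, 0, 1, -100])  # -100 = unused example
prediction = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1])

errors = np.equal(ground_truth, prediction).astype(int)
errors[ground_truth == -100] = -100
# -> [1, 0, 1, 0, 0, 1, 0, 1, -100]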
metric_name, + metric_dataframe, + directory, + database_name, + labels_names) + self.assertEqual(file_name, "dir"+time.strftime( + "%Y_%m_%d-%H_%M_%S")+"-db-lb1_vs_lb2-acc") + np.testing.assert_array_equal(train, data[0,:]) + np.testing.assert_array_equal(test, data[1, :]) + np.testing.assert_array_equal(classifier_names, np.array(["dt-1", "mv"])) + self.assertEqual(nb_results, 2) + self.assertEqual(results, [["dt-1", "acc", data[1,0], 0], ["mv", "acc", data[1,1], 0]]) + +class Test_gen_error_data(unittest.TestCase): + + def test_simple(self): + random_state = np.random.RandomState(42) + ada_data = random_state.randint(0,2,size=7) + mv_data = random_state.randint(0, 2, size=7) + example_errors = {"ada-1": ada_data, + "mv": mv_data} + nb_classifiers, nb_examples, classifiers_names, \ + data_2d, error_on_examples = result_analysis.gen_error_data(example_errors) + self.assertEqual(nb_classifiers, 2) + self.assertEqual(nb_examples, 7) + self.assertEqual(classifiers_names, ["ada-1", "mv"]) + np.testing.assert_array_equal(data_2d, np.array([ada_data, mv_data]).transpose()) + np.testing.assert_array_equal(error_on_examples, -1*(ada_data+mv_data)/nb_classifiers) + + +class Test_format_previous_results(unittest.TestCase): + + def test_simple(self): + biclass_results = {"01":{"metrics_scores":[], "example_errors":[]}} + random_state = np.random.RandomState(42) + + # Gen metrics data + metrics_1_data = random_state.uniform(size=(2,2)) + metrics_2_data = random_state.uniform(size=(2,2)) + metric_1_df = pd.DataFrame(data=metrics_1_data, index=["train", "test"], + columns=["ada-1", "mv"]) + metric_2_df = pd.DataFrame(data=metrics_2_data, index=["train", "test"], + columns=["ada-1", "mv"]) + biclass_results["01"]["metrics_scores"].append({"acc": metric_1_df}) + biclass_results["01"]["metrics_scores"].append({"acc": metric_2_df}) + + # Gen error data + ada_error_data_1 = random_state.randint(0,2,7) + ada_error_data_2 = random_state.randint(0, 2, 7) + ada_sum = ada_error_data_1+ada_error_data_2 + mv_error_data_1 = random_state.randint(0, 2, 7) + mv_error_data_2 = random_state.randint(0, 2, 7) + mv_sum = mv_error_data_1+mv_error_data_2 + biclass_results["01"]["example_errors"].append({}) + biclass_results["01"]["example_errors"].append({}) + biclass_results["01"]["example_errors"][0]["ada-1"] = ada_error_data_1 + biclass_results["01"]["example_errors"][0]["mv"] = mv_error_data_1 + biclass_results["01"]["example_errors"][1]["ada-1"] = ada_error_data_2 + biclass_results["01"]["example_errors"][1]["mv"] = mv_error_data_2 + + # Running the function + metric_analysis, error_analysis = result_analysis.format_previous_results(biclass_results) + mean_df = pd.DataFrame(data=np.mean(np.array([metrics_1_data, + metrics_2_data]), + axis=0), + index=["train", "test"], + columns=["ada-1", "mvm"]) + std_df = pd.DataFrame(data=np.std(np.array([metrics_1_data, + metrics_2_data]), + axis=0), + index=["train", "test"], + columns=["ada-1", "mvm"]) + + # Testing + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["train"], + mean_df.loc["train"]) + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["test"], + mean_df.loc["test"]) + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["train"], + std_df.loc["train"]) + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["test"], + std_df.loc["test"]) + np.testing.assert_array_equal(ada_sum, error_analysis["01"]["ada-1"]) + np.testing.assert_array_equal(mv_sum, error_analysis["01"]["mv"]) + + +class 
Test_gen_error_data_glob(unittest.TestCase): + + def test_simple(self): + random_state = np.random.RandomState(42) + + ada_error_data_1 = random_state.randint(0,2,7) + ada_error_data_2 = random_state.randint(0, 2, 7) + ada_sum = ada_error_data_1+ada_error_data_2 + mv_error_data_1 = random_state.randint(0, 2, 7) + mv_error_data_2 = random_state.randint(0, 2, 7) + mv_sum = mv_error_data_1+mv_error_data_2 + + combi_results = {"ada-1":ada_sum, "mv": mv_sum} + + stats_iter = 2 + + nb_examples, nb_classifiers, \ + data, error_on_examples, \ + classifier_names = result_analysis.gen_error_data_glob(combi_results, + stats_iter) + self.assertEqual(nb_examples, 7) + self.assertEqual(nb_classifiers, 2) + np.testing.assert_array_equal(data, np.array([ada_sum, mv_sum]).transpose()) + np.testing.assert_array_equal(error_on_examples, -1*np.sum(np.array([ada_sum, mv_sum]), axis=0)+(nb_classifiers*stats_iter)) + self.assertEqual(classifier_names, ["ada-1", "mv"]) + + + + + diff --git a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py b/multiview_platform/tests/test_utils/test_GetMultiviewDB.py index a61bfbf3d0e967b8d849893f52c3e7e5967d545e..a9f5dae8922e31bc6f364076171f6f06ffe1db2d 100644 --- a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py +++ b/multiview_platform/tests/test_utils/test_GetMultiviewDB.py @@ -21,7 +21,7 @@ class Test_get_classic_db_hdf5(unittest.TestCase): self.views = [self.rs.randint(0, 10, size=(self.nb_examples, 7)) for _ in range(self.nb_view)] self.labels = self.rs.randint(0, self.nb_class, self.nb_examples) - self.dataset_file = h5py.File(os.path.join(tmp_path, self.file_name)) + self.dataset_file = h5py.File(os.path.join(tmp_path, self.file_name), 'w') self.view_names = ["ViewN" + str(index) for index in range(len(self.views))] self.are_sparse = [False for _ in self.views] diff --git a/multiview_platform/tests/test_utils/test_dataset.py b/multiview_platform/tests/test_utils/test_dataset.py index dcfcb353d97ae45109c3f86c6a8d2e705f9d7207..6125243c08f1d6d82098f632fa28966b3a9564af 100644 --- a/multiview_platform/tests/test_utils/test_dataset.py +++ b/multiview_platform/tests/test_utils/test_dataset.py @@ -22,7 +22,7 @@ class Test_Dataset(unittest.TestCase): cls.views = [cls.rs.randint(0, 10, size=(cls.nb_examples, cls.nb_attr)) for _ in range(cls.nb_view)] cls.labels = cls.rs.randint(0, cls.nb_class, cls.nb_examples) - cls.dataset_file = h5py.File(os.path.join(tmp_path, cls.file_name)) + cls.dataset_file = h5py.File(os.path.join(tmp_path, cls.file_name), "w") cls.view_names = ["ViewN" + str(index) for index in range(len(cls.views))] cls.are_sparse = [False for _ in cls.views] for view_index, (view_name, view, is_sparse) in enumerate( @@ -50,7 +50,7 @@ class Test_Dataset(unittest.TestCase): def test_filter(self): """Had to create a new dataset to aviod playing with the class one""" file_name = "test_filter.hdf5" - dataset_file_filter = h5py.File(os.path.join(tmp_path, file_name)) + dataset_file_filter = h5py.File(os.path.join(tmp_path, file_name), "w") for view_index, (view_name, view, is_sparse) in enumerate( zip(self.view_names, self.views, self.are_sparse)): view_dataset = dataset_file_filter.create_dataset( @@ -155,7 +155,7 @@ class Test_Dataset(unittest.TestCase): source_view_name="ViewN0", target_view_index=1) self.assertIn("View1", list(new_dataset.keys())) - np.testing.assert_array_equal(dataset_object.get_v(0), new_dataset["View1"].value) + np.testing.assert_array_equal(dataset_object.get_v(0), new_dataset["View1"][()]) 
self.assertEqual(new_dataset["View1"].attrs["name"], "ViewN0") new_dataset.close() os.remove(os.path.join(tmp_path, "test_copy.hdf5")) @@ -180,7 +180,7 @@ class Test_Dataset(unittest.TestCase): def test_select_views_and_labels(self): file_name = "test_filter.hdf5" - dataset_file_select = h5py.File(os.path.join(tmp_path, file_name)) + dataset_file_select = h5py.File(os.path.join(tmp_path, file_name), "w") for view_index, (view_name, view, is_sparse) in enumerate( zip(self.view_names, self.views, self.are_sparse)): view_dataset = dataset_file_select.create_dataset( @@ -208,7 +208,7 @@ class Test_Dataset(unittest.TestCase): def test_add_gaussian_noise(self): file_name = "test_noise.hdf5" - dataset_file_select = h5py.File(os.path.join(tmp_path, file_name)) + dataset_file_select = h5py.File(os.path.join(tmp_path, file_name), "w") limits = np.zeros((self.nb_attr, 2)) limits[:, 1] += 100 meta_data_grp = dataset_file_select.create_group("Metadata") diff --git a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py index b5dfe409c98f7ecc18ddeccf6eaca53216f53aed..03a9655bbc10e0c8001a479897fe084db48f95a5 100644 --- a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py +++ b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py @@ -55,7 +55,7 @@ class Test_randomized_search(unittest.TestCase): def test_simple(self): best_params, test_folds_preds = hyper_parameter_search.randomized_search( - self.dataset, self.labels.value, "multiview", self.random_state, tmp_path, + self.dataset, self.labels[()], "multiview", self.random_state, tmp_path, weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds, 1, ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices) diff --git a/requirements.txt b/requirements.txt index 3899b3fa4a24155369d0b13c09a4f8639428e4c1..2db0c7eda8a0cf170f342967ce54b19a7e70ea1d 100755 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ m2r==0.2.1 docutils==0.12 pyyaml==3.12 cvxopt==1.2.0 --e git+https://github.com/IvanoLauriola/MKLpy.git#egg=MKLpy \ No newline at end of file +-e git+https://github.com/IvanoLauriola/MKLpy.git#egg=MKLpy +plotly==4.2.1 diff --git a/setup.py b/setup.py index f715ce8708c8a718f3229a381ddd929108cc226e..6ce0e776fa03b342229182d40253500f14a4c5ec 100644 --- a/setup.py +++ b/setup.py @@ -55,7 +55,8 @@ def setup_package(): install_requires=['numpy>=1.16', 'scipy>=0.16','scikit-learn==0.19', 'matplotlib', 'h5py', 'joblib', 'pandas', 'm2r', 'pyyaml', 'pyscm @ git+https://github.com/aldro61/pyscm', - 'cvxopt', 'MKLpy @ git+https://github.com/IvanoLauriola/MKLpy'], + 'MKLpy @ git+https://github.com/IvanoLauriola/MKLpy', + 'cvxopt', 'plotly==4.2.1'], # Il est d'usage de mettre quelques metadata à propos de sa lib # Pour que les robots puissent facilement la classer.
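A closing note on the recurring h5py fixes in the tests above, shown as a minimal sketch with a hypothetical file name: files are now opened with an explicit mode (relying on the old implicit default is deprecated in recent h5py releases), and datasets are materialised with [()] instead of the deprecated .value accessor.

import h5py
import numpy as np

with h5py.File("demo.hdf5", "w") as dataset_file:    # explicit write mode
    dataset_file.create_dataset("Labels", data=np.array([0, 1, 1, 0]))

with h5py.File("demo.hdf5", "r") as dataset_file:    # explicit read mode
    labels = dataset_file["Labels"][()]              # instead of .value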