From 761622efcddc698d1e8b2c98f8e30f4744d34618 Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr>
Date: Thu, 10 Oct 2019 08:22:57 -0400
Subject: [PATCH] Monoview working, args multiview to discuss

---
 .gitignore                                    |   1 +
 config_files/config.yml                       |  43 +-
 multiview_platform/__init__.py                |   2 +-
 .../exec_classif.py                           | 102 +-
 .../monoview/exec_classif_mono_view.py        |  45 +-
 .../multiview/exec_multiview.py               |   6 +-
 .../multiview/multiview_utils.py              |   4 +-
 .../additions/fusion_utils.py                 |   2 +-
 .../additions/late_fusion_utils.py            |   4 +-
 .../multiview_classifiers/additions/utils.py  |   4 +-
 .../weighted_linear_early_fusion.py           |   4 +-
 .../utils/dataset.py                          | 227 ++++-
 .../utils/execution.py                        |  29 +-
 .../utils/get_multiview_db.py                 | 388 ++++---
 multiview_platform/tests/test_ExecClassif.py  |  16 +-
 .../test_weighted_linear_early_fusion.py      |  54 +-
 .../tests/test_utils/test_GetMultiviewDB.py   | 957 ++++++++----------
 .../tests/test_utils/test_dataset.py          | 190 +++-
 .../test_utils/test_hyper_parameter_search.py |   6 +-
 multiview_platform/tests/utils.py             |  37 +
 20 files changed, 1150 insertions(+), 971 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3c5d0bd4..4d1a1ab0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ TODO
 ipynb/.ipynb_checkpoints/**
 docs/source/monomulti/.ipynb_checkpoints/**
 results/**
+data/**
 Data/**
 multiview_platform/MonoMultiviewClassifiers/Results/*
 multiview_platform/tests/temp_tests/**

diff --git a/config_files/config.yml b/config_files/config.yml
index 318ef51d..33c5469e 100644
--- a/config_files/config.yml
+++ b/config_files/config.yml
@@ -4,7 +4,7 @@ Base :
   name: ["Plausible"]
   label: "_"
   type: ".hdf5"
-  views: ["all"]
+  views:
   pathf: "../data/"
   nice: 0
   random_state: 42
@@ -21,9 +21,9 @@ Classification:
   split: 0.8
   nb_folds: 2
   nb_class: 2
-  classes: ["yes", "no"]
-  type: ["multiview",]
-  algos_monoview: ["all"]
+  classes:
+  type: ["multiview"]
+  algos_monoview: ["decision_tree", "adaboost"]
   algos_multiview: ["all"]
   stats_iter: 2
   metrics: ["accuracy_score", "f1_score"]
@@ -170,6 +170,41 @@ double_fault_fusion:
     splitter: ["best"]

 difficulty_fusion:
+  classifier_names: ["decision_tree"]
+  classifier_configs:
+    decision_tree:
+      max_depth: [1]
+      criterion: ["gini"]
+      splitter: ["best"]
+
+scm_late_fusion:
+  classifier_names: ["decision_tree"]
+  p: 0.1
+  max_rules: 10
+  model_type: 'conjunction'
+  classifier_configs:
+    decision_tree:
+      max_depth: [1]
+      criterion: ["gini"]
+      splitter: ["best"]
+
+majority_voting_fusion:
+  classifier_names: ["decision_tree"]
+  classifier_configs:
+    decision_tree:
+      max_depth: [1]
+      criterion: ["gini"]
+      splitter: ["best"]
+
+bayesian_inference_fusion:
+  classifier_names: ["decision_tree"]
+  classifier_configs:
+    decision_tree:
+      max_depth: [1]
+      criterion: ["gini"]
+      splitter: ["best"]
+
+weighted_linear_late_fusion:
-  classifier_names: ["decison_tree"]
+  classifier_names: ["decision_tree"]
   classifier_configs:
     decision_tree:
       max_depth: [1]
       criterion: ["gini"]
       splitter: ["best"]

diff --git a/multiview_platform/__init__.py b/multiview_platform/__init__.py
index f91d950e..3ca9940b 100644
--- a/multiview_platform/__init__.py
+++ b/multiview_platform/__init__.py
@@ -2,4 +2,4 @@

 __version__ = "0.0.0.0"

-from . import mono_multi_view_classifiers, tests, execute, versions
+from . import mono_multi_view_classifiers, execute, versions
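A note on the config.yml hunks above: `views:` and `classes:` are now left empty, which YAML parses as None, and None replaces the old `["all"]` sentinel (see the init_views rework in utils/execution.py further down). A minimal sketch of the convention, assuming only that the config file exists at the path below; the fallback view names are hypothetical:

import yaml

with open("config_files/config.yml") as config_file:
    config = yaml.safe_load(config_file)

arg_views = config["Base"]["views"]  # an empty `views:` key is parsed as None
if arg_views is None:
    # None now means "use every view of the dataset", mirroring what
    # execution.init_views(dataset_var, None) does below.
    arg_views = ["view_name_0", "view_name_1"]  # hypothetical view names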
diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py
index cba482d1..2adbba0d 100644
--- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py
+++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py
@@ -79,32 +79,32 @@ def init_benchmark(cl_type, monoview_algos, multiview_algos, args):
     return benchmark


-def gen_views_dictionnary(dataset_var, views):
-    r"""Used to generate a dictionary mapping a view name (key) to it's index in the dataset (value).
-
-    Parameters
-    ----------
-    dataset_var : `h5py` dataset file
-        The full dataset on which the benchmark will be done
-    views : List of strings
-        Names of the selected views on which the banchmark will be done
-
-    Returns
-    -------
-    viewDictionary : Dictionary
-        Dictionary mapping the view names totheir indexin the full dataset.
-    """
-    datasets_names = dataset_var.keys()
-    views_dictionary = {}
-    for dataset_name in datasets_names:
-        if dataset_name[:4] == "View":
-            view_name = dataset_var.get(dataset_name).attrs["name"]
-            if type(view_name) == bytes:
-                view_name = view_name.decode("utf-8")
-            if view_name in views:
-                views_dictionary[view_name] = int(dataset_name[4:])
-
-    return views_dictionary
+# def gen_views_dictionnary(dataset_var, views):
+#     r"""Used to generate a dictionary mapping a view name (key) to its index in the dataset (value).
+#
+#     Parameters
+#     ----------
+#     dataset_var : `h5py` dataset file
+#         The full dataset on which the benchmark will be done
+#     views : List of strings
+#         Names of the selected views on which the benchmark will be done
+#
+#     Returns
+#     -------
+#     viewDictionary : Dictionary
+#         Dictionary mapping the view names to their index in the full dataset.
+#     """
+#     datasets_names = dataset_var.get_view_dict().keys()
+#     views_dictionary = {}
+#     for dataset_name in datasets_names:
+#         if dataset_name[:4] == "View":
+#             view_name = dataset_var.get(dataset_name).attrs["name"]
+#             if type(view_name) == bytes:
+#                 view_name = view_name.decode("utf-8")
+#             if view_name in views:
+#                 views_dictionary[view_name] = int(dataset_name[4:])
+#
+#     return views_dictionary


 def init_argument_dictionaries(benchmark, views_dictionary,
@@ -312,7 +312,7 @@ def gen_multiple_args_dictionnaries(nb_class, kwargs_init, classifier,
     return args_dictionnaries


-def init_kwargs(args, classifiers_names):
+def init_kwargs(args, classifiers_names, framework="monoview"):
     r"""Used to init kwargs thanks to a function in each monoview classifier package.

     Parameters
@@ -330,42 +330,44 @@ def init_kwargs(args, classifiers_names):
    For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`"""
     logging.debug("Start:\t Initializing monoview classifiers arguments")
-    monoview_kwargs = {}
+    kwargs = {}
     for classifiers_name in classifiers_names:
         try:
-            getattr(monoview_classifiers, classifiers_name)
+            if framework=="monoview":
+                getattr(monoview_classifiers, classifiers_name)
+            else:
+                getattr(multiview_classifiers, classifiers_name)
         except AttributeError:
             raise AttributeError(
-                classifiers_name + " is not implemented in monoview_classifiers, "
-                                   "please specify the name of the file in monoview_classifiers")
+                classifiers_name + " is not implemented in " + framework + "_classifiers, "
+                "please specify the name of the file in " + framework + "_classifiers")
-        monoview_kwargs[
-            classifiers_name] = args[classifiers_name]
+        kwargs[classifiers_name] = args[classifiers_name]
     logging.debug("Done:\t Initializing monoview classifiers arguments")
-    return monoview_kwargs
+    return kwargs


 def init_kwargs_func(args, benchmark):
     monoview_kwargs = init_kwargs(args, benchmark["monoview"])
-    multiview_kwargs = init_kwargs(args, benchmark["multiview"])
+    multiview_kwargs = init_kwargs(args, benchmark["multiview"], framework="multiview")
     kwargs = {"monoview":monoview_kwargs, "multiview":multiview_kwargs}
     return kwargs


-def init_multiview_kwargs(args, classifiers_names):
-    logging.debug("Start:\t Initializing multiview classifiers arguments")
-    multiview_kwargs = {}
-    for classifiers_name in classifiers_names:
-        try:
-            getattr(multiview_classifiers, classifiers_name)
-        except AttributeError:
-            raise AttributeError(
-                classifiers_name + " is not implemented in mutliview_classifiers, "
-                                   "please specify the name of the coressponding .py "
-                                   "file in mutliview_classifiers")
-        multiview_kwargs[classifiers_name] = args[classifiers_name]
-    logging.debug("Done:\t Initializing multiview classifiers arguments")
-    return multiview_kwargs
+# def init_multiview_kwargs(args, classifiers_names):
+#     logging.debug("Start:\t Initializing multiview classifiers arguments")
+#     multiview_kwargs = {}
+#     for classifiers_name in classifiers_names:
+#         try:
+#             getattr(multiview_classifiers, classifiers_name)
+#         except AttributeError:
+#             raise AttributeError(
+#                 classifiers_name + " is not implemented in multiview_classifiers, "
+#                                    "please specify the name of the corresponding .py "
+#                                    "file in multiview_classifiers")
+#         multiview_kwargs[classifiers_name] = args[classifiers_name]
+#     logging.debug("Done:\t Initializing multiview classifiers arguments")
+#     return multiview_kwargs


 def init_multiview_arguments(args, benchmark, views, views_indices,
@@ -572,7 +574,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None,
                                           labels_dictionary, k_folds)
     logging.debug("Start:\t monoview benchmark")
     for arguments in argument_dictionaries["monoview"]:
-        X = dataset_var.get("View" + str(arguments["view_index"]))
+        X = dataset_var.get_v(arguments["view_index"])
         Y = labels
         results_monoview += [
             exec_monoview(directory, X, Y, args["Base"]["name"], labels_names,
@@ -681,7 +683,7 @@ def exec_benchmark(nb_cores, stats_iter, nb_multiclass,
     # Do everything with flagging
     nb_examples = len(classification_indices[0][0]) + len(
         classification_indices[0][1])
-    multiclass_ground_truth = dataset_var.get("Labels").value
+    multiclass_ground_truth = dataset_var.get_labels()
     logging.debug("Start:\t Analyzing predictions")
     results_mean_stds = get_results(results, stats_iter, nb_multiclass,
                                     benchmark_arguments_dictionaries,
@@ -755,7 +757,7 @@ def exec_classif(arguments):
     views, views_indices, all_views = execution.init_views(dataset_var,
                                                            args["Base"]["views"])
-    views_dictionary = gen_views_dictionnary(dataset_var, views)
+    views_dictionary = dataset_var.get_view_dict()
     nb_views = len(views)
     nb_class = dataset_var.get_nb_class()

diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
index 4492cdff..9bf64f7a 100644
--- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
+++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
@@ -16,7 +16,7 @@ from . import monoview_utils
 from .analyze_result import execute
 # Import own modules
 from .. import monoview_classifiers
-from ..utils.dataset import get_value, extract_subset
+from ..utils.dataset import get_value, extract_subset, Dataset
 from ..utils import hyper_parameter_search

 # Author-Info
@@ -33,31 +33,32 @@ def exec_monoview_multicore(directory, name, labels_names, classification_indice
                             hyper_param_search="randomized_search",
                             metrics=[["accuracy_score", None]], n_iter=30,
                             **args):
-    dataset_var = h5py.File(path + name + str(dataset_file_index) + ".hdf5", "r")
-    neededViewIndex = args["viewIndex"]
-    X = dataset_var.get("View" + str(neededViewIndex))
+    dataset_var = Dataset(hdf5_file=h5py.File(path + name + str(dataset_file_index) + ".hdf5", "r"))
+    neededViewIndex = args["view_index"]
+    X = dataset_var.get_v(neededViewIndex)
     Y = labels
     return exec_monoview(directory, X, Y, name, labels_names,
                          classification_indices, k_folds, 1, database_type,
                          path, random_state,
                          hyper_param_search=hyper_param_search,
-
-                         metrics=metrics, n_iter=n_iter, **args)
+                         metrics=metrics, n_iter=n_iter,
+                         view_name=dataset_var.get_view_name(args["view_index"]),
+                         **args)


 def exec_monoview(directory, X, Y, name, labels_names, classificationIndices,
                   KFolds, nbCores, databaseType, path,
                   randomState, hyper_param_search="randomized_search",
-                  metrics=[["accuracy_score", None]], nIter=30, **args):
+                  metrics=[["accuracy_score", None]], nIter=30, view_name="", **args):
     logging.debug("Start:\t Loading data")
     kwargs, \
     t_start, \
     feat, \
-    CL_type, \
+    classifier_name, \
     X, \
     learningRate, \
     labelsString, \
     outputFileName = initConstants(args, X, classificationIndices, labels_names,
-                                   name, directory)
+                                   name, directory, view_name)
     logging.debug("Done:\t Loading data")

     logging.debug(
@@ -65,7 +66,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices,
         feat) + " train ratio:"
         + str(learningRate) + ", CrossValidation k-folds: " + str(
             KFolds.n_splits) + ", cores:"
-        + str(nbCores) + ", algorithm : " + CL_type)
+        + str(nbCores) + ", algorithm : " + classifier_name)

     logging.debug("Start:\t Determine Train/Test split")
     X_train, y_train, X_test, y_test, X_test_multiclass = init_train_test(X, Y,
@@ -78,10 +79,10 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices,
     logging.debug("Done:\t Determine Train/Test split")

     logging.debug("Start:\t Generate classifier args")
-    classifierModule = getattr(monoview_classifiers, CL_type)
+    classifierModule = getattr(monoview_classifiers, classifier_name)
     classifier_class_name = classifierModule.classifier_class_name
-    clKWARGS, testFoldsPreds = getHPs(classifierModule, hyper_parameter_search,
-                                      nIter, CL_type, classifier_class_name,
+    clKWARGS, testFoldsPreds = getHPs(classifierModule, hyper_param_search,
+                                      nIter, classifier_name, classifier_class_name,
                                       X_train, y_train, randomState,
                                       outputFileName, KFolds,
                                       nbCores, metrics, kwargs)
@@ -115,7 +116,7 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices,
     stringAnalysis, \
     imagesAnalysis, \
     metricsScores = execute(name, classificationIndices, KFolds, nbCores,
-                            hyper_parameter_search, metrics, nIter, feat, CL_type,
+                            hyper_param_search, metrics, nIter, feat, classifier_name,
                             clKWARGS, labels_names, X.shape,
                             y_train, y_train_pred, y_test, y_test_pred, t_end,
                             randomState, classifier, outputFileName)
@@ -130,39 +131,35 @@ def exec_monoview(directory, X, Y, name, labels_names, classificationIndices,
     viewIndex = args["view_index"]
     if testFoldsPreds is None:
         testFoldsPreds = y_train_pred
-    return monoview_utils.MonoviewResult(viewIndex, CL_type, feat, metricsScores,
+    return monoview_utils.MonoviewResult(viewIndex, classifier_name, feat, metricsScores,
                                          full_labels_pred, clKWARGS,
                                          y_test_multiclass_pred, testFoldsPreds)
     # return viewIndex, [CL_type, feat, metricsScores, full_labels_pred, clKWARGS, y_test_multiclass_pred, testFoldsPreds]


-def initConstants(args, X, classificationIndices, labels_names, name, directory):
+def initConstants(args, X, classificationIndices, labels_names,
+                  name, directory, view_name):
     try:
         kwargs = args["args"]
     except KeyError:
         kwargs = args
     t_start = time.time()
-    if type(X.attrs["name"]) == bytes:
-        feat = X.attrs["name"].decode("utf-8")
-    else:
-        feat = X.attrs["name"]
     CL_type = kwargs["classifier_name"]
-    X = get_value(X)
     learningRate = float(len(classificationIndices[0])) / (
             len(classificationIndices[0]) + len(classificationIndices[1]))
     labelsString = "-".join(labels_names)
     CL_type_string = CL_type
     timestr = time.strftime("%Y_%m_%d-%H_%M_%S")
-    outputFileName = directory + CL_type_string + "/" + feat + "/" + timestr + "-results-" + CL_type_string + "-" + labelsString + \
+    outputFileName = directory + CL_type_string + "/" + view_name + "/" + timestr + "-results-" + CL_type_string + "-" + labelsString + \
                      '-learnRate_{0:.2f}'.format(
-                         learningRate) + '-' + name + "-" + feat + "-"
+                         learningRate) + '-' + name + "-" + view_name + "-"
     if not os.path.exists(os.path.dirname(outputFileName)):
         try:
             os.makedirs(os.path.dirname(outputFileName))
         except OSError as exc:
             if exc.errno != errno.EEXIST:
                 raise
-    return kwargs, t_start, feat, CL_type, X, learningRate, labelsString, outputFileName
+    return kwargs, t_start, view_name, CL_type, X, learningRate, labelsString, outputFileName


 def init_train_test(X, Y, classificationIndices):

diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
index 08288d55..9d22a6d4 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
@@ -33,12 +33,12 @@ def init_constants(kwargs, classification_indices, metrics, name, nb_cores, k_fo
     logging.info("Info\t: Classification - Database : " + str(
         name) + " ; Views : " + ", ".join(views) + " ; Algorithm : " +
                  classifier_name + " ; Cores : " + str(
-        nbCores) + ", Train ratio : " + str(learning_rate) +
+        nb_cores) + ", Train ratio : " + str(learning_rate) +
                  ", CV on " + str(k_folds.n_splits) + " folds")

     for view_index, view_name in zip(views_indices, views):
         logging.info("Info:\t Shape of " + str(view_name) + " :" + str(
-            get_shape(dataset_var, view_index)))
+            dataset_var.get_shape(view_index)))
     return classifier_name, t_start, views_indices, classifier_config, views, learning_rate


@@ -168,7 +168,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, k_folds
     string_analysis, images_analysis, metrics_scores = analyze_results.execute(
         classifier, train_labels, test_labels, dataset_var,
-        classifier_config, classificationIndices,
+        classifier_config, classification_indices,
         labels_dictionary, views, nb_cores, times, name, k_folds,
         hyper_param_search, n_iter, metrics,

diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py
index 3370cd99..6e017ba7 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py
@@ -87,9 +87,9 @@ class BaseMultiviewClassifier(BaseEstimator, ClassifierMixin):

 def get_examples_views_indices(dataset, examples_indices, view_indices, ):
     """This function is used to get all the examples indices and view indices if needed"""
     if view_indices is None:
-        view_indices = np.arange(dataset["Metadata"].attrs["nbView"])
+        view_indices = np.arange(dataset.nb_view)
     if examples_indices is None:
-        examples_indices = range(dataset["Metadata"].attrs["datasetLength"])
+        examples_indices = range(dataset.get_nb_examples())
     return examples_indices, view_indices

diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/fusion_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/fusion_utils.py
index 80f77ec9..47a5f15f 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/fusion_utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/fusion_utils.py
@@ -8,7 +8,7 @@ class BaseLateFusionClassifier():

     def init_monoview_estimator(self, classifier_name, classifier_index=None):
         if classifier_index is not None:
-            classifier_configs = self.classifier_configs[classifier_index]
+            classifier_configs = self.classifier_configs[classifier_name]
         else:
             classifier_configs = self.classifier_configs
         if classifier_configs is not None and classifier_name in classifier_configs:

diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/late_fusion_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/late_fusion_utils.py
index c05d319c..189e1680 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/late_fusion_utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/late_fusion_utils.py
@@ -47,11 +47,12 @@ class LateFusionClassifier(BaseMultiviewClassifier, BaseLateFusionClassifier):
                  classifier_configs=None, nb_cores=1, nb_view=None, weights=None):
         super(LateFusionClassifier, self).__init__(random_state)
         self.verif_clf_views(classifier_names, nb_view)
         self.nb_view = len(classifier_names)
         self.classifiers_names = classifier_names
         self.classifier_configs = classifier_configs
-        self.monoview_estimators = [self.init_monoview_estimator(classifier_name, classifier_index)
-                                    for classifier_index, classifier_name in self.classifiers_names]
+        self.monoview_estimators = [self.init_monoview_estimator(classifier_name, classifier_index)
+                                    for classifier_index, classifier_name
+                                    in enumerate(self.classifiers_names)]
         self.nb_cores = nb_cores
         self.accuracies = np.zeros(len(classifier_names))
         self.needProbas = False
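The late_fusion_utils.py hunk above fixes the estimator-building loop (the bare `in self.classifiers_names` tried to unpack each name string instead of an (index, name) pair), and fusion_utils.py now looks configurations up by classifier name rather than by index. A minimal sketch of how the two fit together; instantiating the base class directly is for illustration only (in practice a concrete fusion subclass configured in config.yml is used):

import numpy as np
from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions.late_fusion_utils import LateFusionClassifier

clf = LateFusionClassifier(
    random_state=np.random.RandomState(42),
    classifier_names=["decision_tree", "decision_tree"],  # one per view
    classifier_configs={"decision_tree": {"max_depth": 1,
                                          "criterion": "gini",
                                          "splitter": "best"}},
    nb_view=2)
# init_monoview_estimator(name, index) is now called once per enumerate()
# pair, and reads classifier_configs["decision_tree"] rather than configs[index].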
diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
index 6d3c8a7b..8158ec7f 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py
@@ -69,9 +69,9 @@ class BaseMultiviewClassifier(BaseEstimator, ClassifierMixin):

 def get_train_views_indices(dataset, train_indices, view_indices,):
     """This function is used to get all the examples indices and view indices if needed"""
     if view_indices is None:
-        view_indices = np.arange(dataset["Metadata"].attrs["nbView"])
+        view_indices = np.arange(dataset.nb_view)
     if train_indices is None:
-        train_indices = range(dataset["Metadata"].attrs["datasetLength"])
+        train_indices = range(dataset.get_nb_examples())
     return train_indices, view_indices

diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
index 25bd9c3e..aa86a71b 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py
@@ -1,7 +1,7 @@
 import numpy as np
 import inspect

-from ..utils.dataset import get_v
+# from ..utils.dataset import get_v
 from ..multiview.multiview_utils import BaseMultiviewClassifier, get_examples_views_indices, ConfigGenerator, get_available_monoview_classifiers

@@ -79,7 +79,7 @@ class WeightedLinearEarlyFusion(BaseMultiviewClassifier):
     def hdf5_to_monoview(self, dataset, exmaples):
         """Here, we concatenate the views for the asked examples """
         monoview_data = np.concatenate(
-            [get_v(dataset, view_idx, exmaples)
+            [dataset.get_v(view_idx, exmaples)
              for view_weight, (index, view_idx)
              in zip(self.view_weights, enumerate(self.view_indices))]
             , axis=1)

diff --git a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py
index 402f7263..2ab4003f 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py
@@ -13,11 +13,14 @@ from scipy import sparse

 class Dataset():

+    # The following methods use hdf5
+
     def __init__(self, views=None, labels=None, are_sparse=False,
                  file_name="dataset.hdf5", view_names=None, path="",
                  hdf5_file=None, labels_names=None):
         if hdf5_file is not None:
             self.dataset=hdf5_file
+            self.init_attrs()
         else:
             if not os.path.exists(os.path.dirname(os.path.join(path, file_name))):
                 try:
@@ -50,27 +53,39 @@ class Dataset():
             meta_data_grp.attrs["nbClass"] = len(np.unique(labels))
             meta_data_grp.attrs["datasetLength"] = len(labels)
             dataset_file.close()
-            dataset_file = h5py.File(os.path.join(path, file_name), "r")
-            self.dataset = dataset_file
+            self.update_hdf5_dataset(os.path.join(path, file_name))
+
+    def get_view_name(self, view_idx):
+        return self.dataset["View"+str(view_idx)].attrs["name"]
+
+    def init_attrs(self):
         self.nb_view = self.dataset.get("Metadata").attrs["nbView"]
         self.view_dict = self.get_view_dict()

+    def get_nb_examples(self):
+        return self.dataset.get("Metadata").attrs["datasetLength"]
+
     def get_view_dict(self):
         view_dict = {}
         for view_index in range(self.nb_view):
             view_dict[self.dataset.get("View" + str(view_index)).attrs["name"]] = view_index
         return view_dict

-    def get_label_names(self, decode=True):
+    def get_label_names(self, decode=True, example_indices=None):
+        example_indices = self.init_example_indces(example_indices)
+        selected_labels = self.get_labels(example_indices)
         if decode:
             return [label_name.decode("utf-8")
-                    for label_name in self.dataset.get("Labels").attrs["names"]]
+                    for label, label_name in enumerate(self.dataset.get("Labels").attrs["names"])
+                    if label in selected_labels]
         else:
-            return self.dataset.get("Labels").attrs["names"]
+            return [label_name
+                    for label, label_name in enumerate(self.dataset.get("Labels").attrs["names"])
+                    if label in selected_labels]

     def init_example_indces(self, example_indices=None):
         if example_indices is None:
-            return range(self.dataset.get("Metadata").attrs["datasetLength"])
+            return range(self.get_nb_examples())
         else:
             return example_indices
@@ -97,15 +112,8 @@ class Dataset():

         return sparse_mat

-    # def copy(self, examples_indices, views_indices, target_dataset):
-    #     new_dataset = Dataset(views=,
-    #                           labels=,
-    #                           are_sparse=,
-    #                           file_name=,
-    #                           view_names=,
-    #                           path=,
-    #                           labels_names=)
-    #     return self.dataset.copy(part_name, target_dataset)
+    def get_shape(self, view_index=0, example_indices=None):
+        return self.get_v(view_index, example_indices=example_indices).shape

     def get_nb_class(self, example_indices=None):
         example_indices = self.init_example_indces(example_indices)
@@ -113,17 +121,147 @@ class Dataset():

     def get_labels(self, example_indices=None):
         example_indices = self.init_example_indces(example_indices)
-        return self.dataset.get("Labels").value([example_indices])
+        return self.dataset.get("Labels").value[example_indices]

     def copy_view(self, target_dataset=None, source_view_name=None,
-                  target_view_name=None, example_indices=None):
+                  target_view_index=None, example_indices=None):
         example_indices = self.init_example_indces(example_indices)
-        new_d_set = target_dataset.create_dataset(target_view_name,
+        new_d_set = target_dataset.create_dataset("View"+str(target_view_index),
                                                   data=self.get_v(self.view_dict[source_view_name],
                                                                   example_indices=example_indices))
-        for key, value in self.get_v(self.view_dict[source_view_name]).attrs.items():
+        for key, value in self.dataset.get("View"+str(self.view_dict[source_view_name])).attrs.items():
             new_d_set.attrs[key] = value

+    def init_view_names(self, view_names=None):
+        if view_names is None:
+            return [key for key in self.get_view_dict().keys()]
+        else:
+            return view_names
+
+    def update_hdf5_dataset(self, path):
+        if hasattr(self, 'dataset'):
+            self.dataset.close()
+        self.dataset = h5py.File(path, 'r')
+        self.init_attrs()
+
+    def filter(self, labels, label_names, example_indices, view_names, path):
+        dataset_file_path = os.path.join(path,self.get_name()+"_temp_filter.hdf5")
+        new_dataset_file = h5py.File(dataset_file_path,"w")
+        self.dataset.copy("Metadata", new_dataset_file)
+        new_dataset_file.get("Metadata").attrs["datasetLength"] = len(example_indices)
+        new_dataset_file.get("Metadata").attrs["nbClass"] = len(np.unique(labels))
+        new_dataset_file.create_dataset("Labels", data=labels)
+        new_dataset_file.get("Labels").attrs["names"] = [label_name.encode()
+                                                         if not isinstance(label_name, bytes)
+                                                         else label_name
+                                                         for label_name in label_names]
+        view_names = self.init_view_names(view_names)
+        new_dataset_file.get("Metadata").attrs["nbView"] = len(view_names)
+        for new_index, view_name in enumerate(view_names):
+            self.copy_view(target_dataset=new_dataset_file,
+                           source_view_name=view_name,
+                           target_view_index=new_index,
+                           example_indices=example_indices)
+        new_dataset_file.close()
+        self.update_hdf5_dataset(dataset_file_path)
+
+    def add_gaussian_noise(self, random_state, path,
+                           noise_std=0.15):
+        """In this function, we add a Gaussian noise centered on 0 with the specified
+        std to each view, according to its range (the noise will be
+        multiplied by this range) and we crop the noisy signal according to the
+        view's attributes limits.
+        This is done by creating a new dataset, to keep clean data."""
+        noisy_dataset = h5py.File(path + self.get_name() + "_noised.hdf5", "w")
+        self.dataset.copy("Metadata", noisy_dataset)
+        self.dataset.copy("Labels", noisy_dataset)
+        for view_index in range(self.nb_view):
+            self.copy_view(target_dataset=noisy_dataset,
+                           source_view_name=self.get_view_name(view_index),
+                           target_view_index=view_index)
+        for view_index in range(noisy_dataset.get("Metadata").attrs["nbView"]):
+            view_key = "View" + str(view_index)
+            view_dset = noisy_dataset.get(view_key)
+            view_limits = self.dataset[
+                "Metadata/View" + str(view_index) + "_limits"].value
+            view_ranges = view_limits[:, 1] - view_limits[:, 0]
+            normal_dist = random_state.normal(0, noise_std, view_dset.value.shape)
+            noise = normal_dist * view_ranges
+            noised_data = view_dset.value + noise
+            noised_data = np.where(noised_data < view_limits[:, 0],
+                                   view_limits[:, 0], noised_data)
+            noised_data = np.where(noised_data > view_limits[:, 1],
+                                   view_limits[:, 1], noised_data)
+            noisy_dataset[view_key][...] = noised_data
+        noisy_dataset_path = noisy_dataset.filename
+        noisy_dataset.close()
+        self.update_hdf5_dataset(noisy_dataset_path)
+
+
+    # The following methods are hdf5 free
+
+    def select_views_and_labels(self, nb_labels=None,
+                                selected_label_names=None, random_state=None,
+                                view_names = None, path_for_new="../data/"):
+        if view_names is None and selected_label_names is None and nb_labels is None:
+            pass
+        else:
+            selected_label_names = self.check_selected_label_names(nb_labels,
+                                                                   selected_label_names,
+                                                                   random_state)
+            labels, label_names, example_indices = self.select_labels(selected_label_names)
+            self.filter(labels, label_names, example_indices, view_names, path_for_new)
+        labels_dictionary = dict(
+            (labelIndex, labelName) for labelIndex, labelName in
+            enumerate(self.get_label_names()))
+        return labels_dictionary
+
+    def get_name(self):
+        """Only works if there are no extra dots in the file name"""
+        return self.dataset.filename.split('/')[-1].split('.')[0]
+
+    def select_labels(self, selected_label_names):
+        selected_labels = [self.get_label_names().index(label_name.decode())
+                           if isinstance(label_name, bytes)
+                           else self.get_label_names().index(label_name)
+                           for label_name in selected_label_names]
+        selected_indices = np.array([index
+                                     for index, label in enumerate(self.get_labels())
+                                     if label in selected_labels])
+        labels = np.array([selected_labels.index(self.get_labels()[idx])
+                           for idx in selected_indices])
+        return labels, selected_label_names, selected_indices
+
+    def check_selected_label_names(self, nb_labels=None,
+                                   selected_label_names=None,
+                                   random_state=np.random.RandomState(42)):
+        if selected_label_names is None or nb_labels is None or len(selected_label_names) < nb_labels:
+            if selected_label_names is None:
+                nb_labels_to_add = nb_labels
+                selected_label_names = []
+            elif nb_labels is not None:
+                nb_labels_to_add = nb_labels - len(selected_label_names)
+            else:
+                nb_labels_to_add=0
+            labels_names_to_choose = [available_label_name
+                                      for available_label_name
+                                      in self.get_label_names()
+                                      if available_label_name
+                                      not in selected_label_names]
+            added_labels_names = random_state.choice(labels_names_to_choose,
+                                                     nb_labels_to_add,
+                                                     replace=False)
+            selected_label_names = list(selected_label_names) + list(
+                added_labels_names)
+        elif len(selected_label_names) > nb_labels:
+            selected_label_names = list(
+                random_state.choice(selected_label_names, nb_labels,
+                                    replace=False))
+
+        return selected_label_names
+
+
+

 def datasets_already_exist(pathF, name, nbCores):
@@ -135,30 +276,30 @@ def datasets_already_exist(pathF, name, nbCores):
             pathF + name + str(coreIndex) + ".hdf5")
     return allDatasetExist

-def get_v(dataset, view_index, used_indices=None):
-    """Used to extract a view as a numpy array or a sparse mat from the HDF5 dataset"""
-    if used_indices is None:
-        used_indices = range(dataset.get("Metadata").attrs["datasetLength"])
-    if type(used_indices) is int:
-        return dataset.get("View" + str(view_index))[used_indices, :]
-    else:
-        used_indices = np.array(used_indices)
-        sorted_indices = np.argsort(used_indices)
-        used_indices = used_indices[sorted_indices]
-
-        if not dataset.get("View" + str(view_index)).attrs["sparse"]:
-            return dataset.get("View" + str(view_index))[used_indices, :][
-                   np.argsort(sorted_indices), :]
-        else:
-            sparse_mat = sparse.csr_matrix(
-                (dataset.get("View" + str(view_index)).get("data").value,
-                 dataset.get("View" + str(view_index)).get("indices").value,
-                 dataset.get("View" + str(view_index)).get("indptr").value),
-                shape=dataset.get("View" + str(view_index)).attrs["shape"])[
-                         used_indices, :][
-                         np.argsort(sorted_indices), :]
-
-            return sparse_mat
+# def get_v(dataset, view_index, used_indices=None):
+# #     """Used to extract a view as a numpy array or a sparse mat from the HDF5 dataset"""
+# #     if used_indices is None:
+# #         used_indices = range(dataset.get("Metadata").attrs["datasetLength"])
+# #     if type(used_indices) is int:
+# #         return dataset.get("View" + str(view_index))[used_indices, :]
+# #     else:
+# #         used_indices = np.array(used_indices)
+# #         sorted_indices = np.argsort(used_indices)
+# #         used_indices = used_indices[sorted_indices]
+# #
+# #         if not dataset.get("View" + str(view_index)).attrs["sparse"]:
+# #             return dataset.get("View" + str(view_index))[used_indices, :][
+# #                    np.argsort(sorted_indices), :]
+# #         else:
+# #             sparse_mat = sparse.csr_matrix(
+# #                 (dataset.get("View" + str(view_index)).get("data").value,
+# #                  dataset.get("View" + str(view_index)).get("indices").value,
+# #                  dataset.get("View" + str(view_index)).get("indptr").value),
+# #                 shape=dataset.get("View" + str(view_index)).attrs["shape"])[
+# #                          used_indices, :][
+# #                          np.argsort(sorted_indices), :]
+# #
+# #             return sparse_mat

 def get_shape(dataset, view_index):

diff --git a/multiview_platform/mono_multi_view_classifiers/utils/execution.py b/multiview_platform/mono_multi_view_classifiers/utils/execution.py
index b965f810..7bc2d155 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/execution.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/execution.py
@@ -115,7 +115,8 @@ def get_database_function(name, type_var):
     return get_database


-def init_log_file(name, views, cl_type, log, debug, label, result_directory, add_noise, noise_std):
+def init_log_file(name, views, cl_type, log, debug, label,
+                  result_directory, add_noise, noise_std):
     r"""Used to init the directory where the preds will be stored and the log file.

     First this function will check if the result directory already exists (only one per minute is allowed).
@@ -147,6 +148,8 @@ def init_log_file(name, views, cl_type, log, debug, label, result_directory, add
     results_directory : string
         Reference to the main results directory for the benchmark.
     """
+    if views is None:
+        views=[]
     noise_string = "/n_"+str(int(noise_std*100))
     if debug:
         result_directory = result_directory + name + noise_string + \
@@ -256,31 +259,27 @@ def init_views(dataset_var, arg_views):
     all_views : list of strings
         Names of all the available views in the dataset.
     """
-    nb_view = dataset_var.get("Metadata").attrs["nbView"]
-    if arg_views != ["all"]:
+    nb_view = dataset_var.nb_view
+    if arg_views is not None:
         allowed_views = arg_views
-        all_views = [str(dataset_var.get("View" + str(view_index)).attrs["name"])
-                     if type(
-            dataset_var.get("View" + str(view_index)).attrs["name"]) != bytes
-                     else dataset_var.get("View" + str(view_index)).attrs[
-            "name"].decode("utf-8")
+        all_views = [str(dataset_var.get_view_name(view_index))
+                     if type(dataset_var.get_view_name(view_index)) != bytes
+                     else dataset_var.get_view_name(view_index).decode("utf-8")
                      for view_index in range(nb_view)]
         views = []
         views_indices = []
         for view_index in range(nb_view):
-            view_name = dataset_var.get("View" + str(view_index)).attrs["name"]
+            view_name = dataset_var.get_view_name(view_index)
             if type(view_name) == bytes:
                 view_name = view_name.decode("utf-8")
             if view_name in allowed_views:
                 views.append(view_name)
                 views_indices.append(view_index)
     else:
-        views = [str(dataset_var.get("View" + str(viewIndex)).attrs["name"])
-                 if type(
-            dataset_var.get("View" + str(viewIndex)).attrs["name"]) != bytes
-                 else dataset_var.get("View" + str(viewIndex)).attrs["name"].decode(
-            "utf-8")
-                 for viewIndex in range(nb_view)]
+        views = [str(dataset_var.get_view_name(view_index))
+                 if type(dataset_var.get_view_name(view_index)) != bytes
+                 else dataset_var.get_view_name(view_index).decode("utf-8")
+                 for view_index in range(nb_view)]
         views_indices = range(nb_view)
         all_views = views
     return views, views_indices, all_views

diff --git a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py
index be069e57..0aec7f32 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py
@@ -125,143 +125,144 @@ class DatasetError(Exception):
     def __init__(self, *args, **kwargs):
         Exception.__init__(self, *args, **kwargs)

-
-def get_classes(labels):
-    labels_set = set(list(labels))
-    nb_labels = len(labels_set)
-    if nb_labels >= 2:
-        return labels_set
-    else:
-        raise DatasetError("Dataset must have at least two different labels")
-
-
-def all_asked_labels_are_available(asked_labels_names_set,
-                                   available_labels_names):
-    for askedLabelName in asked_labels_names_set:
-        if askedLabelName in available_labels_names:
-            pass
-        else:
-            return False
-    return True
-
-
-def fill_label_names(nb_class, asked_labels_names, random_state,
-                     available_labels_names):
-    if len(asked_labels_names) < nb_class:
-        nb_labels_to_add = nb_class - len(asked_labels_names)
-        labels_names_to_choose = [available_label_name
-                                  for available_label_name
-                                  in available_labels_names
-                                  if available_label_name
-                                  not in asked_labels_names]
-        added_labels_names = random_state.choice(labels_names_to_choose,
-                                                 nb_labels_to_add, replace=False)
-        asked_labels_names = list(asked_labels_names) + list(added_labels_names)
-        asked_labels_names_set = set(asked_labels_names)
-
-    elif len(asked_labels_names) > nb_class:
-        asked_labels_names = list(
-            random_state.choice(asked_labels_names, nb_class, replace=False))
-        asked_labels_names_set = set(asked_labels_names)
-
-    else:
-        asked_labels_names_set = set(asked_labels_names)
-
-    return asked_labels_names, asked_labels_names_set
-
-
-def get_all_labels(full_labels, available_labels_names):
-    new_labels = full_labels
-    new_labels_names = available_labels_names
-    used_indices = np.arange(len(full_labels))
-    return new_labels, new_labels_names, used_indices
-
-
-def select_asked_labels(asked_labels_names_set, available_labels_names,
-                        asked_labels_names, full_labels):
-    if all_asked_labels_are_available(asked_labels_names_set, available_labels_names):
-        used_labels = [available_labels_names.index(asked_label_name) for
-                       asked_label_name in asked_labels_names]
-        used_indices = np.array(
-            [labelIndex for labelIndex, label in enumerate(full_labels) if
-             label in used_labels])
-        new_labels = np.array([used_labels.index(label) for label in full_labels if
-                               label in used_labels])
-        new_labels_names = [available_labels_names[usedLabel] for usedLabel in
-                            used_labels]
-        return new_labels, new_labels_names, used_indices
-    else:
-        raise DatasetError("Asked labels are not all available in the dataset")
-
-
-def filter_labels(labels_set, asked_labels_names_set, full_labels,
-                  available_labels_names, asked_labels_names):
-    if len(labels_set) > 2:
-        if asked_labels_names == available_labels_names:
-            new_labels, new_labels_names, used_indices = \
-                get_all_labels(full_labels, available_labels_names)
-        elif len(asked_labels_names_set) <= len(labels_set):
-            new_labels, new_labels_names, used_indices = select_asked_labels(
-                asked_labels_names_set, available_labels_names,
-                asked_labels_names, full_labels)
-        else:
-            raise DatasetError(
-                "Asked more labels than available in the dataset. Available labels are : " +
-                ", ".join(available_labels_names))
-
-    else:
-        new_labels, new_labels_names, used_indices = get_all_labels(full_labels,
-                                                                    available_labels_names)
-    return new_labels, new_labels_names, used_indices
-
-
-def filter_views(dataset_file, temp_dataset, views, used_indices):
-    new_view_index = 0
-    if views == [""]:
-        for view_index in range(dataset_file.get("Metadata").attrs["nbView"]):
-            copyhdf5_dataset(dataset_file, temp_dataset, "View" + str(view_index),
-                             "View" + str(view_index), used_indices)
-    else:
-        for asked_view_name in views:
-            for view_index in range(dataset_file.get("Metadata").attrs["nbView"]):
-                view_name = dataset_file.get("View" + str(view_index)).attrs["name"]
-                if type(view_name) == bytes:
-                    view_name = view_name.decode("utf-8")
-                if view_name == asked_view_name:
-                    copyhdf5_dataset(dataset_file, temp_dataset,
-                                     "View" + str(view_index),
-                                     "View" + str(new_view_index), used_indices)
-                    new_view_name = \
-                        temp_dataset.get("View" + str(new_view_index)).attrs["name"]
-                    if type(new_view_name) == bytes:
-                        temp_dataset.get("View" + str(new_view_index)).attrs[
-                            "name"] = new_view_name.decode("utf-8")
-
-                    new_view_index += 1
-                else:
-                    pass
-        temp_dataset.get("Metadata").attrs["nbView"] = len(views)
-
-
-def copyhdf5_dataset(source_data_file, destination_data_file, source_dataset_name,
-                     destination_dataset_name, used_indices):
-    """Used to copy a view in a new dataset file using only the examples of
-    usedIndices, and copying the args"""
-    new_d_set = destination_data_file.create_dataset(destination_dataset_name,
-                                                     data=source_data_file.get(
-                                                         source_dataset_name).value[
-                                                         used_indices, :])
-    if "sparse" in source_data_file.get(source_dataset_name).attrs.keys() and \
-            source_data_file.get(source_dataset_name).attrs["sparse"]:
-        # TODO : Support sparse
-        pass
-    else:
-        for key, value in source_data_file.get(source_dataset_name).attrs.items():
-            new_d_set.attrs[key] = value
+#
+# def get_classes(labels):
+#     labels_set = set(list(labels))
+#     nb_labels = len(labels_set)
+#     if nb_labels >= 2:
+#         return labels_set
+#     else:
+#         raise DatasetError("Dataset must have at least two different labels")
+#
+#
+# def all_asked_labels_are_available(asked_labels_names_set,
+#                                    available_labels_names):
+#     for askedLabelName in asked_labels_names_set:
+#         if askedLabelName in available_labels_names:
+#             pass
+#         else:
+#             return False
+#     return True
+#
+#
+# def fill_label_names(nb_labels, selected_label_names, random_state,
+#                      available_labels_names):
+#     if len(selected_label_names) < nb_labels:
+#         nb_labels_to_add = nb_labels - len(selected_label_names)
+#         labels_names_to_choose = [available_label_name
+#                                   for available_label_name
+#                                   in available_labels_names
+#                                   if available_label_name
+#                                   not in selected_label_names]
+#         added_labels_names = random_state.choice(labels_names_to_choose,
+#                                                  nb_labels_to_add, replace=False)
+#         selected_label_names = list(selected_label_names) + list(added_labels_names)
+#         asked_labels_names_set = set(selected_label_names)
+#
+#     elif len(selected_label_names) > nb_labels:
+#         selected_label_names = list(
+#             random_state.choice(selected_label_names, nb_labels, replace=False))
+#         asked_labels_names_set = set(selected_label_names)
+#
+#     else:
+#         asked_labels_names_set = set(selected_label_names)
+#
+#     return selected_label_names, asked_labels_names_set
+#
+#
+# def get_all_labels(full_labels, available_labels_names):
+#     new_labels = full_labels
+#     new_labels_names = available_labels_names
+#     used_indices = np.arange(len(full_labels))
+#     return new_labels, new_labels_names, used_indices
+#
+#
+# def select_asked_labels(asked_labels_names_set, available_labels_names,
+#                         asked_labels_names, full_labels):
+#     if all_asked_labels_are_available(asked_labels_names_set, available_labels_names):
+#         used_labels = [available_labels_names.index(asked_label_name) for
+#                        asked_label_name in asked_labels_names]
+#         used_indices = np.array(
+#             [labelIndex for labelIndex, label in enumerate(full_labels) if
+#              label in used_labels])
+#         new_labels = np.array([used_labels.index(label) for label in full_labels if
+#                                label in used_labels])
+#         new_labels_names = [available_labels_names[usedLabel] for usedLabel in
+#                             used_labels]
+#         return new_labels, new_labels_names, used_indices
+#     else:
+#         raise DatasetError("Asked labels are not all available in the dataset")
+#
+#
+# def filter_labels(labels_set, asked_labels_names_set, full_labels,
+#                   available_labels_names, asked_labels_names):
+#     if len(labels_set) > 2:
+#         if asked_labels_names == available_labels_names:
+#             new_labels, new_labels_names, used_indices = \
+#                 get_all_labels(full_labels, available_labels_names)
+#         elif len(asked_labels_names_set) <= len(labels_set):
+#             new_labels, new_labels_names, used_indices = select_asked_labels(
+#                 asked_labels_names_set, available_labels_names,
+#                 asked_labels_names, full_labels)
+#         else:
+#             raise DatasetError(
+#                 "Asked more labels than available in the dataset. Available labels are : " +
+#                 ", ".join(available_labels_names))
+#
+#     else:
+#         new_labels, new_labels_names, used_indices = get_all_labels(full_labels,
+#                                                                     available_labels_names)
+#     return new_labels, new_labels_names, used_indices
+#
+#
+# def filter_views(dataset_file, temp_dataset, views, used_indices):
+#     new_view_index = 0
+#     if views == [""]:
+#         for view_index in range(dataset_file.get("Metadata").attrs["nbView"]):
+#             copyhdf5_dataset(dataset_file, temp_dataset, "View" + str(view_index),
+#                              "View" + str(view_index), used_indices)
+#     else:
+#         for asked_view_name in views:
+#             for view_index in range(dataset_file.get("Metadata").attrs["nbView"]):
+#                 view_name = dataset_file.get("View" + str(view_index)).attrs["name"]
+#                 if type(view_name) == bytes:
+#                     view_name = view_name.decode("utf-8")
+#                 if view_name == asked_view_name:
+#                     copyhdf5_dataset(dataset_file, temp_dataset,
+#                                      "View" + str(view_index),
+#                                      "View" + str(new_view_index), used_indices)
+#                     new_view_name = \
+#                         temp_dataset.get("View" + str(new_view_index)).attrs["name"]
+#                     if type(new_view_name) == bytes:
+#                         temp_dataset.get("View" + str(new_view_index)).attrs[
+#                             "name"] = new_view_name.decode("utf-8")
+#
+#                     new_view_index += 1
+#                 else:
+#                     pass
+#         temp_dataset.get("Metadata").attrs["nbView"] = len(views)
+#
+#
+# def copyhdf5_dataset(source_data_file, destination_data_file, source_dataset_name,
+#                      destination_dataset_name, used_indices):
+#     """Used to copy a view in a new dataset file using only the examples of
+#     usedIndices, and copying the args"""
+#     new_d_set = destination_data_file.create_dataset(destination_dataset_name,
+#                                                      data=source_data_file.get(
+#                                                          source_dataset_name).value[
+#                                                          used_indices, :])
+#     if "sparse" in source_data_file.get(source_dataset_name).attrs.keys() and \
+#             source_data_file.get(source_dataset_name).attrs["sparse"]:
+#         # TODO : Support sparse
+#         pass
+#     else:
+#         for key, value in source_data_file.get(source_dataset_name).attrs.items():
+#             new_d_set.attrs[key] = value


 def get_classic_db_hdf5(views, path_f, name_DB, nb_class, asked_labels_names,
-                        random_state, full=False, add_noise=False, noise_std=0.15,):
+                        random_state, full=False, add_noise=False, noise_std=0.15,
+                        path_for_new="../data/"):
     """Used to load a hdf5 database"""
     if full:
         dataset_file = h5py.File(path_f + name_DB + ".hdf5", "r")
@@ -269,88 +270,62 @@ def get_classic_db_hdf5(views, path_f, name_DB, nb_class, asked_labels_names,
         dataset_name = name_DB
         labels_dictionary = dict((label_index, label_name)
                                  for label_index, label_name
-                                 in dataset.get_label_names())
+                                 in enumerate(dataset.get_label_names()))
     else:
-        asked_labels_names = [asked_label_name.encode("utf8") for asked_label_name in
-                              asked_labels_names]
-        base_dataset_file = h5py.File(path_f + name_DB + ".hdf5", "r")
-        full_labels = base_dataset_file.get("Labels").value
-        dataset_file = h5py.File(path_f + name_DB + "_temp_view_label_select.hdf5",
-                                 "w")
-        dataset_name = name_DB + "_temp_view_label_select"
-        base_dataset_file.copy("Metadata", dataset_file)
-        labels_set = get_classes(full_labels)
-        available_labels_names = list(
-            base_dataset_file.get("Labels").attrs["names"])
-        asked_labels_names, asked_labels_names_set = fill_label_names(nb_class,
-                                                                      asked_labels_names,
-                                                                      random_state,
-                                                                      available_labels_names)
-
-        new_labels, new_labels_names, used_indices = filter_labels(labels_set,
-                                                                   asked_labels_names_set,
-                                                                   full_labels,
-                                                                   available_labels_names,
-                                                                   asked_labels_names)
-        dataset_file.get("Metadata").attrs["datasetLength"] = len(used_indices)
-        dataset_file.get("Metadata").attrs["nbClass"] = nb_class
-        dataset_file.create_dataset("Labels", data=new_labels)
-        dataset_file.get("Labels").attrs["names"] = new_labels_names
-        filter_views(base_dataset_file, dataset_file, views, used_indices)
+        dataset_file = h5py.File(path_f + name_DB + ".hdf5", "r")
+        dataset = Dataset(hdf5_file=dataset_file)
+        labels_dictionary = dataset.select_views_and_labels(nb_labels=nb_class,
+                                                            selected_label_names=asked_labels_names,
+                                                            view_names=views, random_state=random_state,
+                                                            path_for_new=path_for_new)
+        dataset_name = dataset.get_name()

-        labels_dictionary = dict(
-            (labelIndex, labelName.decode("utf-8")) for labelIndex, labelName in
-            enumerate(dataset_file.get("Labels").attrs["names"]))
-        dataset_file.close()
-        dataset_file = h5py.File(path_f + name_DB + "_temp_view_label_select.hdf5",
-                                 "r")
     if add_noise:
-        dataset_file, dataset_name = add_gaussian_noise(dataset_file, random_state,
-                                                        path_f, dataset_name,
-                                                        noise_std)
+        dataset.add_gaussian_noise(random_state, path_for_new, noise_std)
+        dataset_name = dataset.get_name()
     else:
         pass
     return dataset, labels_dictionary, dataset_name

-
-def add_gaussian_noise(dataset_file, random_state, path_f, dataset_name,
-                       noise_std=0.15):
-    """In this function, we add a guaussian noise centered in 0 with specified
-    std to each view, according to it's range (the noise will be
-    mutliplied by this range) and we crop the noisy signal according to the
-    view's attributes limits.
-    This is done by creating a new dataset, to keep clean data."""
-    noisy_dataset = h5py.File(path_f + dataset_name + "_noised.hdf5", "w")
-    dataset_file.copy("Metadata", noisy_dataset)
-    dataset_file.copy("Labels", noisy_dataset)
-    for view_index in range(dataset_file.get("Metadata").attrs["nbView"]):
-        dataset_file.copy("View" + str(view_index), noisy_dataset)
-    for view_index in range(noisy_dataset.get("Metadata").attrs["nbView"]):
-        view_name = "View" + str(view_index)
-        view_dset = noisy_dataset.get(view_name)
-        view_limits = dataset_file[
-            "Metadata/View" + str(view_index) + "_limits"].value
-        view_ranges = view_limits[:, 1] - view_limits[:, 0]
-        normal_dist = random_state.normal(0, noise_std, view_dset.value.shape)
-        noise = normal_dist * view_ranges
-        noised_data = view_dset.value + noise
-        noised_data = np.where(noised_data < view_limits[:, 0],
-                               view_limits[:, 0], noised_data)
-        noised_data = np.where(noised_data > view_limits[:, 1],
-                               view_limits[:, 1], noised_data)
-        noisy_dataset[view_name][...] = noised_data
-    original_dataset_filename = dataset_file.filename
-    dataset_file.close()
-    noisy_dataset.close()
-    noisy_dataset = h5py.File(path_f + dataset_name + "_noised.hdf5", "r")
-    if "_temp_" in original_dataset_filename:
-        os.remove(original_dataset_filename)
-    return noisy_dataset, dataset_name + "_noised"
+#
+# def add_gaussian_noise(dataset_file, random_state, path_f, dataset_name,
+#                        noise_std=0.15):
+#     """In this function, we add a Gaussian noise centered on 0 with the specified
+#     std to each view, according to its range (the noise will be
+#     multiplied by this range) and we crop the noisy signal according to the
+#     view's attributes limits.
+#     This is done by creating a new dataset, to keep clean data."""
+#     noisy_dataset = h5py.File(path_f + dataset_name + "_noised.hdf5", "w")
+#     dataset_file.copy("Metadata", noisy_dataset)
+#     dataset_file.copy("Labels", noisy_dataset)
+#     for view_index in range(dataset_file.get("Metadata").attrs["nbView"]):
+#         dataset_file.copy("View" + str(view_index), noisy_dataset)
+#     for view_index in range(noisy_dataset.get("Metadata").attrs["nbView"]):
+#         view_name = "View" + str(view_index)
+#         view_dset = noisy_dataset.get(view_name)
+#         view_limits = dataset_file[
+#             "Metadata/View" + str(view_index) + "_limits"].value
+#         view_ranges = view_limits[:, 1] - view_limits[:, 0]
+#         normal_dist = random_state.normal(0, noise_std, view_dset.value.shape)
+#         noise = normal_dist * view_ranges
+#         noised_data = view_dset.value + noise
+#         noised_data = np.where(noised_data < view_limits[:, 0],
+#                                view_limits[:, 0], noised_data)
+#         noised_data = np.where(noised_data > view_limits[:, 1],
+#                                view_limits[:, 1], noised_data)
+#         noisy_dataset[view_name][...] = noised_data
+#     original_dataset_filename = dataset_file.filename
+#     dataset_file.close()
+#     noisy_dataset.close()
+#     noisy_dataset = h5py.File(path_f + dataset_name + "_noised.hdf5", "r")
+#     if "_temp_" in original_dataset_filename:
+#         os.remove(original_dataset_filename)
+#     return noisy_dataset, dataset_name + "_noised"


 def get_classic_db_csv(views, pathF, nameDB, NB_CLASS, askedLabelsNames,
                        random_state, full=False, add_noise=False, noise_std=0.15,
-                       delimiter=","):
+                       delimiter=",", path_for_new="../data/"):
     # TODO : Update this one
     labels_names = np.genfromtxt(pathF + nameDB + "-labels-names.csv",
                                  dtype='str', delimiter=delimiter)
@@ -380,7 +355,8 @@ def get_classic_db_csv(views, pathF, nameDB, NB_CLASS, askedLabelsNames,
     datasetFile.close()
     datasetFile, labelsDictionary, dataset_name = get_classic_db_hdf5(views, pathF, nameDB,
                                                                       NB_CLASS, askedLabelsNames,
-                                                                      random_state, full)
+                                                                      random_state, full,
+                                                                      path_for_new=path_for_new)

     return datasetFile, labelsDictionary, dataset_name

diff --git a/multiview_platform/tests/test_ExecClassif.py b/multiview_platform/tests/test_ExecClassif.py
index a0c11c76..bbc966cf 100644
--- a/multiview_platform/tests/test_ExecClassif.py
+++ b/multiview_platform/tests/test_ExecClassif.py
@@ -4,7 +4,7 @@ import unittest
 import h5py
 import numpy as np

-from .utils import rm_tmp, tmp_path
+from .utils import rm_tmp, tmp_path, test_dataset

 from ..mono_multi_view_classifiers import exec_classif

@@ -232,11 +232,8 @@ class Test_execBenchmark(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         rm_tmp()
-        os.mkdir("multiview_platform/tests/tmp_tests")
-        cls.Dataset = h5py.File(
-            tmp_path+"test_file.hdf5", "w")
-        cls.labels = cls.Dataset.create_dataset("Labels",
-                                                data=np.array([0, 1, 2]))
+        os.mkdir(tmp_path)
+        cls.Dataset = test_dataset
         cls.argument_dictionaries = [{"a": 4, "args": {}}]
         cls.args = {
             "Base":{"name": "chicken_is_heaven", "type": "type", "pathf": "pathF"},
@@ -294,12 +291,7 @@ class Test_execBenchmark(unittest.TestCase):

     @classmethod
     def tearDownClass(cls):
-        cls.Dataset.close()
-        path = tmp_path
-        for file_name in os.listdir(path):
-            os.remove(os.path.join(path, file_name))
-        os.rmdir(path)
-
+        rm_tmp()

 def fakeExecMono(directory, name, labels_names, classification_indices, k_folds,
                  coreIndex, type, pathF, random_state, labels,

diff --git a/multiview_platform/tests/test_multiview_classifiers/test_weighted_linear_early_fusion.py b/multiview_platform/tests/test_multiview_classifiers/test_weighted_linear_early_fusion.py
index 3c274c28..65541a03 100644
--- a/multiview_platform/tests/test_multiview_classifiers/test_weighted_linear_early_fusion.py
+++ b/multiview_platform/tests/test_multiview_classifiers/test_weighted_linear_early_fusion.py
@@ -1,10 +1,9 @@
 import unittest
 import numpy as np
-import h5py
 import os

-from ..utils import rm_tmp, tmp_path
+from ..utils import rm_tmp, tmp_path, test_dataset

 from multiview_platform.mono_multi_view_classifiers.multiview_classifiers import \
     weighted_linear_early_fusion

@@ -16,34 +15,17 @@ class Test_WeightedLinearEarlyFusion(unittest.TestCase):
         rm_tmp()
         cls.random_state = np.random.RandomState(42)
         cls.view_weights = [0.5, 0.5]
-        os.mkdir("multiview_platform/tests/tmp_tests")
-        cls.dataset_file = h5py.File(
-            tmp_path+"test_file.hdf5", "w")
-        cls.labels = cls.dataset_file.create_dataset("Labels",
-                                                     data=np.array([0, 1, 0, 0, 1]))
-        cls.view0_data = cls.random_state.randint(1,10,size=(5, 4))
-        view0 = cls.dataset_file.create_dataset("View0", data=cls.view0_data)
-        view0.attrs["sparse"] = False
-        cls.view1_data = cls.random_state.randint(1, 10, size=(5, 4))
-        view1 = cls.dataset_file.create_dataset("View1", data=cls.view1_data)
-        view1.attrs["sparse"] = False
-        metaDataGrp = cls.dataset_file.create_group("Metadata")
-        metaDataGrp.attrs["nbView"] = 2
-        metaDataGrp.attrs["nbClass"] = 2
-        metaDataGrp.attrs["datasetLength"] = 5
         cls.monoview_classifier_name = "decision_tree"
         cls.monoview_classifier_config = {"max_depth":1, "criterion": "gini",
                                           "splitter": "best"}
         cls.classifier = weighted_linear_early_fusion.WeightedLinearEarlyFusion(
             random_state=cls.random_state, view_weights=cls.view_weights,
             monoview_classifier_name=cls.monoview_classifier_name,
             monoview_classifier_config=cls.monoview_classifier_config)
+        cls.dataset = test_dataset

     @classmethod
     def tearDownClass(cls):
-        cls.dataset_file.close()
-        for file_name in os.listdir("multiview_platform/tests/tmp_tests"):
-            os.remove(os.path.join("multiview_platform/tests/tmp_tests", file_name))
-        os.rmdir("multiview_platform/tests/tmp_tests")
+        rm_tmp()

     def test_simple(self):
         np.testing.assert_array_equal(self.view_weights, self.classifier.view_weights)
@@ -51,37 +33,35 @@ class Test_WeightedLinearEarlyFusion(unittest.TestCase):
     def test_fit(self):
         self.assertRaises(AttributeError, getattr,
                           self.classifier.monoview_classifier, "classes_")
-        self.classifier.fit(self.dataset_file, self.labels, None, None)
+        self.classifier.fit(self.dataset, test_dataset.get_labels(), None, None)
         np.testing.assert_array_equal(self.classifier.monoview_classifier.classes_,
                                       np.array([0,1]))

     def test_predict(self):
-        self.classifier.fit(self.dataset_file, self.labels, None, None)
-        predicted_labels = self.classifier.predict(self.dataset_file, None, None)
-        np.testing.assert_array_equal(predicted_labels, self.labels)
+        self.classifier.fit(self.dataset, test_dataset.get_labels(), None, None)
+        predicted_labels = self.classifier.predict(self.dataset, None, None)
+        np.testing.assert_array_equal(predicted_labels, test_dataset.get_labels())

     def test_transform_data_to_monoview_simple(self):
-
-
-        example_indices, X = self.classifier.transform_data_to_monoview(self.dataset_file,
+        example_indices, X = self.classifier.transform_data_to_monoview(self.dataset,
                                                                         None, None)
-        self.assertEqual(X.shape, (5,8))
-        np.testing.assert_array_equal(X, np.concatenate((self.view0_data, self.view1_data), axis=1))
+        self.assertEqual(X.shape, (5,12))
+        np.testing.assert_array_equal(X, np.concatenate((self.dataset.get_v(0), self.dataset.get_v(1)), axis=1))
         np.testing.assert_array_equal(example_indices, np.arange(5))

     def test_transform_data_to_monoview_view_select(self):
         example_indices, X = self.classifier.transform_data_to_monoview(
-            self.dataset_file,
+            self.dataset,
             None, np.array([0]))
-        self.assertEqual(X.shape, (5, 4))
-        np.testing.assert_array_equal(X, self.view0_data)
+        self.assertEqual(X.shape, (5, 6))
+        np.testing.assert_array_equal(X, self.dataset.get_v(0))
         np.testing.assert_array_equal(example_indices, np.arange(5))

-    def test_transform_data_to_monoview_view_select(self):
+    def test_transform_data_to_monoview_example_view_select(self):
         example_indices, X = self.classifier.transform_data_to_monoview(
-            self.dataset_file,
+            self.dataset,
             np.array([1,2,3]), np.array([0]))
-        self.assertEqual(X.shape, (3, 4))
-        np.testing.assert_array_equal(X, self.view0_data[np.array([1,2,3]), :])
+        self.assertEqual(X.shape, (3, 6))
+        np.testing.assert_array_equal(X, self.dataset.get_v(0)[np.array([1,2,3]), :])
         np.testing.assert_array_equal(example_indices, np.array([1,2,3]))
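Both test files above now share a `test_dataset` fixture imported from multiview_platform/tests/utils.py; that file's diff is not included in this patch excerpt. A sketch of what the fixture plausibly looks like, inferred from the assertions above (five examples, two views of shape (5, 6), binary labels); the exact data is an assumption:

import numpy as np
from multiview_platform.mono_multi_view_classifiers.utils.dataset import Dataset

tmp_path = "multiview_platform/tests/tmp_tests/"
random_state = np.random.RandomState(42)

# Two random views of shape (5, 6) and binary labels, wrapped by the new
# Dataset class (which writes the backing HDF5 file itself).
test_dataset = Dataset(views=[random_state.randint(1, 10, size=(5, 6))
                              for _ in range(2)],
                       labels=np.array([0, 1, 0, 0, 1]),
                       path=tmp_path,
                       file_name="test_dataset.hdf5")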
a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py b/multiview_platform/tests/test_utils/test_GetMultiviewDB.py index 862556e2..00e33b51 100644 --- a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py +++ b/multiview_platform/tests/test_utils/test_GetMultiviewDB.py @@ -5,570 +5,443 @@ import h5py import numpy as np from ...mono_multi_view_classifiers.utils import get_multiview_db -from ..utils import rm_tmp, tmp_path - - -class Test_copyhdf5Dataset(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - cls.random_state = np.random.RandomState(42) - if not os.path.exists("multiview_platform/tests/tmp_tests"): - os.mkdir("multiview_platform/tests/tmp_tests") - cls.dataset_file = h5py.File( - tmp_path+"test_copy.hdf5", "w") - cls.dataset = cls.dataset_file.create_dataset("test", - data=cls.random_state.randint( - 0, 100, (10, 20))) - cls.dataset.attrs["test_arg"] = "Am I copied" - - def test_simple_copy(cls): - get_multiview_db.copyhdf5_dataset(cls.dataset_file, cls.dataset_file, - "test", "test_copy_1", np.arange(10)) - np.testing.assert_array_equal(cls.dataset_file.get("test").value, - cls.dataset_file.get("test_copy_1").value) - cls.assertEqual("Am I copied", - cls.dataset_file.get("test_copy_1").attrs["test_arg"]) - - def test_copy_only_some_indices(cls): - usedIndices = cls.random_state.choice(10, 6, replace=False) - get_multiview_db.copyhdf5_dataset(cls.dataset_file, cls.dataset_file, - "test", "test_copy", usedIndices) - np.testing.assert_array_equal( - cls.dataset_file.get("test").value[usedIndices, :], - cls.dataset_file.get("test_copy").value) - cls.assertEqual("Am I copied", - cls.dataset_file.get("test_copy").attrs["test_arg"]) - - @classmethod - def tearDownClass(cls): - os.remove(tmp_path+"test_copy.hdf5") - os.rmdir("multiview_platform/tests/tmp_tests") - - -class Test_filterViews(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - cls.random_state = np.random.RandomState(42) - cls.views = ["test_view_1", "test_view_2"] - if not os.path.exists("multiview_platform/tests/tmp_tests"): - os.mkdir("multiview_platform/tests/tmp_tests") - cls.dataset_file = h5py.File( - tmp_path+"test_copy.hdf5", "w") - cls.metadata_group = cls.dataset_file.create_group("Metadata") - cls.metadata_group.attrs["nbView"] = 4 - - for i in range(4): - cls.dataset = cls.dataset_file.create_dataset("View" + str(i), - data=cls.random_state.randint( - 0, 100, (10, 20))) - cls.dataset.attrs["name"] = "test_view_" + str(i) - - def test_simple_filter(cls): - cls.temp_dataset_file = h5py.File( - tmp_path+"test_copy_temp.hdf5", "w") - cls.dataset_file.copy("Metadata", cls.temp_dataset_file) - get_multiview_db.filter_views(cls.dataset_file, cls.temp_dataset_file, - cls.views, np.arange(10)) - cls.assertEqual(cls.dataset_file.get("View1").attrs["name"], - cls.temp_dataset_file.get("View0").attrs["name"]) - np.testing.assert_array_equal(cls.dataset_file.get("View2").value, - cls.temp_dataset_file.get("View1").value) - cls.assertEqual(cls.temp_dataset_file.get("Metadata").attrs["nbView"], - 2) - - def test_filter_view_and_examples(cls): - cls.temp_dataset_file = h5py.File( - tmp_path+"test_copy_temp.hdf5", "w") - cls.dataset_file.copy("Metadata", cls.temp_dataset_file) - usedIndices = cls.random_state.choice(10, 6, replace=False) - get_multiview_db.filter_views(cls.dataset_file, cls.temp_dataset_file, - cls.views, usedIndices) - np.testing.assert_array_equal( - cls.dataset_file.get("View1").value[usedIndices, :], - cls.temp_dataset_file.get("View0").value) - 
cls.temp_dataset_file.close() - - @classmethod - def tearDownClass(cls): - os.remove(tmp_path+"test_copy.hdf5") - os.remove(tmp_path+"test_copy_temp.hdf5") - os.rmdir("multiview_platform/tests/tmp_tests") +from ..utils import rm_tmp, tmp_path, test_dataset +# class Test_copyhdf5Dataset(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# rm_tmp() +# cls.random_state = np.random.RandomState(42) +# if not os.path.exists("multiview_platform/tests/tmp_tests"): +# os.mkdir("multiview_platform/tests/tmp_tests") +# cls.dataset_file = h5py.File( +# tmp_path+"test_copy.hdf5", "w") +# cls.dataset = cls.dataset_file.create_dataset("test", +# data=cls.random_state.randint( +# 0, 100, (10, 20))) +# cls.dataset.attrs["test_arg"] = "Am I copied" +# +# def test_simple_copy(cls): +# get_multiview_db.copyhdf5_dataset(cls.dataset_file, cls.dataset_file, +# "test", "test_copy_1", np.arange(10)) +# np.testing.assert_array_equal(cls.dataset_file.get("test").value, +# cls.dataset_file.get("test_copy_1").value) +# cls.assertEqual("Am I copied", +# cls.dataset_file.get("test_copy_1").attrs["test_arg"]) +# +# def test_copy_only_some_indices(cls): +# usedIndices = cls.random_state.choice(10, 6, replace=False) +# get_multiview_db.copyhdf5_dataset(cls.dataset_file, cls.dataset_file, +# "test", "test_copy", usedIndices) +# np.testing.assert_array_equal( +# cls.dataset_file.get("test").value[usedIndices, :], +# cls.dataset_file.get("test_copy").value) +# cls.assertEqual("Am I copied", +# cls.dataset_file.get("test_copy").attrs["test_arg"]) +# +# @classmethod +# def tearDownClass(cls): +# os.remove(tmp_path+"test_copy.hdf5") +# os.rmdir("multiview_platform/tests/tmp_tests") +# +# +# class Test_filterViews(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# rm_tmp() +# cls.random_state = np.random.RandomState(42) +# cls.views = ["test_view_1", "test_view_2"] +# if not os.path.exists("multiview_platform/tests/tmp_tests"): +# os.mkdir("multiview_platform/tests/tmp_tests") +# cls.dataset_file = h5py.File( +# tmp_path+"test_copy.hdf5", "w") +# cls.metadata_group = cls.dataset_file.create_group("Metadata") +# cls.metadata_group.attrs["nbView"] = 4 +# +# for i in range(4): +# cls.dataset = cls.dataset_file.create_dataset("View" + str(i), +# data=cls.random_state.randint( +# 0, 100, (10, 20))) +# cls.dataset.attrs["name"] = "test_view_" + str(i) +# +# def test_simple_filter(cls): +# cls.temp_dataset_file = h5py.File( +# tmp_path+"test_copy_temp.hdf5", "w") +# cls.dataset_file.copy("Metadata", cls.temp_dataset_file) +# get_multiview_db.filter_views(cls.dataset_file, cls.temp_dataset_file, +# cls.views, np.arange(10)) +# cls.assertEqual(cls.dataset_file.get("View1").attrs["name"], +# cls.temp_dataset_file.get("View0").attrs["name"]) +# np.testing.assert_array_equal(cls.dataset_file.get("View2").value, +# cls.temp_dataset_file.get("View1").value) +# cls.assertEqual(cls.temp_dataset_file.get("Metadata").attrs["nbView"], +# 2) +# +# def test_filter_view_and_examples(cls): +# cls.temp_dataset_file = h5py.File( +# tmp_path+"test_copy_temp.hdf5", "w") +# cls.dataset_file.copy("Metadata", cls.temp_dataset_file) +# usedIndices = cls.random_state.choice(10, 6, replace=False) +# get_multiview_db.filter_views(cls.dataset_file, cls.temp_dataset_file, +# cls.views, usedIndices) +# np.testing.assert_array_equal( +# cls.dataset_file.get("View1").value[usedIndices, :], +# cls.temp_dataset_file.get("View0").value) +# cls.temp_dataset_file.close() +# +# @classmethod +# def tearDownClass(cls): +# 
os.remove(tmp_path+"test_copy.hdf5") +# os.remove(tmp_path+"test_copy_temp.hdf5") +# os.rmdir("multiview_platform/tests/tmp_tests") +# +# +# # +# class Test_filterLabels(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.random_state = np.random.RandomState(42) +# cls.labelsSet = set(range(4)) +# cls.askedLabelsNamesSet = {"test_label_1", "test_label_3"} +# cls.fullLabels = cls.random_state.randint(0, 4, 10) +# cls.availableLabelsNames = ["test_label_0", "test_label_1", +# "test_label_2", "test_label_3"] +# cls.askedLabelsNames = ["test_label_1", "test_label_3"] +# +# def test_simple(cls): +# newLabels, \ +# newLabelsNames, \ +# usedIndices = get_multiview_db.filter_labels(cls.labelsSet, +# cls.askedLabelsNamesSet, +# cls.fullLabels, +# cls.availableLabelsNames, +# cls.askedLabelsNames) +# cls.assertEqual(["test_label_1", "test_label_3"], newLabelsNames) +# np.testing.assert_array_equal(usedIndices, np.array([1, 5, 9])) +# np.testing.assert_array_equal(newLabels, np.array([1, 1, 0])) +# +# def test_biclasse(cls): +# cls.labelsSet = {0, 1} +# cls.fullLabels = cls.random_state.randint(0, 2, 10) +# cls.availableLabelsNames = ["test_label_0", "test_label_1"] +# newLabels, \ +# newLabelsNames, \ +# usedIndices = get_multiview_db.filter_labels(cls.labelsSet, +# cls.askedLabelsNamesSet, +# cls.fullLabels, +# cls.availableLabelsNames, +# cls.askedLabelsNames) +# cls.assertEqual(cls.availableLabelsNames, newLabelsNames) +# np.testing.assert_array_equal(usedIndices, np.arange(10)) +# np.testing.assert_array_equal(newLabels, cls.fullLabels) +# +# def test_asked_too_many_labels(cls): +# cls.askedLabelsNamesSet = {"test_label_0", "test_label_1", +# "test_label_2", "test_label_3", +# "chicken_is_heaven"} +# with cls.assertRaises(get_multiview_db.DatasetError) as catcher: +# get_multiview_db.filter_labels(cls.labelsSet, +# cls.askedLabelsNamesSet, +# cls.fullLabels, +# cls.availableLabelsNames, +# cls.askedLabelsNames) +# exception = catcher.exception +# +# def test_asked_all_labels(cls): +# cls.askedLabelsNamesSet = {"test_label_0", "test_label_1", +# "test_label_2", "test_label_3"} +# cls.askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", +# "test_label_3"] +# newLabels, \ +# newLabelsNames, \ +# usedIndices = get_multiview_db.filter_labels(cls.labelsSet, +# cls.askedLabelsNamesSet, +# cls.fullLabels, +# cls.availableLabelsNames, +# cls.askedLabelsNames) +# cls.assertEqual(cls.availableLabelsNames, newLabelsNames) +# np.testing.assert_array_equal(usedIndices, np.arange(10)) +# np.testing.assert_array_equal(newLabels, cls.fullLabels) +# +# +# class Test_selectAskedLabels(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.random_state = np.random.RandomState(42) +# cls.askedLabelsNamesSet = {"test_label_1", "test_label_3"} +# cls.fullLabels = cls.random_state.randint(0, 4, 10) +# cls.availableLabelsNames = ["test_label_0", "test_label_1", +# "test_label_2", "test_label_3"] +# cls.askedLabelsNames = ["test_label_1", "test_label_3"] +# +# def test_simple(cls): +# newLabels, \ +# newLabelsNames, \ +# usedIndices = get_multiview_db.select_asked_labels(cls.askedLabelsNamesSet, +# cls.availableLabelsNames, +# cls.askedLabelsNames, +# cls.fullLabels) +# cls.assertEqual(["test_label_1", "test_label_3"], newLabelsNames) +# np.testing.assert_array_equal(usedIndices, np.array([1, 5, 9])) +# np.testing.assert_array_equal(newLabels, np.array([1, 1, 0])) +# +# def test_asked_all_labels(cls): +# cls.askedLabelsNamesSet = {"test_label_0", "test_label_1", +# 
"test_label_2", "test_label_3"} +# cls.askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", +# "test_label_3"] +# newLabels, \ +# newLabelsNames, \ +# usedIndices = get_multiview_db.select_asked_labels(cls.askedLabelsNamesSet, +# cls.availableLabelsNames, +# cls.askedLabelsNames, +# cls.fullLabels) +# cls.assertEqual(cls.availableLabelsNames, newLabelsNames) +# np.testing.assert_array_equal(usedIndices, np.arange(10)) +# np.testing.assert_array_equal(newLabels, cls.fullLabels) +# +# def test_asked_unavailable_labels(cls): +# cls.askedLabelsNamesSet = {"test_label_1", "test_label_3", +# "chicken_is_heaven"} +# with cls.assertRaises(get_multiview_db.DatasetError) as catcher: +# get_multiview_db.select_asked_labels(cls.askedLabelsNamesSet, +# cls.availableLabelsNames, +# cls.askedLabelsNames, +# cls.fullLabels) +# exception = catcher.exception +# # cls.assertTrue("Asked labels are not all available in the dataset" in exception) +# +# +# class Test_getAllLabels(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.random_state = np.random.RandomState(42) +# cls.fullLabels = cls.random_state.randint(0, 4, 10) +# cls.availableLabelsNames = ["test_label_0", "test_label_1", +# "test_label_2", "test_label_3"] +# +# def test_simple(cls): +# newLabels, newLabelsNames, usedIndices = get_multiview_db.get_all_labels( +# cls.fullLabels, cls.availableLabelsNames) +# cls.assertEqual(cls.availableLabelsNames, newLabelsNames) +# np.testing.assert_array_equal(usedIndices, np.arange(10)) +# np.testing.assert_array_equal(newLabels, cls.fullLabels) +# +# +# class Test_fillLabelNames(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.NB_CLASS = 2 +# cls.askedLabelsNames = ["test_label_1", "test_label_3"] +# cls.random_state = np.random.RandomState(42) +# cls.availableLabelsNames = ["test_label_" + str(_) for _ in range(40)] +# +# def test_simple(cls): +# askedLabelsNames, askedLabelsNamesSet = get_multiview_db.fill_label_names( +# cls.NB_CLASS, +# cls.askedLabelsNames, +# cls.random_state, +# cls.availableLabelsNames) +# cls.assertEqual(askedLabelsNames, cls.askedLabelsNames) +# cls.assertEqual(askedLabelsNamesSet, set(cls.askedLabelsNames)) +# +# def test_missing_labels_names(cls): +# cls.NB_CLASS = 39 +# askedLabelsNames, askedLabelsNamesSet = get_multiview_db.fill_label_names( +# cls.NB_CLASS, +# cls.askedLabelsNames, +# cls.random_state, +# cls.availableLabelsNames) +# +# cls.assertEqual(askedLabelsNames, +# ['test_label_1', 'test_label_3', 'test_label_35', +# 'test_label_38', 'test_label_6', 'test_label_15', +# 'test_label_32', 'test_label_28', 'test_label_8', +# 'test_label_29', 'test_label_26', 'test_label_17', +# 'test_label_19', 'test_label_10', 'test_label_18', +# 'test_label_14', 'test_label_21', 'test_label_11', +# 'test_label_34', 'test_label_0', 'test_label_27', +# 'test_label_7', 'test_label_13', 'test_label_2', +# 'test_label_39', 'test_label_23', 'test_label_4', +# 'test_label_31', 'test_label_37', 'test_label_5', +# 'test_label_36', 'test_label_25', 'test_label_33', +# 'test_label_12', 'test_label_24', 'test_label_20', +# 'test_label_22', 'test_label_9', 'test_label_16']) +# cls.assertEqual(askedLabelsNamesSet, set( +# ["test_label_" + str(_) for _ in range(30)] + [ +# "test_label_" + str(31 + _) for _ in range(9)])) +# +# def test_too_many_label_names(cls): +# cls.NB_CLASS = 2 +# cls.askedLabelsNames = ["test_label_1", "test_label_3", "test_label_4", +# "test_label_6"] +# askedLabelsNames, askedLabelsNamesSet = get_multiview_db.fill_label_names( 
+# cls.NB_CLASS, +# cls.askedLabelsNames, +# cls.random_state, +# cls.availableLabelsNames) +# cls.assertEqual(askedLabelsNames, ["test_label_3", "test_label_6"]) +# cls.assertEqual(askedLabelsNamesSet, {"test_label_3", "test_label_6"}) +# +# +# class Test_allAskedLabelsAreAvailable(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.askedLabelsNamesSet = {"test_label_1", "test_label_3"} +# cls.availableLabelsNames = ["test_label_0", "test_label_1", +# "test_label_2", "test_label_3"] +# +# def test_asked_available_labels(cls): +# cls.assertTrue( +# get_multiview_db.all_asked_labels_are_available(cls.askedLabelsNamesSet, +# cls.availableLabelsNames)) +# +# def test_asked_unavailable_label(cls): +# cls.askedLabelsNamesSet = {"test_label_1", "test_label_3", +# "chicken_is_heaven"} +# cls.assertFalse( +# get_multiview_db.all_asked_labels_are_available(cls.askedLabelsNamesSet, +# cls.availableLabelsNames)) +# +# +# class Test_getClasses(unittest.TestCase): +# +# @classmethod +# def setUpClass(cls): +# cls.random_state = np.random.RandomState(42) +# +# def test_multiclass(cls): +# labelsSet = get_multiview_db.get_classes( +# cls.random_state.randint(0, 5, 30)) +# cls.assertEqual(labelsSet, {0, 1, 2, 3, 4}) +# +# def test_biclass(cls): +# labelsSet = get_multiview_db.get_classes( +# cls.random_state.randint(0, 2, 30)) +# cls.assertEqual(labelsSet, {0, 1}) +# +# def test_one_class(cls): +# with cls.assertRaises(get_multiview_db.DatasetError) as catcher: +# get_multiview_db.get_classes(np.zeros(30, dtype=int)) +# exception = catcher.exception +# # cls.assertTrue("Dataset must have at least two different labels" in exception) # -class Test_filterLabels(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.labelsSet = set(range(4)) - cls.askedLabelsNamesSet = {"test_label_1", "test_label_3"} - cls.fullLabels = cls.random_state.randint(0, 4, 10) - cls.availableLabelsNames = ["test_label_0", "test_label_1", - "test_label_2", "test_label_3"] - cls.askedLabelsNames = ["test_label_1", "test_label_3"] - - def test_simple(cls): - newLabels, \ - newLabelsNames, \ - usedIndices = get_multiview_db.filter_labels(cls.labelsSet, - cls.askedLabelsNamesSet, - cls.fullLabels, - cls.availableLabelsNames, - cls.askedLabelsNames) - cls.assertEqual(["test_label_1", "test_label_3"], newLabelsNames) - np.testing.assert_array_equal(usedIndices, np.array([1, 5, 9])) - np.testing.assert_array_equal(newLabels, np.array([1, 1, 0])) - - def test_biclasse(cls): - cls.labelsSet = {0, 1} - cls.fullLabels = cls.random_state.randint(0, 2, 10) - cls.availableLabelsNames = ["test_label_0", "test_label_1"] - newLabels, \ - newLabelsNames, \ - usedIndices = get_multiview_db.filter_labels(cls.labelsSet, - cls.askedLabelsNamesSet, - cls.fullLabels, - cls.availableLabelsNames, - cls.askedLabelsNames) - cls.assertEqual(cls.availableLabelsNames, newLabelsNames) - np.testing.assert_array_equal(usedIndices, np.arange(10)) - np.testing.assert_array_equal(newLabels, cls.fullLabels) - - def test_asked_too_many_labels(cls): - cls.askedLabelsNamesSet = {"test_label_0", "test_label_1", - "test_label_2", "test_label_3", - "chicken_is_heaven"} - with cls.assertRaises(get_multiview_db.DatasetError) as catcher: - get_multiview_db.filter_labels(cls.labelsSet, - cls.askedLabelsNamesSet, - cls.fullLabels, - cls.availableLabelsNames, - cls.askedLabelsNames) - exception = catcher.exception - - def test_asked_all_labels(cls): - cls.askedLabelsNamesSet = {"test_label_0", 
"test_label_1", - "test_label_2", "test_label_3"} - cls.askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", - "test_label_3"] - newLabels, \ - newLabelsNames, \ - usedIndices = get_multiview_db.filter_labels(cls.labelsSet, - cls.askedLabelsNamesSet, - cls.fullLabels, - cls.availableLabelsNames, - cls.askedLabelsNames) - cls.assertEqual(cls.availableLabelsNames, newLabelsNames) - np.testing.assert_array_equal(usedIndices, np.arange(10)) - np.testing.assert_array_equal(newLabels, cls.fullLabels) - - -class Test_selectAskedLabels(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.askedLabelsNamesSet = {"test_label_1", "test_label_3"} - cls.fullLabels = cls.random_state.randint(0, 4, 10) - cls.availableLabelsNames = ["test_label_0", "test_label_1", - "test_label_2", "test_label_3"] - cls.askedLabelsNames = ["test_label_1", "test_label_3"] - - def test_simple(cls): - newLabels, \ - newLabelsNames, \ - usedIndices = get_multiview_db.select_asked_labels(cls.askedLabelsNamesSet, - cls.availableLabelsNames, - cls.askedLabelsNames, - cls.fullLabels) - cls.assertEqual(["test_label_1", "test_label_3"], newLabelsNames) - np.testing.assert_array_equal(usedIndices, np.array([1, 5, 9])) - np.testing.assert_array_equal(newLabels, np.array([1, 1, 0])) - - def test_asked_all_labels(cls): - cls.askedLabelsNamesSet = {"test_label_0", "test_label_1", - "test_label_2", "test_label_3"} - cls.askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", - "test_label_3"] - newLabels, \ - newLabelsNames, \ - usedIndices = get_multiview_db.select_asked_labels(cls.askedLabelsNamesSet, - cls.availableLabelsNames, - cls.askedLabelsNames, - cls.fullLabels) - cls.assertEqual(cls.availableLabelsNames, newLabelsNames) - np.testing.assert_array_equal(usedIndices, np.arange(10)) - np.testing.assert_array_equal(newLabels, cls.fullLabels) - - def test_asked_unavailable_labels(cls): - cls.askedLabelsNamesSet = {"test_label_1", "test_label_3", - "chicken_is_heaven"} - with cls.assertRaises(get_multiview_db.DatasetError) as catcher: - get_multiview_db.select_asked_labels(cls.askedLabelsNamesSet, - cls.availableLabelsNames, - cls.askedLabelsNames, - cls.fullLabels) - exception = catcher.exception - # cls.assertTrue("Asked labels are not all available in the dataset" in exception) - - -class Test_getAllLabels(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.fullLabels = cls.random_state.randint(0, 4, 10) - cls.availableLabelsNames = ["test_label_0", "test_label_1", - "test_label_2", "test_label_3"] - - def test_simple(cls): - newLabels, newLabelsNames, usedIndices = get_multiview_db.get_all_labels( - cls.fullLabels, cls.availableLabelsNames) - cls.assertEqual(cls.availableLabelsNames, newLabelsNames) - np.testing.assert_array_equal(usedIndices, np.arange(10)) - np.testing.assert_array_equal(newLabels, cls.fullLabels) - - -class Test_fillLabelNames(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.NB_CLASS = 2 - cls.askedLabelsNames = ["test_label_1", "test_label_3"] - cls.random_state = np.random.RandomState(42) - cls.availableLabelsNames = ["test_label_" + str(_) for _ in range(40)] - - def test_simple(cls): - askedLabelsNames, askedLabelsNamesSet = get_multiview_db.fill_label_names( - cls.NB_CLASS, - cls.askedLabelsNames, - cls.random_state, - cls.availableLabelsNames) - cls.assertEqual(askedLabelsNames, cls.askedLabelsNames) - cls.assertEqual(askedLabelsNamesSet, 
set(cls.askedLabelsNames)) - - def test_missing_labels_names(cls): - cls.NB_CLASS = 39 - askedLabelsNames, askedLabelsNamesSet = get_multiview_db.fill_label_names( - cls.NB_CLASS, - cls.askedLabelsNames, - cls.random_state, - cls.availableLabelsNames) - - cls.assertEqual(askedLabelsNames, - ['test_label_1', 'test_label_3', 'test_label_35', - 'test_label_38', 'test_label_6', 'test_label_15', - 'test_label_32', 'test_label_28', 'test_label_8', - 'test_label_29', 'test_label_26', 'test_label_17', - 'test_label_19', 'test_label_10', 'test_label_18', - 'test_label_14', 'test_label_21', 'test_label_11', - 'test_label_34', 'test_label_0', 'test_label_27', - 'test_label_7', 'test_label_13', 'test_label_2', - 'test_label_39', 'test_label_23', 'test_label_4', - 'test_label_31', 'test_label_37', 'test_label_5', - 'test_label_36', 'test_label_25', 'test_label_33', - 'test_label_12', 'test_label_24', 'test_label_20', - 'test_label_22', 'test_label_9', 'test_label_16']) - cls.assertEqual(askedLabelsNamesSet, set( - ["test_label_" + str(_) for _ in range(30)] + [ - "test_label_" + str(31 + _) for _ in range(9)])) - - def test_too_many_label_names(cls): - cls.NB_CLASS = 2 - cls.askedLabelsNames = ["test_label_1", "test_label_3", "test_label_4", - "test_label_6"] - askedLabelsNames, askedLabelsNamesSet = get_multiview_db.fill_label_names( - cls.NB_CLASS, - cls.askedLabelsNames, - cls.random_state, - cls.availableLabelsNames) - cls.assertEqual(askedLabelsNames, ["test_label_3", "test_label_6"]) - cls.assertEqual(askedLabelsNamesSet, {"test_label_3", "test_label_6"}) - - -class Test_allAskedLabelsAreAvailable(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.askedLabelsNamesSet = {"test_label_1", "test_label_3"} - cls.availableLabelsNames = ["test_label_0", "test_label_1", - "test_label_2", "test_label_3"] - - def test_asked_available_labels(cls): - cls.assertTrue( - get_multiview_db.all_asked_labels_are_available(cls.askedLabelsNamesSet, - cls.availableLabelsNames)) - - def test_asked_unavailable_label(cls): - cls.askedLabelsNamesSet = {"test_label_1", "test_label_3", - "chicken_is_heaven"} - cls.assertFalse( - get_multiview_db.all_asked_labels_are_available(cls.askedLabelsNamesSet, - cls.availableLabelsNames)) - - -class Test_getClasses(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - - def test_multiclass(cls): - labelsSet = get_multiview_db.get_classes( - cls.random_state.randint(0, 5, 30)) - cls.assertEqual(labelsSet, {0, 1, 2, 3, 4}) - - def test_biclass(cls): - labelsSet = get_multiview_db.get_classes( - cls.random_state.randint(0, 2, 30)) - cls.assertEqual(labelsSet, {0, 1}) - - def test_one_class(cls): - with cls.assertRaises(get_multiview_db.DatasetError) as catcher: - get_multiview_db.get_classes(np.zeros(30, dtype=int)) - exception = catcher.exception - # cls.assertTrue("Dataset must have at least two different labels" in exception) - -class Test_getClassicDBhdf5(unittest.TestCase): +class Test_get_classic_db_hdf5(unittest.TestCase): - @classmethod - def setUpClass(cls): + def setUp(self): rm_tmp() - if not os.path.exists("multiview_platform/tests/tmp_tests"): - os.mkdir("multiview_platform/tests/tmp_tests") - cls.dataset_file = h5py.File( - tmp_path+"test_dataset.hdf5", "w") - cls.pathF = tmp_path - cls.nameDB = "test_dataset" - cls.NB_CLASS = 2 - cls.askedLabelsNames = ["test_label_1", "test_label_3"] - cls.random_state = np.random.RandomState(42) - cls.views = ["test_view_1", "test_view_3"] - cls.metadata_group = 
cls.dataset_file.create_group("Metadata") - cls.metadata_group.attrs["nbView"] = 4 - cls.labels_dataset = cls.dataset_file.create_dataset("Labels", - data=cls.random_state.randint( - 0, 4, 10)) - cls.labels_dataset.attrs["names"] = ["test_label_0".encode(), - "test_label_1".encode(), - "test_label_2".encode(), - "test_label_3".encode()] - - for i in range(4): - cls.dataset = cls.dataset_file.create_dataset("View" + str(i), - data=cls.random_state.randint( - 0, 100, (10, 20))) - cls.dataset.attrs["name"] = "test_view_" + str(i) - - def test_simple(cls): - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( - cls.views, cls.pathF, cls.nameDB, - cls.NB_CLASS, cls.askedLabelsNames, - cls.random_state) - cls.assertEqual(dataset_file.get("View1").attrs["name"], "test_view_3") - cls.assertEqual(labels_dictionary, - {0: "test_label_1", 1: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 3) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 2) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 2) - np.testing.assert_array_equal(dataset_file.get("View0").value, - cls.dataset_file.get("View1").value[ - np.array([1, 5, 9]), :]) - - def test_all_labels_asked(cls): - askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", - "test_label_3"] - NB_CLASS = 4 - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( - cls.views, cls.pathF, cls.nameDB, - NB_CLASS, askedLabelsNames, - cls.random_state) - cls.assertEqual(dataset_name, 'test_dataset_temp_view_label_select') - cls.assertEqual(dataset_file.get("View1").attrs["name"], "test_view_3") - cls.assertEqual(labels_dictionary, - {0: "test_label_0", 1: "test_label_1", - 2: "test_label_2", 3: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 10) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 2) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 4) - np.testing.assert_array_equal(dataset_file.get("View0").value, - cls.dataset_file.get("View1").value) - - def test_all_views_asked(cls): - views = ["test_view_0", "test_view_1", "test_view_2", "test_view_3"] - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5(views, - cls.pathF, - cls.nameDB, - cls.NB_CLASS, - cls.askedLabelsNames, - cls.random_state) - for viewIndex in range(4): - np.testing.assert_array_equal( - dataset_file.get("View" + str(viewIndex)).value, - cls.dataset_file.get("View" + str(viewIndex)).value[ - np.array([1, 5, 9]), :]) - cls.assertEqual( - dataset_file.get("View" + str(viewIndex)).attrs["name"], - "test_view_" + str(viewIndex)) - cls.assertEqual(labels_dictionary, - {0: "test_label_1", 1: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 3) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 4) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 2) - - def test_asked_the_whole_dataset(cls): - askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", - "test_label_3"] - NB_CLASS = 4 - views = ["test_view_0", "test_view_1", "test_view_2", "test_view_3"] - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5(views, - cls.pathF, - cls.nameDB, - NB_CLASS, - askedLabelsNames, - cls.random_state) - for viewIndex in range(4): - np.testing.assert_array_equal( - dataset_file.get("View" + str(viewIndex)).value, - cls.dataset_file.get("View" + 
str(viewIndex))) - cls.assertEqual( - dataset_file.get("View" + str(viewIndex)).attrs["name"], - "test_view_" + str(viewIndex)) - cls.assertEqual(labels_dictionary, - {0: "test_label_0", 1: "test_label_1", - 2: "test_label_2", 3: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 10) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 4) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 4) + os.mkdir(tmp_path) + self.rs = np.random.RandomState(42) + self.nb_view = 3 + self.file_name = "test.hdf5" + self.nb_examples = 5 + self.nb_class = 3 + self.views = [self.rs.randint(0, 10, size=(self.nb_examples, 7)) + for _ in range(self.nb_view)] + self.labels = self.rs.randint(0, self.nb_class, self.nb_examples) + self.dataset_file = h5py.File(os.path.join(tmp_path, self.file_name)) + self.view_names = ["ViewN" + str(index) for index in + range(len(self.views))] + self.are_sparse = [False for _ in self.views] + for view_index, (view_name, view, is_sparse) in enumerate( + zip(self.view_names, self.views, self.are_sparse)): + view_dataset = self.dataset_file.create_dataset( + "View" + str(view_index), + view.shape, + data=view) + view_dataset.attrs["name"] = view_name + view_dataset.attrs["sparse"] = is_sparse + labels_dataset = self.dataset_file.create_dataset("Labels", + shape=self.labels.shape, + data=self.labels) + self.labels_names = [str(index) for index in np.unique(self.labels)] + labels_dataset.attrs["names"] = [label_name.encode() + for label_name in self.labels_names] + meta_data_grp = self.dataset_file.create_group("Metadata") + meta_data_grp.attrs["nbView"] = len(self.views) + meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels)) + meta_data_grp.attrs["datasetLength"] = len(self.labels) - @classmethod - def tearDownClass(cls): - os.remove( - tmp_path+"test_dataset_temp_view_label_select.hdf5") - os.remove(tmp_path+"test_dataset.hdf5") - dirs = os.listdir("multiview_platform/tests/tmp_tests") - for dir in dirs: - print(dir) - os.rmdir("multiview_platform/tests/tmp_tests") + def test_simple(self): + dataset , labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( + ["ViewN2"], tmp_path, self.file_name.split(".")[0], + self.nb_class, ["0", "2"], + self.rs, path_for_new=tmp_path) + self.assertEqual(dataset.nb_view, 1) + self.assertEqual(labels_dictionary, + {0: "0", 1: "2", 2:"1"}) + self.assertEqual(dataset.get_nb_examples(), 5) + self.assertEqual(len(np.unique(dataset.get_labels())), 3) + + + def test_all_views_asked(self): + dataset, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( + None, tmp_path, self.file_name.split(".")[0], + self.nb_class, ["0", "2"], + self.rs, path_for_new=tmp_path) + self.assertEqual(dataset.nb_view, 3) + self.assertEqual(dataset.get_view_dict(), {'ViewN0': 0, 'ViewN1': 1, 'ViewN2': 2}) + + def test_asked_the_whole_dataset(self): + dataset, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( + ["ViewN2"], tmp_path, self.file_name.split(".")[0], + self.nb_class, ["0", "2"], + self.rs, path_for_new=tmp_path, full=True) + self.assertEqual(dataset.dataset, self.dataset_file) + + def tearDown(self): + rm_tmp() -class Test_getClassicDBcsv(unittest.TestCase): +class Test_get_classic_db_csv(unittest.TestCase): - @classmethod - def setUpClass(cls): + def setUp(self): rm_tmp() - if not os.path.exists("multiview_platform/tests/tmp_tests"): - os.mkdir("multiview_platform/tests/tmp_tests") - cls.pathF = tmp_path - cls.NB_CLASS = 2 - cls.nameDB = 
"test_dataset" - cls.askedLabelsNames = ["test_label_1", "test_label_3"] - cls.random_state = np.random.RandomState(42) - cls.views = ["test_view_1", "test_view_3"] - np.savetxt(cls.pathF + cls.nameDB + "-labels-names.csv", + os.mkdir(tmp_path) + self.pathF = tmp_path + self.NB_CLASS = 2 + self.nameDB = "test_dataset" + self.askedLabelsNames = ["test_label_1", "test_label_3"] + self.random_state = np.random.RandomState(42) + self.views = ["test_view_1", "test_view_3"] + np.savetxt(self.pathF + self.nameDB + "-labels-names.csv", np.array(["test_label_0", "test_label_1", "test_label_2", "test_label_3"]), fmt="%s", delimiter=",") - np.savetxt(cls.pathF + cls.nameDB + "-labels.csv", - cls.random_state.randint(0, 4, 10), delimiter=",") - os.mkdir(cls.pathF + "Views") - cls.datas = [] + np.savetxt(self.pathF + self.nameDB + "-labels.csv", + self.random_state.randint(0, 4, 10), delimiter=",") + os.mkdir(self.pathF + "Views") + self.datas = [] for i in range(4): - data = cls.random_state.randint(0, 100, (10, 20)) - np.savetxt(cls.pathF + "Views/test_view_" + str(i) + ".csv", + data = self.random_state.randint(0, 100, (10, 20)) + np.savetxt(self.pathF + "Views/test_view_" + str(i) + ".csv", data, delimiter=",") - cls.datas.append(data) + self.datas.append(data) def test_simple(cls): - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_csv( + dataset, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_csv( cls.views, cls.pathF, cls.nameDB, cls.NB_CLASS, cls.askedLabelsNames, - cls.random_state, delimiter=",") - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 2) - cls.assertEqual(dataset_file.get("View1").attrs["name"], "test_view_3") - cls.assertEqual(dataset_file.get("View0").attrs["name"], "test_view_1") + cls.random_state, delimiter=",", path_for_new=tmp_path) + cls.assertEqual(dataset.nb_view, 2) + cls.assertEqual(dataset.get_view_dict(), {'test_view_1': 0, 'test_view_3': 1}) cls.assertEqual(labels_dictionary, {0: "test_label_1", 1: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 3) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 2) - np.testing.assert_array_equal(dataset_file.get("View0").value, - cls.datas[1][np.array([1, 5, 9]), :]) + cls.assertEqual(dataset.get_nb_examples(), 3) + cls.assertEqual(dataset.get_nb_class(), 2) - def test_all_views_asked(cls): - views = ["test_view_0", "test_view_1", "test_view_2", "test_view_3"] - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_csv(views, - cls.pathF, - cls.nameDB, - cls.NB_CLASS, - cls.askedLabelsNames, - cls.random_state, - delimiter=",") - cls.assertEqual(labels_dictionary, - {0: "test_label_1", 1: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 3) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 4) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 2) - cls.assertEqual(dataset_name,'test_dataset_temp_view_label_select') - for viewIndex in range(4): - np.testing.assert_array_equal( - dataset_file.get("View" + str(viewIndex)).value, - cls.datas[viewIndex][np.array([1, 5, 9]), :]) - cls.assertEqual( - dataset_file.get("View" + str(viewIndex)).attrs["name"], - "test_view_" + str(viewIndex)) - - def test_all_labels_asked(cls): - askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", - "test_label_3"] - NB_CLASS = 4 - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_csv( - cls.views, 
cls.pathF, cls.nameDB, - NB_CLASS, askedLabelsNames, - cls.random_state, delimiter=",") - cls.assertEqual(dataset_file.get("View1").attrs["name"], "test_view_3") - cls.assertEqual(labels_dictionary, - {0: "test_label_0", 1: "test_label_1", - 2: "test_label_2", 3: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 10) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 2) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 4) - np.testing.assert_array_equal(dataset_file.get("View0").value, - cls.datas[1]) - - def test_asked_the_whole_dataset(cls): - askedLabelsNames = ["test_label_0", "test_label_1", "test_label_2", - "test_label_3"] - NB_CLASS = 4 - views = ["test_view_0", "test_view_1", "test_view_2", "test_view_3"] - dataset_file, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_csv(views, - cls.pathF, - cls.nameDB, - NB_CLASS, - askedLabelsNames, - cls.random_state, - delimiter=",") - for viewIndex in range(4): - np.testing.assert_array_equal( - dataset_file.get("View" + str(viewIndex)).value, - cls.datas[viewIndex]) - cls.assertEqual( - dataset_file.get("View" + str(viewIndex)).attrs["name"], - "test_view_" + str(viewIndex)) - cls.assertEqual(labels_dictionary, - {0: "test_label_0", 1: "test_label_1", - 2: "test_label_2", 3: "test_label_3"}) - cls.assertEqual(dataset_file.get("Metadata").attrs["datasetLength"], 10) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbView"], 4) - cls.assertEqual(dataset_file.get("Metadata").attrs["nbClass"], 4) @classmethod - def tearDownClass(cls): + def tearDown(self): for i in range(4): os.remove( tmp_path+"Views/test_view_" + str( @@ -579,10 +452,8 @@ class Test_getClassicDBcsv(unittest.TestCase): os.remove(tmp_path+"test_dataset-labels.csv") os.remove(tmp_path+"test_dataset.hdf5") os.remove( - tmp_path+"test_dataset_temp_view_label_select.hdf5") - for file in os.listdir("multiview_platform/tests/tmp_tests"): print( - file) - os.rmdir("multiview_platform/tests/tmp_tests") + tmp_path+"test_dataset_temp_filter.hdf5") + os.rmdir(tmp_path) class Test_get_plausible_db_hdf5(unittest.TestCase): diff --git a/multiview_platform/tests/test_utils/test_dataset.py b/multiview_platform/tests/test_utils/test_dataset.py index 9b8c0023..dd6f12a5 100644 --- a/multiview_platform/tests/test_utils/test_dataset.py +++ b/multiview_platform/tests/test_utils/test_dataset.py @@ -18,23 +18,24 @@ class Test_Dataset(unittest.TestCase): cls.nb_view = 3 cls.file_name = "test.hdf5" cls.nb_examples = 5 + cls.nb_attr = 7 cls.nb_class = 3 - cls.views = [cls.rs.randint(0,10,size=(cls.nb_examples,7)) - for _ in range(cls.nb_view)] - cls.labels = cls.rs.randint(0,cls.nb_class,cls.nb_examples) + cls.views = [cls.rs.randint(0, 10, size=(cls.nb_examples, cls.nb_attr)) + for _ in range(cls.nb_view)] + cls.labels = cls.rs.randint(0, cls.nb_class, cls.nb_examples) cls.dataset_file = h5py.File(os.path.join(tmp_path, cls.file_name)) cls.view_names = ["ViewN" + str(index) for index in range(len(cls.views))] cls.are_sparse = [False for _ in cls.views] for view_index, (view_name, view, is_sparse) in enumerate( zip(cls.view_names, cls.views, cls.are_sparse)): view_dataset = cls.dataset_file.create_dataset("View" + str(view_index), - view.shape, - data=view) + view.shape, + data=view) view_dataset.attrs["name"] = view_name view_dataset.attrs["sparse"] = is_sparse labels_dataset = cls.dataset_file.create_dataset("Labels", - shape=cls.labels.shape, - data=cls.labels) + shape=cls.labels.shape, + data=cls.labels) 
cls.labels_names = [str(index) for index in np.unique(cls.labels)]
         labels_dataset.attrs["names"] = [label_name.encode()
                                          for label_name in cls.labels_names]
@@ -46,11 +47,55 @@ class Test_Dataset(unittest.TestCase):
     @classmethod
     def tearDownClass(cls):
         cls.dataset_file.close()
-        rm_tmp()
 
-    def test_simple(self):
+    def test_filter(self):
+        """Creates a new dataset file to avoid modifying the class-level one"""
+        file_name = "test_filter.hdf5"
+        dataset_file_filter = h5py.File(os.path.join(tmp_path, file_name))
+        for view_index, (view_name, view, is_sparse) in enumerate(
+                zip(self.view_names, self.views, self.are_sparse)):
+            view_dataset = dataset_file_filter.create_dataset(
+                "View" + str(view_index),
+                view.shape,
+                data=view)
+            view_dataset.attrs["name"] = view_name
+            view_dataset.attrs["sparse"] = is_sparse
+        labels_dataset = dataset_file_filter.create_dataset("Labels",
+                                                            shape=self.labels.shape,
+                                                            data=self.labels)
+        labels_dataset.attrs["names"] = [label_name.encode()
+                                         for label_name in self.labels_names]
+        meta_data_grp = dataset_file_filter.create_group("Metadata")
+        meta_data_grp.attrs["nbView"] = len(self.views)
+        meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels))
+        meta_data_grp.attrs["datasetLength"] = len(self.labels)
+        dataset_object = dataset.Dataset(hdf5_file=dataset_file_filter)
+        dataset_object.filter(np.array([0, 1, 0]), ["0", "1"], [1, 2, 3],
+                              ["ViewN0"], tmp_path)
+        self.assertEqual(dataset_object.nb_view, 1)
+        np.testing.assert_array_equal(dataset_object.get_labels(), [0, 1, 0])
+        dataset_object.dataset.close()
+        os.remove(os.path.join(tmp_path, "test_filter_temp_filter.hdf5"))
+        os.remove(os.path.join(tmp_path, "test_filter.hdf5"))
+
+    def test_for_hdf5_file(self):
         dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
 
+    def test_from_scratch(self):
+        dataset_object = dataset.Dataset(views=self.views,
+                                         labels=self.labels,
+                                         are_sparse=self.are_sparse,
+                                         file_name="from_scratch"+self.file_name,
+                                         view_names=self.view_names,
+                                         path=tmp_path,
+                                         labels_names=self.labels_names)
+        nb_class = dataset_object.get_nb_class()
+        self.assertEqual(nb_class, self.nb_class)
+        example_indices = dataset_object.init_example_indces()
+        self.assertEqual(example_indices, range(self.nb_examples))
+        view = dataset_object.get_v(0)
+        np.testing.assert_array_equal(view, self.views[0])
+
     def test_init_example_indices(self):
         example_indices = dataset.Dataset(hdf5_file=self.dataset_file).init_example_indces()
         self.assertEqual(example_indices, range(self.nb_examples))
@@ -69,20 +114,7 @@ class Test_Dataset(unittest.TestCase):
         nb_class = dataset.Dataset(hdf5_file=self.dataset_file).get_nb_class([0])
         self.assertEqual(nb_class, 1)
 
-    def test_from_scratch(self):
-        dataset_object = dataset.Dataset(views=self.views,
-                                         labels=self.labels,
-                                         are_sparse=self.are_sparse,
-                                         file_name="from_scratch"+self.file_name,
-                                         view_names=self.view_names,
-                                         path=tmp_path,
-                                         labels_names=self.labels_names)
-        nb_class = dataset_object.get_nb_class()
-        self.assertEqual(nb_class, self.nb_class)
-        example_indices = dataset_object.init_example_indces()
-        self.assertEqual(example_indices, range(self.nb_examples))
-        view = dataset_object.get_v(0)
-        np.testing.assert_array_equal(view, self.views[0])
+
 
     def test_get_view_dict(self):
         dataset_object = dataset.Dataset(views=self.views,
@@ -94,4 +126,114 @@ class Test_Dataset(unittest.TestCase):
                                          labels_names=self.labels_names)
         self.assertEqual(dataset_object.get_view_dict(), {"ViewN0":0,
                                                           "ViewN1": 1,
-                                                          "ViewN2": 2,})
\ No newline at end of file
+                                                          "ViewN2": 2,})
+
+    def test_get_label_names(self):
+        dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
+        raw_label_names = dataset_object.get_label_names(decode=False)
+        decoded_label_names = dataset_object.get_label_names()
+        restricted_label_names = dataset_object.get_label_names(example_indices=[3,4])
+        self.assertEqual(raw_label_names, [b'0', b'1', b'2'])
+        self.assertEqual(decoded_label_names, ['0', '1', '2'])
+        self.assertEqual(restricted_label_names, ['2'])
+
+    def test_get_nb_examples(self):
+        dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
+        nb_examples = dataset_object.get_nb_examples()
+        self.assertEqual(nb_examples, self.nb_examples)
+
+    def test_get_labels(self):
+        dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
+        labels = dataset_object.get_labels()
+        np.testing.assert_array_equal(labels, self.labels)
+        labels = dataset_object.get_labels([1,2,0])
+        np.testing.assert_array_equal(labels, self.labels[[1,2,0]])
+
+    def test_copy_view(self):
+        dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
+        new_dataset = h5py.File(os.path.join(tmp_path, "test_copy.hdf5"), "w")
+        dataset_object.copy_view(target_dataset=new_dataset,
+                                 source_view_name="ViewN0",
+                                 target_view_index=1)
+        self.assertIn("View1", list(new_dataset.keys()))
+        np.testing.assert_array_equal(dataset_object.get_v(0), new_dataset["View1"].value)
+        self.assertEqual(new_dataset["View1"].attrs["name"], "ViewN0")
+        new_dataset.close()
+        os.remove(os.path.join(tmp_path, "test_copy.hdf5"))
+
+    def test_get_name(self):
+        dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
+        self.assertEqual("test", dataset_object.get_name())
+
+    def test_select_labels(self):
+        dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
+        labels, label_names, indices = dataset_object.select_labels(["0", "2"])
+        np.testing.assert_array_equal(np.unique(labels), np.array([0,1]))
+        self.assertEqual(label_names, ["0","2"])
+
+    def test_check_selected_label_names(self):
+        dataset_object = dataset.Dataset(hdf5_file=self.dataset_file)
+        names = dataset_object.check_selected_label_names(nb_labels=2, random_state=self.rs)
+        self.assertEqual(names, ["1", "0"])
+        names = dataset_object.check_selected_label_names(selected_label_names=['0', '2'],
+                                                          random_state=self.rs)
+        self.assertEqual(names, ["0", "2"])
+
+    def test_select_views_and_labels(self):
+        file_name = "test_filter.hdf5"
+        dataset_file_select = h5py.File(os.path.join(tmp_path, file_name))
+        for view_index, (view_name, view, is_sparse) in enumerate(
+                zip(self.view_names, self.views, self.are_sparse)):
+            view_dataset = dataset_file_select.create_dataset(
+                "View" + str(view_index),
+                view.shape,
+                data=view)
+            view_dataset.attrs["name"] = view_name
+            view_dataset.attrs["sparse"] = is_sparse
+        labels_dataset = dataset_file_select.create_dataset("Labels",
+                                                            shape=self.labels.shape,
+                                                            data=self.labels)
+        labels_dataset.attrs["names"] = [label_name.encode()
+                                         for label_name in self.labels_names]
+        meta_data_grp = dataset_file_select.create_group("Metadata")
+        meta_data_grp.attrs["nbView"] = len(self.views)
+        meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels))
+        meta_data_grp.attrs["datasetLength"] = len(self.labels)
+        dataset_object = dataset.Dataset(hdf5_file=dataset_file_select)
+        names = dataset_object.select_views_and_labels(nb_labels=2, view_names=["ViewN0"], random_state=self.rs, path_for_new=tmp_path)
+        self.assertEqual(names, {0: '2', 1: '1'})
+        self.assertEqual(dataset_object.nb_view, 1)
+        dataset_object.dataset.close()
+        
os.remove(os.path.join(tmp_path, "test_filter_temp_filter.hdf5")) + os.remove(os.path.join(tmp_path, "test_filter.hdf5")) + + def test_add_gaussian_noise(self): + file_name = "test_noise.hdf5" + dataset_file_select = h5py.File(os.path.join(tmp_path, file_name)) + limits = np.zeros((self.nb_attr, 2)) + limits[:, 1] += 100 + meta_data_grp = dataset_file_select.create_group("Metadata") + for view_index, (view_name, view, is_sparse) in enumerate( + zip(self.view_names, self.views, self.are_sparse)): + view_dataset = dataset_file_select.create_dataset( + "View" + str(view_index), + view.shape, + data=view) + view_dataset.attrs["name"] = view_name + view_dataset.attrs["sparse"] = is_sparse + meta_data_grp.create_dataset("View"+str(view_index)+"_limits", data= limits) + labels_dataset = dataset_file_select.create_dataset("Labels", + shape=self.labels.shape, + data=self.labels) + labels_dataset.attrs["names"] = [label_name.encode() + for label_name in self.labels_names] + meta_data_grp.attrs["nbView"] = len(self.views) + meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels)) + meta_data_grp.attrs["datasetLength"] = len(self.labels) + dataset_object = dataset.Dataset(hdf5_file=dataset_file_select) + dataset_object.add_gaussian_noise(self.rs, tmp_path) + dataset_object.dataset.close() + os.remove(os.path.join(tmp_path, "test_noise_noised.hdf5")) + os.remove(os.path.join(tmp_path, "test_noise.hdf5")) + + diff --git a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py index 17e52901..7c384488 100644 --- a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py +++ b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py @@ -7,6 +7,7 @@ from sklearn.model_selection import StratifiedKFold from ..utils import rm_tmp, tmp_path +from ...mono_multi_view_classifiers.utils.dataset import Dataset from ...mono_multi_view_classifiers.utils import hyper_parameter_search from ...mono_multi_view_classifiers.multiview_classifiers import weighted_linear_early_fusion @@ -27,10 +28,12 @@ class Test_randomized_search(unittest.TestCase): view0 = cls.dataset_file.create_dataset("View0", data=cls.view0_data) view0.attrs["sparse"] = False + view0.attrs["name"] = "ViewN0" cls.view1_data = cls.random_state.randint(1, 10, size=(10, 4)) view1 = cls.dataset_file.create_dataset("View1", data=cls.view1_data) view1.attrs["sparse"] = False + view1.attrs["name"] = "ViewN1" metaDataGrp = cls.dataset_file.create_group("Metadata") metaDataGrp.attrs["nbView"] = 2 metaDataGrp.attrs["nbClass"] = 2 @@ -41,6 +44,7 @@ class Test_randomized_search(unittest.TestCase): "splitter": "best"} cls.k_folds = StratifiedKFold(n_splits=3, random_state=cls.random_state) cls.learning_indices = np.array([1,2,3,4, 5,6,7,8,9]) + cls.dataset = Dataset(hdf5_file=cls.dataset_file) @classmethod def tearDownClass(cls): @@ -53,6 +57,6 @@ class Test_randomized_search(unittest.TestCase): def test_simple(self): best_params, test_folds_preds = hyper_parameter_search.randomized_search( - self.dataset_file, self.labels.value, "multiview", self.random_state, tmp_path, + self.dataset, self.labels.value, "multiview", self.random_state, tmp_path, weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds, 1, ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices) diff --git a/multiview_platform/tests/utils.py b/multiview_platform/tests/utils.py index cc77a9be..5a373cae 100644 --- a/multiview_platform/tests/utils.py +++ 
b/multiview_platform/tests/utils.py
@@ -1,7 +1,12 @@
 import os
+import numpy as np
+import h5py
+
+from ..mono_multi_view_classifiers.utils.dataset import Dataset
 
 tmp_path = "multiview_platform/tests/tmp_tests/"
+test_dataset = Dataset(hdf5_file=h5py.File("multiview_platform/tests/test_database.hdf5", "r"))
 
 def rm_tmp():
     try:
@@ -10,3 +15,35 @@
         os.rmdir(tmp_path)
     except:
         pass
+
+
+def gen_test_dataset(random_state=np.random.RandomState(42)):
+    # Write the fixture where the module-level test_dataset reads it,
+    # i.e. relative to the repository root like tmp_path above.
+    dataset_file = h5py.File("multiview_platform/tests/test_database.hdf5", "w")
+    view_names = ["ViewN0", "ViewN1", "ViewN2"]
+    views = [random_state.randint(0,100,(5,6))
+             for _ in range(len(view_names))]
+    labels = random_state.randint(0,2, 5)
+    label_names = ["yes", "no"]
+    for view_index, (view_name, view) in enumerate(
+            zip(view_names, views)):
+        view_dataset = dataset_file.create_dataset("View" + str(view_index),
+                                                   view.shape,
+                                                   data=view)
+        view_dataset.attrs["name"] = view_name
+        view_dataset.attrs["sparse"] = False
+    labels_dataset = dataset_file.create_dataset("Labels",
+                                                 shape=labels.shape,
+                                                 data=labels)
+    labels_dataset.attrs["names"] = [label_name.encode()
+                                     if not isinstance(label_name, bytes)
+                                     else label_name
+                                     for label_name in label_names]
+    meta_data_grp = dataset_file.create_group("Metadata")
+    meta_data_grp.attrs["nbView"] = len(views)
+    meta_data_grp.attrs["nbClass"] = len(np.unique(labels))
+    meta_data_grp.attrs["datasetLength"] = len(labels)
+    dataset_file.close()
+
+
+if __name__ == "__main__":
+    gen_test_dataset()
-- 
GitLab
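
Note on the HDF5 fixture layout: every test in this patch builds its file the same way, and the Dataset wrapper reads that structure back: one "View<i>" dataset per view carrying "name" and "sparse" attributes, a "Labels" dataset whose "names" attribute stores the encoded label names, and a "Metadata" group with "nbView", "nbClass" and "datasetLength" attributes. The following is a minimal standalone sketch of that layout, mirroring gen_test_dataset above; it assumes only h5py and numpy, and the output path "example_database.hdf5" is an arbitrary placeholder rather than a path used by the platform.

    import h5py
    import numpy as np

    random_state = np.random.RandomState(42)
    views = [random_state.randint(0, 100, (5, 6)) for _ in range(3)]
    labels = random_state.randint(0, 2, 5)
    label_names = ["yes", "no"]

    with h5py.File("example_database.hdf5", "w") as dataset_file:
        for view_index, view in enumerate(views):
            # One dataset per view, named "View<i>"; the display name and
            # sparsity flag travel as HDF5 attributes.
            view_dataset = dataset_file.create_dataset(
                "View" + str(view_index), view.shape, data=view)
            view_dataset.attrs["name"] = "ViewN" + str(view_index)
            view_dataset.attrs["sparse"] = False
        # Label names are stored encoded; Dataset.get_label_names() decodes them.
        labels_dataset = dataset_file.create_dataset("Labels",
                                                     shape=labels.shape,
                                                     data=labels)
        labels_dataset.attrs["names"] = [name.encode() for name in label_names]
        # The Metadata group carries the counts the platform reads back.
        meta_data_grp = dataset_file.create_group("Metadata")
        meta_data_grp.attrs["nbView"] = len(views)
        meta_data_grp.attrs["nbClass"] = len(np.unique(labels))
        meta_data_grp.attrs["datasetLength"] = len(labels)

A file written this way can then be wrapped as Dataset(hdf5_file=h5py.File(path, "r")), which is exactly how tests/utils.py builds the shared test_dataset fixture used throughout these tests.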