diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index a874c2df756bb4b9bf7cc222c08cd43f9e977e4e..4e0f6146214e674c4c0fae99f06f937873d90894 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -23,7 +23,7 @@ doc:
     - export LC_ALL=$(locale -a | grep en_US)
     - export LANG=$(locale -a | grep en_US)
     - pip3 install -e . --no-deps
-    - sphinx-apidoc -o docs/source multiview_platform
+    - sphinx-apidoc -o docs/source summit
     - cd docs/source
     - sphinx-build -b html . ../build
     - cd ../..
@@ -45,7 +45,7 @@ pages:
     - export LANG=$(locale -a | grep en_US)
     - pip3 install -e . --no-deps
     - pytest-3
-    - sphinx-apidoc -o docs/source multiview_platform
+    - sphinx-apidoc -o docs/source summit
     - cd docs/source
     - sphinx-build -b html . ../build
     - cd ../..
diff --git a/README.md b/README.md
index 09aa84b0e8317eb954025da5de7e85dcda98707f..9a15dc2951a17f6820550af09ed9d46edaa4d37c 100644
--- a/README.md
+++ b/README.md
@@ -71,7 +71,7 @@ to read it carefully before playing around with the parameters.
 You can create your own configuration file. In order to run the platform with
 it, run :
 ```python
-from multiview_platform.execute import execute
+from summit.execute import execute
 execute(config_path="/absolute/path/to/your/config/file")
 ```
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 01c4690c87846617580ecf3d26d6be7b778d329b..2ab75d491a5e0f55b7fa97b147eaabf0c8e183ac 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -22,11 +22,11 @@ import os
 import sys
 sys.path.insert(0, os.path.abspath('.'))
-sys.path.insert(0, os.path.abspath('../../multiview_platform'))
+sys.path.insert(0, os.path.abspath('../../summit'))
 sys.path.insert(0, os.path.abspath('../..'))
 file_loc = os.path.split(__file__)[0]
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(file_loc), '.')))
-# import multiview_platform
+# import summit

 # -- General configuration ------------------------------------------------

 # If your documentation needs a minimal Sphinx version, state it here.
diff --git a/multiview_platform/__init__.py b/multiview_platform/__init__.py
deleted file mode 100644
index f51f7a705fcb8d3f34a1083b6f1753b2098d4520..0000000000000000000000000000000000000000
--- a/multiview_platform/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-"""This is a test docstring to test stuff"""
-
-__version__ = "0.0.0.0"
-
-from . import mono_multi_view_classifiers, execute
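The hunks above rename the package from `multiview_platform` to `summit` in the CI script, the README, and the Sphinx configuration, and start deleting the old package tree. As a rough sketch of what calling code looks like after the rename — assuming `summit.execute.execute` keeps the signature shown in the README hunk and still resolves the `"example N"` shortcut strings handled by the (deleted, see below) `execute.py` — usage would be:

```python
# Sketch only: assumes the renamed package keeps the same entry point
# and keyword argument as the old multiview_platform.execute.
from summit.execute import execute

# Run with a user-supplied configuration file (an absolute path is advised,
# as the example configs themselves recommend).
execute(config_path="/absolute/path/to/your/config/file")

# If the shortcut names from execute.py survive the move, a bundled example
# configuration can be run by name instead of by path.
execute(config_path="example 1")
```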
diff --git a/multiview_platform/examples/__init__.py b/multiview_platform/examples/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/multiview_platform/examples/config_files/config_example_0.yml b/multiview_platform/examples/config_files/config_example_0.yml
deleted file mode 100644
index d16f5843c1a5ddfd8e564e874f7b02f94b4d8f08..0000000000000000000000000000000000000000
--- a/multiview_platform/examples/config_files/config_example_0.yml
+++ /dev/null
@@ -1,78 +0,0 @@
-# The base configuration of the benchmark
-
-# Enable logging
-log: True
-# The name of each dataset in the directory on which the benchmark should be run
-name: "digits_doc"
-# A label for the resul directory
-label: "example_0"
-# The type of dataset, currently supported ".hdf5", and ".csv"
-file_type: ".hdf5"
-# The views to use in the banchmark, an empty value will result in using all the views
-views:
-# The path to the directory where the datasets are stored, an absolute path is advised
-pathf: "examples/data/"
-# The niceness of the processes, useful to lower their priority
-nice: 0
-# The random state of the benchmark, useful for reproducibility
-random_state: 42
-# The number of parallel computing threads
-nb_cores: 1
-# Used to run the benchmark on the full dataset
-full: True
-# Used to be able to run more than one benchmark per minute
-debug: False
-# The directory in which the results will be stored, an absolute path is advised
-res_dir: "examples/results/example_0/"
-# If an error occurs in a classifier, if track_tracebacks is set to True, the
-# benchmark saves the traceback and continues, if it is set to False, it will
-# stop the benchmark and raise the error
-track_tracebacks: True
-
-# All the classification-realted configuration options
-
-# The ratio of test examples/number of train examples
-split: 0.25
-# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed
-nb_folds: 2
-# The number of classes to select in the dataset
-nb_class:
-# The name of the classes to select in the dataset
-classes:
-# The type of algorithms to run during the benchmark (monoview and/or multiview)
-type: ["monoview","multiview"]
-# The name of the monoview algorithms to run, ["all"] to run all the available classifiers
-algos_monoview: ["decision_tree"]
-# The names of the multiview algorithms to run, ["all"] to run all the available classifiers
-algos_multiview: ["weighted_linear_early_fusion", "weighted_linear_late_fusion",]
-# The number of times the benchamrk is repeated with different train/test
-# split, to have more statistically significant results
-stats_iter: 1
-# The metrics that will be use din the result analysis
-metrics:
-  accuracy_score: {}
-  f1_score:
-    average: "micro"
-# The metric that will be used in the hyper-parameter optimization process
-metric_princ: "accuracy_score"
-# The type of hyper-parameter optimization method
-hps_type: "None"
-# The number of iteration in the hyper-parameter optimization process
-hps_args: {}
-
-### Configuring the hyper-parameters for the classifiers
-
-decision_tree:
-  max_depth: 3
-
-weighted_linear_early_fusion:
-  monoview_classifier_name: "decision_tree"
-  monoview_classifier_config:
-    decision_tree:
-      max_depth: 6
-
-weighted_linear_late_fusion:
-  classifiers_names: "decision_tree"
-  classifier_configs:
-    decision_tree:
-      max_depth: 3
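The deleted `config_example_0.yml` above is the fully commented base configuration (the `digits_doc` dataset, a decision tree, and two linear fusion methods, with no hyper-parameter search). A hedged sketch of how such a file can be inspected before launching a benchmark — assuming PyYAML is available (the platform reads `.yml` configurations, presumably through it) and using the pre-rename path of the file shown above:

```python
# Sketch only: load the example configuration and check a few fields.
# The expected values below are taken directly from the deleted file.
import yaml

with open("multiview_platform/examples/config_files/config_example_0.yml") as config_file:
    config = yaml.safe_load(config_file)

# Keys mirror the commented options in the YAML above.
assert config["name"] == "digits_doc"
assert config["algos_monoview"] == ["decision_tree"]
assert config["split"] == 0.25 and config["nb_folds"] == 2
```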
diff --git a/multiview_platform/examples/config_files/config_example_1.yml b/multiview_platform/examples/config_files/config_example_1.yml
deleted file mode 100644
index fb9ab405aa0015885cccc08941b50c1c0188e9b7..0000000000000000000000000000000000000000
--- a/multiview_platform/examples/config_files/config_example_1.yml
+++ /dev/null
@@ -1,78 +0,0 @@
-# The base configuration of the benchmark
-
-# Enable logging
-log: True
-# The name of each dataset in the directory on which the benchmark should be run
-name: "doc_summit"
-# A label for the resul directory
-label: "example_1"
-# The type of dataset, currently supported ".hdf5", and ".csv"
-file_type: ".hdf5"
-# The views to use in the banchmark, an empty value will result in using all the views
-views:
-# The path to the directory where the datasets are stored, an absolute path is advised
-pathf: "examples/data/mkljlj"
-# The niceness of the processes, useful to lower their priority
-nice: 0
-# The random state of the benchmark, useful for reproducibility
-random_state: 42
-# The number of parallel computing threads
-nb_cores: 1
-# Used to run the benchmark on the full dataset
-full: True
-# Used to be able to run more than one benchmark per minute
-debug: False
-# The directory in which the results will be stored, an absolute path is advised
-res_dir: "examples/results/example_1/"
-# If an error occurs in a classifier, if track_tracebacks is set to True, the
-# benchmark saves the traceback and continues, if it is set to False, it will
-# stop the benchmark and raise the error
-track_tracebacks: True
-
-# All the classification-realted configuration options
-
-# The ratio of test examples/number of train examples
-split: 0.35
-# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed
-nb_folds: 2
-# The number of classes to select in the dataset
-nb_class:
-# The name of the classes to select in the dataset
-classes:
-# The type of algorithms to run during the benchmark (monoview and/or multiview)
-type: ["monoview","multiview"]
-# The name of the monoview algorithms to run, ["all"] to run all the available classifiers
-algos_monoview: ["decision_tree"]
-# The names of the multiview algorithms to run, ["all"] to run all the available classifiers
-algos_multiview: ["weighted_linear_late_fusion",]
-# The number of times the benchamrk is repeated with different train/test
-# split, to have more statistically significant results
-stats_iter: 1
-# The metrics that will be use din the result analysis
-metrics:
-  accuracy_score: {}
-  f1_score:
-    average: "micro"
-# The metric that will be used in the hyper-parameter optimization process
-metric_princ: "accuracy_score"
-# The type of hyper-parameter optimization method
-hps_type: "None"
-# The number of iteration in the hyper-parameter optimization process
-hps_args: {}
-
-### Configuring the hyper-parameters for the classifiers
-
-decision_tree:
-  max_depth: 3
-
-weighted_linear_early_fusion:
-  monoview_classifier_name: "decision_tree"
-  monoview_classifier_config:
-    decision_tree:
-      max_depth: 6
-
-weighted_linear_late_fusion:
-  classifiers_names: "decision_tree"
-  classifier_configs:
-    decision_tree:
-      max_depth: 3
diff --git a/multiview_platform/examples/config_files/config_example_2_1_1.yml b/multiview_platform/examples/config_files/config_example_2_1_1.yml
deleted file mode 100644
index b1a9e2e7379828d8b5a7ef1c1fd22c80862e2b6e..0000000000000000000000000000000000000000
--- a/multiview_platform/examples/config_files/config_example_2_1_1.yml
+++ /dev/null
@@
-1,83 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: True -# The name of each dataset in the directory on which the benchmark should be run -name: "doc_summit" -# A label for the resul directory -label: "example_2_1_1" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "examples/results/example_2_1_1/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# If the dataset is multiclass, will use this multiclass-to-biclass method -multiclass_method: "oneVersusOne" -# The ratio number of test exmaples/number of train examples -split: 0.8 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 2 -# The number of classes to select in the dataset -nb_class: 2 -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree", "adaboost", ] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_late_fusion", ] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 1 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: None -# The number of iteration in the hyper-parameter optimization process -hps_args: {} - -decision_tree: - max_depth: 3 - -adaboost: - base_estimator: "DecisionTreeClassifier" - n_estimators: 50 - -weighted_linear_late_fusion: - classifiers_names: "decision_tree" - classifier_configs: - decision_tree: - max_depth: 2 - - -# The following arguments are classifier-specific, and are documented in each -# of the corresponding modules. - -# In order to run multiple sets of parameters, use multiple values in the -# following lists, and set hps_type to None. 
diff --git a/multiview_platform/examples/config_files/config_example_2_1_2.yml b/multiview_platform/examples/config_files/config_example_2_1_2.yml deleted file mode 100644 index 256e18a8ade378f29c32ead31fcbba28b1db62b3..0000000000000000000000000000000000000000 --- a/multiview_platform/examples/config_files/config_example_2_1_2.yml +++ /dev/null @@ -1,83 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: True -# The name of each dataset in the directory on which the benchmark should be run -name: "doc_summit" -# A label for the resul directory -label: "example_2_1_2" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "examples/results/example_2_1_2/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# If the dataset is multiclass, will use this multiclass-to-biclass method -multiclass_method: "oneVersusOne" -# The ratio number of test exmaples/number of train examples -split: 0.2 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 2 -# The number of classes to select in the dataset -nb_class: 2 -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree", "adaboost", ] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_late_fusion", ] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 1 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: None -# The number of iteration in the hyper-parameter optimization process -hps_args: {} - -decision_tree: - max_depth: 3 - -adaboost: - base_estimator: "DecisionTreeClassifier" - n_estimators: 50 - -weighted_linear_late_fusion: - classifiers_names: "decision_tree" - classifier_configs: - decision_tree: - max_depth: 2 - - -# The following arguments are classifier-specific, and are documented in each -# of the corresponding modules. - -# In order to run multiple sets of parameters, use multiple values in the -# following lists, and set hps_type to None. 
diff --git a/multiview_platform/examples/config_files/config_example_2_2_1.yml b/multiview_platform/examples/config_files/config_example_2_2_1.yml deleted file mode 100644 index d462bee16fcf44ecf939c14776435ef15ef8c3ae..0000000000000000000000000000000000000000 --- a/multiview_platform/examples/config_files/config_example_2_2_1.yml +++ /dev/null @@ -1,71 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: True -# The name of each dataset in the directory on which the benchmark should be run -name: "doc_summit" -# A label for the result directory -label: "example_2_2_1" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "examples/results/example_2_2_1/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# If the dataset is multiclass, will use this multiclass-to-biclass method -multiclass_method: "oneVersusOne" -# The ratio number of test exmaples/number of train examples -split: 0.8 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 5 -# The number of classes to select in the dataset -nb_class: 2 -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree", "adaboost", ] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_late_fusion", ] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 1 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: 'Random' -# The number of iteration in the hyper-parameter optimization process -hps_args: - n_iter: 5 - equivalent_draws: True - -# The following arguments are classifier-specific, and are documented in each -# of the corresponding modules. - -# In order to run multiple sets of parameters, use multiple values in the -# following lists, and set hps_type to None. 
diff --git a/multiview_platform/examples/config_files/config_example_2_3.yml b/multiview_platform/examples/config_files/config_example_2_3.yml deleted file mode 100644 index bb8fb31ef226ae547d01a909b9eef108df0ba998..0000000000000000000000000000000000000000 --- a/multiview_platform/examples/config_files/config_example_2_3.yml +++ /dev/null @@ -1,87 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: True -# The name of each dataset in the directory on which the benchmark should be run -name: "doc_summit" -# A label for the result directory -label: "example_2_3" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "examples/results/example_2_3/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# If the dataset is multiclass, will use this multiclass-to-biclass method -multiclass_method: "oneVersusOne" -# The ratio number of test exmaples/number of train examples -split: 0.8 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 5 -# The number of classes to select in the dataset -nb_class: 2 -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree", "adaboost", ] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_late_fusion", ] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 1 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: 'Grid' -# The number of iteration in the hyper-parameter optimization process -hps_args: - decision_tree: - max_depth: [1,2,3,4,5] - - adaboost: - n_estimators: [10,15,20,25] - - weighted_linear_late_fusion: - classifiers_names: - - ["decision_tree", "decision_tree", "decision_tree", "decision_tree"] - - ["adaboost", "adaboost", "adaboost", "adaboost",] - - classifier_configs: - - decision_tree: - max_depth: 3 - adaboost: - n_estimators: 10 - - - -# The following arguments are classifier-specific, and are documented in each -# of the corresponding modules. 
- -# In order to run multiple sets of parameters, use multiple values in the -# following lists, and set hps_type to None. diff --git a/multiview_platform/examples/config_files/config_example_3.yml b/multiview_platform/examples/config_files/config_example_3.yml deleted file mode 100644 index 67ef06ea9b4b9da10e781133d20e0bb0fcaf1abb..0000000000000000000000000000000000000000 --- a/multiview_platform/examples/config_files/config_example_3.yml +++ /dev/null @@ -1,84 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: True -# The name of each dataset in the directory on which the benchmark should be run -name: "doc_summit" -# A label for the result directory -label: "example_3" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "examples/results/example_3/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# If the dataset is multiclass, will use this multiclass-to-biclass method -multiclass_method: "oneVersusOne" -# The ratio number of test exmaples/number of train examples -split: 0.8 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 5 -# The number of classes to select in the dataset -nb_class: 2 -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree", "adaboost", ] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_late_fusion", ] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 5 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: 'None' -# The number of iteration in the hyper-parameter optimization process -hps_args: {} - -decision_tree: - max_depth: 3 - -adaboost: - base_estimator: "DecisionTreeClassifier" - n_estimators: 10 - -weighted_linear_late_fusion: - classifiers_names: "decision_tree" - classifier_configs: - decision_tree: - max_depth: 2 - - - -# The following arguments are classifier-specific, and are documented in each -# of the corresponding modules. 
- -# In order to run multiple sets of parameters, use multiple values in the -# following lists, and set hps_type to None. diff --git a/multiview_platform/examples/data/digits_doc.hdf5 b/multiview_platform/examples/data/digits_doc.hdf5 deleted file mode 100644 index 61e452b9118eeabc4972c11803a8bba775dc3301..0000000000000000000000000000000000000000 Binary files a/multiview_platform/examples/data/digits_doc.hdf5 and /dev/null differ diff --git a/multiview_platform/examples/data/doc_summit.hdf5 b/multiview_platform/examples/data/doc_summit.hdf5 deleted file mode 100644 index 8400dd06429e58d67c98d7b9b1689a534b42e0d8..0000000000000000000000000000000000000000 Binary files a/multiview_platform/examples/data/doc_summit.hdf5 and /dev/null differ diff --git a/multiview_platform/execute.py b/multiview_platform/execute.py deleted file mode 100644 index c43c4362c9a7f2755be31da611348d22e6cb2b12..0000000000000000000000000000000000000000 --- a/multiview_platform/execute.py +++ /dev/null @@ -1,31 +0,0 @@ -"""This is the execution module, used to execute the code""" - -import os - - -def execute(config_path=None): # pragma: no cover - import sys - - from multiview_platform.mono_multi_view_classifiers import exec_classif - if config_path is None: - exec_classif.exec_classif(sys.argv[1:]) - else: - if config_path == "example 0": - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_0.yml") - elif config_path == "example 1": - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_1.yml") - elif config_path == "example 2.1.1": - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_1_1.yml") - elif config_path == "example 2.1.2": - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_1_2.yml") - elif config_path == "example 2.2": - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_2.yml") - elif config_path == "example 2.3": - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_2_3.yml") - elif config_path == "example 3": - config_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "examples", "config_files", "config_example_3.yml") - exec_classif.exec_classif(["--config_path", config_path]) - - -if __name__ == "__main__": - execute() diff --git a/multiview_platform/mono_multi_view_classifiers/__init__.py b/multiview_platform/mono_multi_view_classifiers/__init__.py deleted file mode 100644 index 9e2c30f3a193aff8b3a8c59f345c15dcff74c7ed..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from . 
import exec_classif, result_analysis, metrics, monoview_classifiers, \ - monoview, multiview, utils, multiview_classifiers - -__all__ = ['metrics', 'monoview', 'monoview_classifiers', 'multiview', 'utils'] diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py deleted file mode 100644 index 91d931bec67a0a77b9da9732402c180f8784257c..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py +++ /dev/null @@ -1,814 +0,0 @@ -import itertools -import logging -import os -import pkgutil -import time -import traceback - -import matplotlib -import numpy as np -from sklearn.tree import DecisionTreeClassifier - -# Import own modules -from . import monoview_classifiers -from . import multiview_classifiers -from .monoview.exec_classif_mono_view import exec_monoview -from .multiview.exec_multiview import exec_multiview -from .result_analysis.execution import analyze_iterations, analyze -from .utils import execution, dataset, configuration -from .utils.organization import secure_file_path -from .utils.dataset import delete_HDF5 - -matplotlib.use( - 'Agg') # Anti-Grain Geometry C++ library to make a raster (pixel) image of the figure - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def init_benchmark(cl_type, monoview_algos, multiview_algos): - r"""Used to create a list of all the algorithm packages names used for the benchmark. - - First this function will check if the benchmark need mono- or/and multiview - algorithms and adds to the right - dictionary the asked algorithms. If none is asked by the user, all will be added. - - If the keyword `"Benchmark"` is used, all mono- and multiview algorithms will be added. - - Parameters - ---------- - cl_type : List of string - List of types of needed benchmark - multiview_algos : List of strings - List of multiview algorithms needed for the benchmark - monoview_algos : Listof strings - List of monoview algorithms needed for the benchmark - args : ParsedArgumentParser args - All the input args (used to tune the algorithms) - - Returns - ------- - benchmark : Dictionary of dictionaries - Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. 
- """ - benchmark = {"monoview": {}, "multiview": {}} - - if "monoview" in cl_type: - if monoview_algos == ['all']: # pragma: no cover - benchmark["monoview"] = [name for _, name, isPackage in - pkgutil.iter_modules( - monoview_classifiers.__path__) - if not isPackage] - - else: - benchmark["monoview"] = monoview_algos - - if "multiview" in cl_type: - if multiview_algos == ["all"]: # pragma: no cover - benchmark["multiview"] = [name for _, name, isPackage in - pkgutil.iter_modules( - multiview_classifiers.__path__) - if not isPackage] - else: - benchmark["multiview"] = multiview_algos - return benchmark - - -def init_argument_dictionaries(benchmark, views_dictionary, - nb_class, init_kwargs, hps_method, - hps_kwargs): # pragma: no cover - argument_dictionaries = {"monoview": [], "multiview": []} - if benchmark["monoview"]: - argument_dictionaries["monoview"] = init_monoview_exps( - benchmark["monoview"], - views_dictionary, - nb_class, - init_kwargs["monoview"], hps_method, hps_kwargs) - if benchmark["multiview"]: - argument_dictionaries["multiview"] = init_multiview_exps( - benchmark["multiview"], - views_dictionary, - nb_class, - init_kwargs["multiview"], hps_method, hps_kwargs) - return argument_dictionaries - - -def init_multiview_exps(classifier_names, views_dictionary, nb_class, - kwargs_init, hps_method, hps_kwargs): # pragma: no cover - multiview_arguments = [] - for classifier_name in classifier_names: - arguments = get_path_dict(kwargs_init[classifier_name]) - if hps_method == "Grid": - multiview_arguments += [ - gen_single_multiview_arg_dictionary(classifier_name, - arguments, - nb_class, - {"param_grid":hps_kwargs[classifier_name]}, - views_dictionary=views_dictionary)] - elif hps_method == "Random": - hps_kwargs = dict((key, value) - for key, value in hps_kwargs.items() - if key in ["n_iter", "equivalent_draws"]) - multiview_arguments += [ - gen_single_multiview_arg_dictionary(classifier_name, - arguments, - nb_class, - hps_kwargs, - views_dictionary=views_dictionary)] - elif hps_method == "None": - multiview_arguments += [ - gen_single_multiview_arg_dictionary(classifier_name, - arguments, - nb_class, - hps_kwargs, - views_dictionary=views_dictionary)] - else: - raise ValueError('At the moment only "None", "Random" or "Grid" ' - 'are available as hyper-parameter search ' - 'methods, sadly "{}" is not'.format(hps_method) - ) - - return multiview_arguments - - -def init_monoview_exps(classifier_names, - views_dictionary, nb_class, kwargs_init, hps_method, - hps_kwargs): # pragma: no cover - r"""Used to add each monoview exeperience args to the list of monoview experiences args. - - First this function will check if the benchmark need mono- or/and multiview algorithms and adds to the right - dictionary the asked algorithms. If none is asked by the user, all will be added. - - If the keyword `"Benchmark"` is used, all mono- and multiview algorithms will be added. - - Parameters - ---------- - classifier_names : dictionary - All types of monoview and multiview experiments that have to be benchmarked - argument_dictionaries : dictionary - Maps monoview and multiview experiments arguments. - views_dictionary : dictionary - Maps the view names to their index in the HDF5 dataset - nb_class : integer - Number of different labels in the classification - - Returns - ------- - benchmark : Dictionary of dictionaries - Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. 
- """ - monoview_arguments = [] - for view_name, view_index in views_dictionary.items(): - for classifier_name in classifier_names: - if hps_method == "Grid": - arguments = gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - {"param_grid": - hps_kwargs[classifier_name]}) - elif hps_method == "Random": - hps_kwargs = dict((key, value) - for key, value in hps_kwargs.items() - if key in ["n_iter", "equivalent_draws"]) - arguments = gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - hps_kwargs) - elif hps_method == "None": - arguments = gen_single_monoview_arg_dictionary(classifier_name, - kwargs_init, - nb_class, - view_index, - view_name, - hps_kwargs) - - else: - raise ValueError('At the moment only "None", "Random" or "Grid" ' - 'are available as hyper-parameter search ' - 'methods, sadly "{}" is not'.format(hps_method) - ) - monoview_arguments.append(arguments) - return monoview_arguments - - -def gen_single_monoview_arg_dictionary(classifier_name, arguments, nb_class, - view_index, view_name, hps_kwargs): - if classifier_name in arguments: - classifier_config = dict((key, value) for key, value in arguments[ - classifier_name].items()) - else: - classifier_config = {} - return {classifier_name: classifier_config, - "view_name": view_name, - "view_index": view_index, - "classifier_name": classifier_name, - "nb_class": nb_class, - "hps_kwargs":hps_kwargs } - - -def gen_single_multiview_arg_dictionary(classifier_name, arguments, nb_class, - hps_kwargs, views_dictionary=None): - return {"classifier_name": classifier_name, - "view_names": list(views_dictionary.keys()), - 'view_indices': list(views_dictionary.values()), - "nb_class": nb_class, - "labels_names": None, - "hps_kwargs": hps_kwargs, - classifier_name: extract_dict(arguments) - } - - -def extract_dict(classifier_config): - """Reverse function of get_path_dict""" - extracted_dict = {} - for key, value in classifier_config.items(): - extracted_dict = set_element(extracted_dict, key, value) - return extracted_dict - - -def set_element(dictionary, path, value): - """Set value in dictionary at the location indicated by path""" - existing_keys = path.split(".")[:-1] - dict_state = dictionary - for existing_key in existing_keys: - if existing_key in dict_state: - dict_state = dict_state[existing_key] - else: - dict_state[existing_key] = {} - dict_state = dict_state[existing_key] - dict_state[path.split(".")[-1]] = value - return dictionary - - -def get_path_dict(multiview_classifier_args): - """This function is used to generate a dictionary with each key being - the path to the value. - If given {"key1":{"key1_1":value1}, "key2":value2}, it will return - {"key1.key1_1":value1, "key2":value2}""" - path_dict = dict( - (key, value) for key, value in multiview_classifier_args.items()) - paths = is_dict_in(path_dict) - while paths: - for path in paths: - for key, value in path_dict[path].items(): - path_dict[".".join([path, key])] = value - path_dict.pop(path) - paths = is_dict_in(path_dict) - return path_dict - - -def is_dict_in(dictionary): - """ - Returns True if any of the dictionary value is a dictionary itself. 
- - Parameters - ---------- - dictionary - - Returns - ------- - - """ - paths = [] - for key, value in dictionary.items(): - if isinstance(value, dict): - paths.append(key) - return paths - -def init_kwargs(args, classifiers_names, framework="monoview"): - r"""Used to init kwargs thanks to a function in each monoview classifier package. - - Parameters - ---------- - args : parsed args objects - All the args passed by the user. - classifiers_names : list of strings - List of the benchmarks's monoview classifiers names. - - Returns - ------- - kwargs : Dictionary - Dictionary resuming all the specific arguments for the benchmark, one dictionary for each classifier. - - For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`""" - - logging.debug("Start:\t Initializing monoview classifiers arguments") - kwargs = {} - for classifiers_name in classifiers_names: - try: - if framework == "monoview": - getattr(monoview_classifiers, classifiers_name) - else: - getattr(multiview_classifiers, classifiers_name) - except AttributeError: - raise AttributeError( - classifiers_name + " is not implemented in monoview_classifiers, " - "please specify the name of the file in monoview_classifiers") - if classifiers_name in args: - kwargs[classifiers_name] = args[classifiers_name] - else: - kwargs[classifiers_name] = {} - logging.debug("Done:\t Initializing monoview classifiers arguments") - - return kwargs - - -def init_kwargs_func(args, benchmark): - """ - Dispached the kwargs initialization to monoview and multiview and creates - the kwargs variable - - Parameters - ---------- - args : parsed args objects - All the args passed by the user. - - benchmark : dict - The name of the mono- and mutli-view classifiers to run in the benchmark - - Returns - ------- - - kwargs : dict - The arguments for each mono- and multiview algorithms - """ - monoview_kwargs = init_kwargs(args, benchmark["monoview"], - framework="monoview") - multiview_kwargs = init_kwargs(args, benchmark["multiview"], - framework="multiview") - kwargs = {"monoview": monoview_kwargs, "multiview": multiview_kwargs} - return kwargs - - -def arange_metrics(metrics, metric_princ): - """Used to get the metrics list in the right order so that - the first one is the principal metric specified in args - - Parameters - ---------- - metrics : dict - The metrics that will be used in the benchmark - - metric_princ : str - The name of the metric that need to be used for the hyper-parameter - optimization process - - Returns - ------- - metrics : list of lists - The metrics list, but arranged so the first one is the principal one.""" - if metric_princ in metrics: - metrics = dict((key, value) if not key == metric_princ else (key+"*", value) for key, value in metrics.items()) - else: - raise ValueError("{} not in metric pool ({})".format(metric_princ, - metrics)) - return metrics - - -def benchmark_init(directory, classification_indices, labels, labels_dictionary, - k_folds, dataset_var): - """ - Initializes the benchmark, by saving the indices of the train - examples and the cross validation folds. 
- - Parameters - ---------- - directory : str - The benchmark's result directory - - classification_indices : numpy array - The indices of the examples, splitted for the train/test split - - labels : numpy array - The labels of the dataset - - labels_dictionary : dict - The dictionary with labels as keys and their names as values - - k_folds : sklearn.model_selection.Folds object - The folds for the cross validation process - - Returns - ------- - - """ - logging.debug("Start:\t Benchmark initialization") - secure_file_path(os.path.join(directory, "train_labels.csv")) - train_indices = classification_indices[0] - train_labels = dataset_var.get_labels(example_indices=train_indices) - np.savetxt(os.path.join(directory, "train_labels.csv"), train_labels, - delimiter=",") - np.savetxt(os.path.join(directory, "train_indices.csv"), - classification_indices[0], - delimiter=",") - results_monoview = [] - folds = k_folds.split(np.arange(len(train_labels)), train_labels) - min_fold_len = int(len(train_labels) / k_folds.n_splits) - for fold_index, (train_cv_indices, test_cv_indices) in enumerate(folds): - file_name = os.path.join(directory, "folds", "test_labels_fold_" + str( - fold_index) + ".csv") - secure_file_path(file_name) - np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]], - delimiter=",") - labels_names = list(labels_dictionary.values()) - logging.debug("Done:\t Benchmark initialization") - return results_monoview, labels_names - - -# def exec_one_benchmark(core_index=-1, labels_dictionary=None, directory=None, -# classification_indices=None, args=None, -# k_folds=None, random_state=None, hyper_param_search=None, -# metrics=None, argument_dictionaries=None, -# benchmark=None, views=None, views_indices=None, flag=None, -# labels=None, -# exec_monoview_multicore=exec_monoview_multicore, -# exec_multiview_multicore=exec_multiview_multicore,): -# """Used to run a benchmark using one core. 
ExecMonoview_multicore, initMultiviewArguments and -# exec_multiview_multicore args are only used for tests""" -# -# results_monoview, labels_names = benchmark_init(directory, -# classification_indices, labels, -# labels_dictionary, k_folds) -# -# logging.debug("Start:\t monoview benchmark") -# results_monoview += [ -# exec_monoview_multicore(directory, args["name"], labels_names, -# classification_indices, k_folds, -# core_index, args["file_type"], args["pathf"], random_state, -# labels, -# hyper_param_search=hyper_param_search, -# metrics=metrics, -# n_iter=args["hps_iter"], **argument) -# for argument in argument_dictionaries["Monoview"]] -# logging.debug("Done:\t monoview benchmark") -# -# -# logging.debug("Start:\t multiview benchmark") -# results_multiview = [ -# exec_multiview_multicore(directory, core_index, args["name"], -# classification_indices, k_folds, args["file_type"], -# args["pathf"], labels_dictionary, random_state, -# labels, hyper_param_search=hyper_param_search, -# metrics=metrics, n_iter=args["hps_iter"], -# **arguments) -# for arguments in argument_dictionaries["multiview"]] -# logging.debug("Done:\t multiview benchmark") -# -# return [flag, results_monoview + results_multiview] -# -# -# def exec_one_benchmark_multicore(nb_cores=-1, labels_dictionary=None, -# directory=None, classification_indices=None, -# args=None, -# k_folds=None, random_state=None, -# hyper_param_search=None, metrics=None, -# argument_dictionaries=None, -# benchmark=None, views=None, views_indices=None, -# flag=None, labels=None, -# exec_monoview_multicore=exec_monoview_multicore, -# exec_multiview_multicore=exec_multiview_multicore,): -# """Used to run a benchmark using multiple cores. ExecMonoview_multicore, initMultiviewArguments and -# exec_multiview_multicore args are only used for tests""" -# -# results_monoview, labels_names = benchmark_init(directory, -# classification_indices, labels, -# labels_dictionary, k_folds) -# -# logging.debug("Start:\t monoview benchmark") -# nb_experiments = len(argument_dictionaries["monoview"]) -# nb_multicore_to_do = int(math.ceil(float(nb_experiments) / nb_cores)) -# for step_index in range(nb_multicore_to_do): -# results_monoview += (Parallel(n_jobs=nb_cores)( -# delayed(exec_monoview_multicore)(directory, args["name"], labels_names, -# classification_indices, k_folds, -# core_index, args["file_type"], args["pathf"], -# random_state, labels, -# hyper_param_search=hyper_param_search, -# metrics=metrics, -# n_iter=args["hps_iter"], -# **argument_dictionaries["monoview"][ -# core_index + step_index * nb_cores]) -# for core_index in -# range(min(nb_cores, nb_experiments - step_index * nb_cores)))) -# logging.debug("Done:\t monoview benchmark") -# -# logging.debug("Start:\t multiview arguments initialization") -# # argument_dictionaries = initMultiviewArguments(args, benchmark, views, -# # views_indices, -# # argument_dictionaries, -# # random_state, directory, -# # resultsMonoview, -# # classification_indices) -# logging.debug("Done:\t multiview arguments initialization") -# -# logging.debug("Start:\t multiview benchmark") -# results_multiview = [] -# nb_experiments = len(argument_dictionaries["multiview"]) -# nb_multicore_to_do = int(math.ceil(float(nb_experiments) / nb_cores)) -# for step_index in range(nb_multicore_to_do): -# results_multiview += Parallel(n_jobs=nb_cores)( -# delayed(exec_multiview_multicore)(directory, core_index, args["name"], -# classification_indices, k_folds, -# args["file_type"], args["Base"]["pathf"], -# labels_dictionary, 
random_state, -# labels, -# hyper_param_search=hyper_param_search, -# metrics=metrics, -# n_iter=args["hps_iter"], -# ** -# argument_dictionaries["multiview"][ -# step_index * nb_cores + core_index]) -# for core_index in -# range(min(nb_cores, nb_experiments - step_index * nb_cores))) -# logging.debug("Done:\t multiview benchmark") -# -# return [flag, results_monoview + results_multiview] - - -def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, - directory=None, classification_indices=None, - args=None, - k_folds=None, random_state=None, - hyper_param_search=None, metrics=None, - argument_dictionaries=None, - benchmark=None, views=None, views_indices=None, - flag=None, labels=None, - track_tracebacks=False): # pragma: no cover - results_monoview, labels_names = benchmark_init(directory, - classification_indices, - labels, - labels_dictionary, k_folds, - dataset_var) - logging.getLogger('matplotlib.font_manager').disabled = True - logging.debug("Start:\t monoview benchmark") - traceback_outputs = {} - for arguments in argument_dictionaries["monoview"]: - try: - X = dataset_var.get_v(arguments["view_index"]) - Y = dataset_var.get_labels() - results_monoview += [ - exec_monoview(directory, X, Y, args["name"], labels_names, - classification_indices, k_folds, - 1, args["file_type"], args["pathf"], random_state, - hyper_param_search=hyper_param_search, - metrics=metrics, - **arguments)] - except: - if track_tracebacks: - traceback_outputs[ - arguments["classifier_name"] + "-" + arguments[ - "view_name"]] = traceback.format_exc() - else: - raise - - logging.debug("Done:\t monoview benchmark") - - logging.debug("Start:\t multiview arguments initialization") - - # argument_dictionaries = initMultiviewArguments(args, benchmark, views, - # views_indices, - # argument_dictionaries, - # random_state, directory, - # resultsMonoview, - # classification_indices) - logging.debug("Done:\t multiview arguments initialization") - - logging.debug("Start:\t multiview benchmark") - results_multiview = [] - for arguments in argument_dictionaries["multiview"]: - try: - results_multiview += [ - exec_multiview(directory, dataset_var, args["name"], - classification_indices, - k_folds, 1, args["file_type"], - args["pathf"], labels_dictionary, random_state, - labels, - hps_method=hyper_param_search, - metrics=metrics, n_iter=args["hps_iter"], - **arguments)] - except: - if track_tracebacks: - traceback_outputs[ - arguments["classifier_name"]] = traceback.format_exc() - else: - raise - logging.debug("Done:\t multiview benchmark") - - return [flag, results_monoview + results_multiview, traceback_outputs] - - -def exec_benchmark(nb_cores, stats_iter, - benchmark_arguments_dictionaries, - directory, metrics, dataset_var, track_tracebacks, - exec_one_benchmark_mono_core=exec_one_benchmark_mono_core, - analyze=analyze, delete=delete_HDF5, - analyze_iterations=analyze_iterations): # pragma: no cover - r"""Used to execute the needed benchmark(s) on multicore or mono-core functions. - - Parameters - ---------- - nb_cores : int - Number of threads that the benchmarks can use. - stats_iter : int - Number of statistical iterations that have to be done. - benchmark_arguments_dictionaries : list of dictionaries - All the needed arguments for the benchmarks. - classification_indices : list of lists of numpy.ndarray - For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and - the ones of the testing set. 
- directories : list of strings - List of the paths to the result directories for each statistical iteration. - directory : string - Path to the main results directory. - multi_class_labels : ist of lists of numpy.ndarray - For each label couple, for each statistical iteration a triplet of numpy.ndarrays is stored with the - indices for the biclass training set, the ones for the biclass testing set and the ones for the - multiclass testing set. - metrics : list of lists - metrics that will be used to evaluate the algorithms performance. - labels_dictionary : dictionary - Dictionary mapping labels indices to labels names. - nb_labels : int - Total number of different labels in the dataset. - dataset_var : HDF5 dataset file - The full dataset that wil be used by the benchmark. - classifiers_names : list of strings - List of the benchmarks's monoview classifiers names. - rest_of_the_args : - Just used for testing purposes - - - Returns - ------- - results : list of lists - The results of the benchmark. - """ - logging.debug("Start:\t Executing all the needed benchmarks") - results = [] - # if nb_cores > 1: - # if stats_iter > 1 or nb_multiclass > 1: - # nb_exps_to_do = len(benchmark_arguments_dictionaries) - # nb_multicore_to_do = range(int(math.ceil(float(nb_exps_to_do) / nb_cores))) - # for step_index in nb_multicore_to_do: - # results += (Parallel(n_jobs=nb_cores)(delayed(exec_one_benchmark) - # (core_index=core_index, - # ** - # benchmark_arguments_dictionaries[ - # core_index + step_index * nb_cores]) - # for core_index in range( - # min(nb_cores, nb_exps_to_do - step_index * nb_cores)))) - # else: - # results += [exec_one_benchmark_multicore(nb_cores=nb_cores, ** - # benchmark_arguments_dictionaries[0])] - # else: - for arguments in benchmark_arguments_dictionaries: - benchmark_results = exec_one_benchmark_mono_core( - dataset_var=dataset_var, - track_tracebacks=track_tracebacks, - **arguments) - analyze_iterations([benchmark_results], - benchmark_arguments_dictionaries, stats_iter, - metrics, example_ids=dataset_var.example_ids, - labels=dataset_var.get_labels()) - results += [benchmark_results] - logging.debug("Done:\t Executing all the needed benchmarks") - - # Do everything with flagging - logging.debug("Start:\t Analyzing predictions") - results_mean_stds = analyze(results, stats_iter, - benchmark_arguments_dictionaries, - metrics, - directory, - dataset_var.example_ids, - dataset_var.get_labels()) - logging.debug("Done:\t Analyzing predictions") - delete(benchmark_arguments_dictionaries, nb_cores, dataset_var) - return results_mean_stds - - -def exec_classif(arguments): # pragma: no cover - """ - Runs the benchmark with the given arguments - - Parameters - ---------- - arguments : - - Returns - ------- - - - >>> exec_classif([--config_path, /path/to/config/files/]) - >>> - """ - start = time.time() - args = execution.parse_the_args(arguments) - args = configuration.get_the_args(args.config_path) - os.nice(args["nice"]) - nb_cores = args["nb_cores"] - if nb_cores == 1: - os.environ['OPENBLAS_NUM_THREADS'] = '1' - stats_iter = args["stats_iter"] - hps_method = args["hps_type"] - hps_kwargs = args["hps_args"] - cl_type = args["type"] - monoview_algos = args["algos_monoview"] - multiview_algos = args["algos_multiview"] - path, dataset_list = execution.find_dataset_names(args["pathf"], - args["file_type"], - args["name"]) - args["pathf"] = path - for dataset_name in dataset_list: - # noise_results = [] - # for noise_std in args["noise_std"]: - - directory = 
execution.init_log_file(dataset_name, args["views"], - args["file_type"], - args["log"], args["debug"], - args["label"], - args["res_dir"], - args) - - random_state = execution.init_random_state(args["random_state"], - directory) - stats_iter_random_states = execution.init_stats_iter_random_states( - stats_iter, - random_state) - - get_database = execution.get_database_function(dataset_name, - args["file_type"]) - - dataset_var, labels_dictionary, datasetname = get_database( - args["views"], - args["pathf"], dataset_name, - args["nb_class"], - args["classes"], - random_state, - args["full"], - ) - args["name"] = datasetname - splits = execution.gen_splits(dataset_var.get_labels(), - args["split"], - stats_iter_random_states) - - # multiclass_labels, labels_combinations, indices_multiclass = multiclass.gen_multiclass_labels( - # dataset_var.get_labels(), multiclass_method, splits) - - k_folds = execution.gen_k_folds(stats_iter, args["nb_folds"], - stats_iter_random_states) - - dataset_files = dataset.init_multiple_datasets(args["pathf"], - args["name"], - nb_cores) - - views, views_indices, all_views = execution.init_views(dataset_var, - args[ - "views"]) - views_dictionary = dataset_var.get_view_dict() - nb_views = len(views) - nb_class = dataset_var.get_nb_class() - - metrics = args["metrics"] - if metrics == "all": - metrics_names = [name for _, name, isPackage - in pkgutil.iter_modules( - [os.path.join(os.path.dirname( - os.path.dirname(os.path.realpath(__file__))), - 'metrics')]) if - not isPackage and name not in ["framework", - "log_loss", - "matthews_corrcoef", - "roc_auc_score"]] - metrics = dict((metric_name, {}) - for metric_name in metrics_names) - metrics = arange_metrics(metrics, args["metric_princ"]) - - benchmark = init_benchmark(cl_type, monoview_algos, multiview_algos,) - init_kwargs = init_kwargs_func(args, benchmark) - data_base_time = time.time() - start - argument_dictionaries = init_argument_dictionaries( - benchmark, views_dictionary, - nb_class, init_kwargs, hps_method, hps_kwargs) - # argument_dictionaries = initMonoviewExps(benchmark, viewsDictionary, - # NB_CLASS, initKWARGS) - directories = execution.gen_direcorties_names(directory, stats_iter) - benchmark_argument_dictionaries = execution.gen_argument_dictionaries( - labels_dictionary, directories, - splits, - hps_method, args, k_folds, - stats_iter_random_states, metrics, - argument_dictionaries, benchmark, - views, views_indices) - results_mean_stds = exec_benchmark( - nb_cores, stats_iter, - benchmark_argument_dictionaries, directory, metrics, - dataset_var, - args["track_tracebacks"]) - # noise_results.append([noise_std, results_mean_stds]) - # plot_results_noise(directory, noise_results, metrics[0][0], - # dataset_name) diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/__init__.py b/multiview_platform/mono_multi_view_classifiers/metrics/__init__.py deleted file mode 100644 index 4a7ca0b0f318e8483b6bc7cb464621ea27257f05..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -__version__ = "0.0.0.0" -""" -To be able to add another metric to the benchmark you must : - -Create a .py file named after the metric -Define a score function - Input : - y_true : np array with the real labels - y_pred : np array with the predicted labels - kwargs : every argument that is specific to the metric - Returns: - score : the metric's score (float) -Define a get_scorer function - Input : - kwargs : every argument that 
is specific to the metric - Returns : - scorer : an object similar to an sk-learn scorer -Define a getConfig function - Input : - kwargs : every argument that is specific to the metric - Output : - config_string : A string that gives the name of the metric and explains how it is configured. Must end by - (lower is better) or (higher is better) to be able to analyze the preds -""" - -import os - -for module in os.listdir(os.path.dirname(os.path.realpath(__file__))): - if module in ['__init__.py'] or module[-3:] != '.py': - continue - __import__(module[:-3], locals(), globals(), [], 1) - pass -del os diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/accuracy_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/accuracy_score.py deleted file mode 100644 index e9faae69ed7dd7c8a33dabbd43da6f78a80b7ab7..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/accuracy_score.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Functions : - score: to get the accuracy score - get_scorer: returns a sklearn scorer for grid search -""" - -import warnings - -from sklearn.metrics import accuracy_score as metric -from sklearn.metrics import make_scorer - -warnings.warn("the accuracy_score module is deprecated", DeprecationWarning, - stacklevel=2) - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=False, **kwargs): - """Arguments: - y_true: real labels - y_pred: predicted labels - - Keyword Arguments: - "0": weights to compute accuracy - - Returns: - Weighted accuracy score for y_true, y_pred""" - score = metric(y_true, y_pred, **kwargs) - return score - - -def get_scorer(**kwargs): - """Keyword Arguments: - "0": weights to compute accuracy - - Returns: - A weighted sklearn scorer for accuracy""" - return make_scorer(metric, greater_is_better=True, - **kwargs) - - -def get_config(**kwargs): - config_string = "Accuracy score using {}, (higher is better)".format(kwargs) - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/f1_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/f1_score.py deleted file mode 100644 index 6b9b89df0e5556ea89617f558d309e113fbf47d0..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/f1_score.py +++ /dev/null @@ -1,31 +0,0 @@ -"""Functions : - score: to get the f1 score - get_scorer: returns a sklearn scorer for grid search -""" - -import warnings - -from sklearn.metrics import f1_score as metric -from sklearn.metrics import make_scorer - -warnings.warn("the f1_score module is deprecated", DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=True, average='micro', **kwargs): - score = metric(y_true, y_pred, average=average, **kwargs) - return score - - -def get_scorer(average="micro", **kwargs): - return make_scorer(metric, greater_is_better=True, average=average, - **kwargs) - - -def get_config(average="micro", **kwargs, ): - config_string = "F1 score using average: {}, {} (higher is better)".format( - average, kwargs) - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/fbeta_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/fbeta_score.py deleted file mode 100644 index 
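For reference, adding a metric only required the three functions listed in the deleted `metrics/__init__.py` docstring above (`score`, `get_scorer`, `get_config`), as the deleted `accuracy_score.py` and `f1_score.py` show. Below is a minimal sketch of such a module, assuming an illustrative `balanced_accuracy` module built on sklearn's `balanced_accuracy_score`; neither the module name nor the metric choice comes from the original package.

```python
# Minimal metric module following the protocol documented in the deleted
# metrics/__init__.py. The module name and the chosen sklearn metric are
# illustrative, not part of the original package.
from sklearn.metrics import balanced_accuracy_score as metric
from sklearn.metrics import make_scorer


def score(y_true, y_pred, multiclass=False, **kwargs):
    """Return the balanced accuracy of y_pred against y_true."""
    return metric(y_true, y_pred, **kwargs)


def get_scorer(**kwargs):
    """Return an sklearn scorer usable by the hyper-parameter search."""
    return make_scorer(metric, greater_is_better=True, **kwargs)


def get_config(**kwargs):
    """Describe the metric; the string must end with the comparison hint."""
    return "Balanced accuracy using {} (higher is better)".format(kwargs)
```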
60a5141aa538ad4d204a705c18085de876066173..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/fbeta_score.py +++ /dev/null @@ -1,27 +0,0 @@ -import warnings - -from sklearn.metrics import fbeta_score as metric -from sklearn.metrics import make_scorer - -warnings.warn("the fbeta_score module is deprecated", DeprecationWarning, - stacklevel=2) - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, beta=2.0, average="micro", **kwargs): - score = metric(y_true, y_pred, beta=beta, average=average, **kwargs) - return score - - -def get_scorer(beta=2.0, average="micro", **kwargs): - return make_scorer(metric, greater_is_better=True, beta=beta, - average=average, **kwargs) - - -def get_config(beta=2.0, average="micro", **kwargs): - config_string = "F-beta score using beta: {}, average: {}, {} (higher is better)".format( - beta, average, kwargs) - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/hamming_loss.py b/multiview_platform/mono_multi_view_classifiers/metrics/hamming_loss.py deleted file mode 100644 index 665dd243721d3d93e121b7d010f21c44dc3c528c..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/hamming_loss.py +++ /dev/null @@ -1,24 +0,0 @@ -import warnings - -from sklearn.metrics import hamming_loss as metric -from sklearn.metrics import make_scorer - -warnings.warn("the hamming_loss module is deprecated", DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=False, **kwargs): - score = metric(y_true, y_pred, **kwargs) - return score - - -def get_scorer(**kwargs): - return make_scorer(metric, greater_is_better=False, **kwargs) - - -def get_config(**kwargs): - config_string = "Hamming loss using {} (lower is better)".format(kwargs) - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/jaccard_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/jaccard_score.py deleted file mode 100644 index 248ec66d70b0d46ae8bc4ffbbe624a22008eebc6..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/jaccard_score.py +++ /dev/null @@ -1,27 +0,0 @@ -import warnings - -from sklearn.metrics import jaccard_score as metric -from sklearn.metrics import make_scorer - -warnings.warn("the jaccard_similarity_score module is deprecated", - DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=False, **kwargs): - score = metric(y_true, y_pred, **kwargs) - return score - - -def get_scorer(**kwargs): - return make_scorer(metric, greater_is_better=True, - **kwargs) - - -def get_config(**kwargs): - config_string = "Jaccard_similarity score using {} (higher is better)".format( - kwargs) - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/log_loss.py b/multiview_platform/mono_multi_view_classifiers/metrics/log_loss.py deleted file mode 100644 index 2b5ab917d973e9a1e62437ea497c0a40d75b81e3..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/log_loss.py +++ /dev/null @@ -1,25 +0,0 @@ -import warnings - -from sklearn.metrics import log_loss as metric 
-from sklearn.metrics import make_scorer - -warnings.warn("the log_loss module is deprecated", DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=False, **kwargs): - score = metric(y_true, y_pred, **kwargs) - return score - - -def get_scorer(**kwargs): - return make_scorer(metric, greater_is_better=False, - **kwargs) - - -def get_config(**kwargs): - config_string = "Log loss using {} (lower is better)".format(kwargs) - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/matthews_corrcoef.py b/multiview_platform/mono_multi_view_classifiers/metrics/matthews_corrcoef.py deleted file mode 100644 index b3b8ec6c125a867cf3a1c4a1f9b41b51ed4129c8..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/matthews_corrcoef.py +++ /dev/null @@ -1,24 +0,0 @@ -import warnings - -from sklearn.metrics import make_scorer -from sklearn.metrics import matthews_corrcoef as metric - -warnings.warn("the matthews_corrcoef module is deprecated", DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=False, **kwargs): - score = metric(y_true, y_pred) - return score - - -def get_scorer(**kwargs): - return make_scorer(metric, greater_is_better=True) - - -def get_config(**kwargs): - config_string = "Matthews correlation coefficient (higher is better)" - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/precision_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/precision_score.py deleted file mode 100644 index d1c861f91a39441a961ff2ff2ef3e79aafbe060e..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/precision_score.py +++ /dev/null @@ -1,26 +0,0 @@ -import warnings - -from sklearn.metrics import make_scorer -from sklearn.metrics import precision_score as metric - -warnings.warn("the precision_score module is deprecated", DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, average='micro', multiclass=False, **kwargs): - score = metric(y_true, y_pred, average=average, **kwargs) - return score - - -def get_scorer(average='micro', **kwargs): - return make_scorer(metric, greater_is_better=True, - average=average, **kwargs) - - -def get_config(average='micro', **kwargs): - config_string = "Precision score using average: {}, {} (higher is better)".format( - average, kwargs) - return config_string diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/recall_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/recall_score.py deleted file mode 100644 index 261261990b060b3b759e6013647f3285fd9c9e2c..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/recall_score.py +++ /dev/null @@ -1,26 +0,0 @@ -import warnings - -from sklearn.metrics import make_scorer -from sklearn.metrics import recall_score as metric - -warnings.warn("the recall_score module is deprecated", DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, average='micro', **kwargs): - score = 
metric(y_true, y_pred, average=average, **kwargs) - return score - - -def get_scorer(average='micro', **kwargs): - return make_scorer(metric, greater_is_better=True, - average=average, **kwargs) - - -def get_config(average="micro", **kwargs): - configString = "Recall score using average: {}, {} (higher is better)".format( - average, kwargs) - return configString diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/roc_auc_score.py b/multiview_platform/mono_multi_view_classifiers/metrics/roc_auc_score.py deleted file mode 100644 index ae21428b347caef47dc3bcc596404ea6d85c5dd5..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/roc_auc_score.py +++ /dev/null @@ -1,26 +0,0 @@ -import warnings - -from sklearn.metrics import make_scorer -from sklearn.metrics import roc_auc_score as metric -from sklearn.preprocessing import MultiLabelBinarizer - -warnings.warn("the roc_auc_score module is deprecated", DeprecationWarning, - stacklevel=2) -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=False, **kwargs): - score = metric(y_true, y_pred, **kwargs) - return score - - -def get_scorer(**kwargs): - return make_scorer(metric, greater_is_better=True, - **kwargs) - - -def get_config(**kwargs): - configString = "ROC_AUC score using {}".format(kwargs) - return configString diff --git a/multiview_platform/mono_multi_view_classifiers/metrics/zero_one_loss.py b/multiview_platform/mono_multi_view_classifiers/metrics/zero_one_loss.py deleted file mode 100644 index e3a3449247edf934251ddbc4dbb8283bbf632746..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/metrics/zero_one_loss.py +++ /dev/null @@ -1,26 +0,0 @@ -import warnings - -from sklearn.metrics import make_scorer -from sklearn.metrics import zero_one_loss as metric - -warnings.warn("the zero_one_loss module is deprecated", DeprecationWarning, - stacklevel=2) - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def score(y_true, y_pred, multiclass=False, **kwargs): - score = metric(y_true, y_pred, **kwargs) - return score - - -def get_scorer(**kwargs): - return make_scorer(metric, greater_is_better=False, - **kwargs) - - -def get_config(**kwargs): - configString = "Zero_one loss using {} (lower is better)".format(kwargs) - return configString diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/__init__.py b/multiview_platform/mono_multi_view_classifiers/monoview/__init__.py deleted file mode 100644 index e94c149514edbf920daebd101e425a0e22c03d02..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# from . 
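The `get_scorer` functions in the metric modules above all return ordinary sklearn scorer objects, so they plug directly into sklearn's model-selection tools. A small standalone illustration, using a placeholder dataset and estimator and rebuilding the scorer that the deleted `f1_score` module produced:

```python
# Using a scorer built the same way the deleted metric modules build theirs.
# The dataset and estimator are placeholders; make_scorer(f1_score,
# average="micro") reproduces what metrics/f1_score.get_scorer() returned.
from sklearn.datasets import make_classification
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=42)
scorer = make_scorer(f1_score, greater_is_better=True, average="micro")
search = RandomizedSearchCV(DecisionTreeClassifier(random_state=42),
                            {"max_depth": list(range(1, 10))},
                            n_iter=5, scoring=scorer, cv=3, random_state=42)
search.fit(X, y)
print(search.best_params_, search.best_score_)
```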
import ExecClassifMonoView, MonoviewUtils, analyzeResult diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py deleted file mode 100644 index eed469493e0bb863b2940683a022678675ca15bb..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py +++ /dev/null @@ -1,255 +0,0 @@ -#!/usr/bin/env python - -""" Execution: Script to perform a MonoView classification """ - -import logging # To create Log-Files -# Import built-in modules -import os # to geth path of the running script -import time # for time calculations - -import h5py -# Import 3rd party modules -import numpy as np # for reading CSV-files and Series - -from .monoview_utils import MonoviewResult, MonoviewResultAnalyzer -# Import own modules -from .. import monoview_classifiers -from ..utils import hyper_parameter_search -from ..utils.dataset import extract_subset, HDF5Dataset -from ..utils.multiclass import get_mc_estim -from ..utils.organization import secure_file_path - -# Author-Info -__author__ = "Nikolas Huelsmann, Baptiste BAUVIN" -__status__ = "Prototype" # Production, Development, Prototype - - -# __date__ = 2016 - 03 - 25 - - -def exec_monoview_multicore(directory, name, labels_names, - classification_indices, - k_folds, dataset_file_index, database_type, - path, random_state, labels, - hyper_param_search="randomized_search", - metrics=[["accuracy_score", None]], n_iter=30, - **args): # pragma: no cover - dataset_var = HDF5Dataset( - hdf5_file=h5py.File(path + name + str(dataset_file_index) + ".hdf5", - "r")) - neededViewIndex = args["view_index"] - X = dataset_var.get_v(neededViewIndex) - Y = labels - return exec_monoview(directory, X, Y, name, labels_names, - classification_indices, k_folds, 1, database_type, - path, - random_state, hyper_param_search=hyper_param_search, - metrics=metrics, n_iter=n_iter, - view_name=dataset_var.get_view_name( - args["view_index"]), - **args) - - -def exec_monoview(directory, X, Y, database_name, labels_names, classification_indices, - k_folds, nb_cores, databaseType, path, - random_state, hyper_param_search="Random", - metrics={"accuracy_score*":{}}, n_iter=30, view_name="", - hps_kwargs={}, **args): - logging.debug("Start:\t Loading data") - kwargs, \ - t_start, \ - view_name, \ - classifier_name, \ - X, \ - learningRate, \ - labelsString, \ - output_file_name,\ - directory,\ - base_file_name = init_constants(args, X, classification_indices, - labels_names, - database_name, directory, view_name, ) - logging.debug("Done:\t Loading data") - - logging.debug( - "Info:\t Classification - Database:" + str(database_name) + " View:" + str( - view_name) + " train ratio:" - + str(learningRate) + ", CrossValidation k-folds: " + str( - k_folds.n_splits) + ", cores:" - + str(nb_cores) + ", algorithm : " + classifier_name) - - logging.debug("Start:\t Determine Train/Test split") - X_train, y_train, X_test, y_test = init_train_test(X, Y, - classification_indices) - - logging.debug("Info:\t Shape X_train:" + str( - X_train.shape) + ", Length of y_train:" + str(len(y_train))) - logging.debug("Info:\t Shape X_test:" + str( - X_test.shape) + ", Length of y_test:" + str(len(y_test))) - logging.debug("Done:\t Determine Train/Test split") - - logging.debug("Start:\t Generate classifier args") - classifier_module = getattr(monoview_classifiers, classifier_name) - classifier_class_name = 
classifier_module.classifier_class_name - hyper_param_beg = time.monotonic() - cl_kwargs = get_hyper_params(classifier_module, hyper_param_search, - classifier_name, - classifier_class_name, - X_train, y_train, - random_state, output_file_name, - k_folds, nb_cores, metrics, kwargs, - **hps_kwargs) - hyper_param_duration = time.monotonic() - hyper_param_beg - logging.debug("Done:\t Generate classifier args") - - logging.debug("Start:\t Training") - - classifier = get_mc_estim(getattr(classifier_module, - classifier_class_name) - (random_state, **cl_kwargs), - random_state, - y=Y) - fit_beg = time.monotonic() - classifier.fit(X_train, y_train) # NB_CORES=nbCores, - fit_duration = time.monotonic() - fit_beg - logging.debug("Done:\t Training") - - logging.debug("Start:\t Predicting") - train_pred = classifier.predict(X_train) - pred_beg = time.monotonic() - test_pred = classifier.predict(X_test) - pred_duration = time.monotonic() - pred_beg - - # Filling the full prediction in the right order - full_pred = np.zeros(Y.shape, dtype=int) - 100 - for trainIndex, index in enumerate(classification_indices[0]): - full_pred[index] = train_pred[trainIndex] - for testIndex, index in enumerate(classification_indices[1]): - full_pred[index] = test_pred[testIndex] - - logging.debug("Done:\t Predicting") - - whole_duration = time.monotonic() - t_start - logging.debug( - "Info:\t Duration for training and predicting: " + str(whole_duration) + "[s]") - - logging.debug("Start:\t Getting results") - result_analyzer = MonoviewResultAnalyzer(view_name=view_name, - classifier_name=classifier_name, - shape=X.shape, - classifier=classifier, - classification_indices=classification_indices, - k_folds=k_folds, - hps_method=hyper_param_search, - metrics_dict=metrics, - n_iter=n_iter, - class_label_names=labels_names, - pred=full_pred, - directory=directory, - base_file_name=base_file_name, - labels=Y, - database_name=database_name, - nb_cores=nb_cores, - duration=whole_duration) - string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ - confusion_matrix = result_analyzer.analyze() - logging.debug("Done:\t Getting results") - - logging.debug("Start:\t Saving preds") - save_results(string_analysis, output_file_name, full_pred, train_pred, - y_train, images_analysis, y_test, confusion_matrix) - logging.info("Done:\t Saving results") - - view_index = args["view_index"] - return MonoviewResult(view_index, classifier_name, view_name, - metrics_scores, full_pred, cl_kwargs, - classifier, X_train.shape[1], - hyper_param_duration, fit_duration, pred_duration, class_metrics_scores) - - -def init_constants(args, X, classification_indices, labels_names, - name, directory, view_name): - try: - kwargs = args["args"] - except KeyError: - kwargs = args - t_start = time.monotonic() - cl_type = kwargs["classifier_name"] - learning_rate = float(len(classification_indices[0])) / ( - len(classification_indices[0]) + len(classification_indices[1])) - labels_string = "-".join(labels_names) - cl_type_string = cl_type - directory = os.path.join(directory, cl_type_string, view_name,) - base_file_name = cl_type_string + '-' + name + "-" + view_name + "-" - output_file_name = os.path.join(directory, base_file_name) - secure_file_path(output_file_name) - return kwargs, t_start, view_name, cl_type, X, learning_rate, labels_string, output_file_name, directory, base_file_name - - -def init_train_test(X, Y, classification_indices): - train_indices, test_indices = classification_indices - X_train = extract_subset(X, train_indices) - 
X_test = extract_subset(X, test_indices) - y_train = Y[train_indices] - y_test = Y[test_indices] - return X_train, y_train, X_test, y_test - - -def get_hyper_params(classifier_module, search_method, classifier_module_name, - classifier_class_name, X_train, y_train, - random_state, - output_file_name, k_folds, nb_cores, metrics, kwargs, - **hps_kwargs): - if search_method != "None": - logging.debug( - "Start:\t " + search_method + " best settings for " + classifier_module_name) - classifier_hp_search = getattr(hyper_parameter_search, search_method) - estimator = getattr(classifier_module, classifier_class_name)( - random_state=random_state, - **kwargs[classifier_module_name]) - estimator = get_mc_estim(estimator, random_state, - multiview=False, y=y_train) - hps = classifier_hp_search(estimator, scoring=metrics, cv=k_folds, - random_state=random_state, - framework="monoview", n_jobs=nb_cores, - **hps_kwargs) - hps.fit(X_train, y_train, **kwargs[classifier_module_name]) - cl_kwargs = hps.get_best_params() - hps.gen_report(output_file_name) - logging.debug("Done:\t " + search_method + " best settings") - else: - cl_kwargs = kwargs[classifier_module_name] - return cl_kwargs - - -def save_results(string_analysis, output_file_name, full_labels_pred, - y_train_pred, - y_train, images_analysis, y_test, confusion_matrix): # pragma: no cover - logging.info(string_analysis) - output_text_file = open(output_file_name + 'summary.txt', 'w', encoding="utf-8") - output_text_file.write(string_analysis) - output_text_file.close() - np.savetxt(output_file_name+"confusion_matrix.csv", confusion_matrix, - delimiter=', ') - np.savetxt(output_file_name + "full_pred.csv", - full_labels_pred.astype(np.int16), delimiter=",") - np.savetxt(output_file_name + "train_pred.csv", - y_train_pred.astype(np.int16), - delimiter=",") - np.savetxt(output_file_name + "train_labels.csv", y_train.astype(np.int16), - delimiter=",") - np.savetxt(output_file_name + "test_labels.csv", y_test.astype(np.int16), - delimiter=",") - - if images_analysis is not None: - for image_name in images_analysis: - if os.path.isfile(output_file_name + image_name + ".png"): - for i in range(1, 20): - test_file_name = output_file_name + image_name + "-" + str( - i) + ".png" - if not os.path.isfile(test_file_name): - images_analysis[image_name].savefig(test_file_name, - transparent=True) - break - - images_analysis[image_name].savefig( - output_file_name + image_name + '.png', transparent=True) diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py b/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py deleted file mode 100644 index dcecfa6fcbfd9ff82281831f6c17aa55b2acbef2..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview/monoview_utils.py +++ /dev/null @@ -1,231 +0,0 @@ -import pickle -import os -import matplotlib.pyplot as plt -import numpy as np -from matplotlib.ticker import FuncFormatter -from scipy.stats import uniform, randint - -from ..utils.base import BaseClassifier, ResultAnalyser -from ..utils.hyper_parameter_search import CustomRandint, CustomUniform - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -# __date__ = 2016 - 03 - 25 - -def change_label_to_minus(y): - """ - Change the label 0 to minus one - - Parameters - ---------- - y : - - Returns - ------- - label y with -1 instead of 0 - - """ - minus_y = np.copy(y) - minus_y[np.where(y == 0)] = -1 - return minus_y - - 
-def change_label_to_zero(y): - """ - Change the label -1 to 0 - - Parameters - ---------- - y - - Returns - ------- - - """ - zeroed_y = np.copy(y) - zeroed_y[np.where(y == -1)] = 0 - return zeroed_y - - -def compute_possible_combinations(params_dict): - n_possibs = np.ones(len(params_dict)) * np.inf - for value_index, value in enumerate(params_dict.values()): - if type(value) == list: - n_possibs[value_index] = len(value) - elif isinstance(value, CustomRandint): - n_possibs[value_index] = value.get_nb_possibilities() - return n_possibs - - -def gen_test_folds_preds(X_train, y_train, KFolds, estimator): - test_folds_preds = [] - train_index = np.arange(len(y_train)) - folds = KFolds.split(train_index, y_train) - fold_lengths = np.zeros(KFolds.n_splits, dtype=int) - for fold_index, (train_indices, test_indices) in enumerate(folds): - fold_lengths[fold_index] = len(test_indices) - estimator.fit(X_train[train_indices], y_train[train_indices]) - test_folds_preds.append(estimator.predict(X_train[train_indices])) - min_fold_length = fold_lengths.min() - test_folds_preds = np.array( - [test_fold_preds[:min_fold_length] for test_fold_preds in - test_folds_preds]) - return test_folds_preds - - -# class CustomRandint: -# """Used as a distribution returning a integer between low and high-1. -# It can be used with a multiplier agrument to be able to perform more complex generation -# for example 10 e -(randint)""" -# -# def __init__(self, low=0, high=0, multiplier=""): -# self.randint = randint(low, high) -# self.multiplier = multiplier -# -# def rvs(self, random_state=None): -# randinteger = self.randint.rvs(random_state=random_state) -# if self.multiplier == "e-": -# return 10 ** -randinteger -# else: -# return randinteger -# -# def get_nb_possibilities(self): -# return self.randint.b - self.randint.a -# -# -# class CustomUniform: -# """Used as a distribution returning a float between loc and loc + scale.. 
-# It can be used with a multiplier agrument to be able to perform more complex generation -# for example 10 e -(float)""" -# -# def __init__(self, loc=0, state=1, multiplier=""): -# self.uniform = uniform(loc, state) -# self.multiplier = multiplier -# -# def rvs(self, random_state=None): -# unif = self.uniform.rvs(random_state=random_state) -# if self.multiplier == 'e-': -# return 10 ** -unif -# else: -# return unif - - -class BaseMonoviewClassifier(BaseClassifier): - - def get_feature_importance(self, directory, base_file_name, nb_considered_feats=50): - """Used to generate a graph and a pickle dictionary representing - feature importances""" - feature_importances = self.feature_importances_ - sorted_args = np.argsort(-feature_importances) - feature_importances_sorted = feature_importances[sorted_args][ - :nb_considered_feats] - feature_indices_sorted = sorted_args[:nb_considered_feats] - fig, ax = plt.subplots() - x = np.arange(len(feature_indices_sorted)) - formatter = FuncFormatter(percent) - ax.yaxis.set_major_formatter(formatter) - plt.bar(x, feature_importances_sorted) - plt.title("Importance depending on feature") - fig.savefig(os.path.join(directory, base_file_name + "feature_importances.png") - , transparent=True) - plt.close() - features_importances_dict = dict((featureIndex, featureImportance) - for featureIndex, featureImportance in - enumerate(feature_importances) - if featureImportance != 0) - with open(directory + 'feature_importances.pickle', 'wb') as handle: - pickle.dump(features_importances_dict, handle) - interpret_string = "Feature importances : \n" - for featureIndex, featureImportance in zip(feature_indices_sorted, - feature_importances_sorted): - if featureImportance > 0: - interpret_string += "- Feature index : " + str(featureIndex) + \ - ", feature importance : " + str( - featureImportance) + "\n" - return interpret_string - - def get_name_for_fusion(self): - return self.__class__.__name__[:4] - - -def percent(x, pos): - """Used to print percentage of importance on the y axis""" - return '%1.1f %%' % (x * 100) - - -class MonoviewResult(object): - def __init__(self, view_index, classifier_name, view_name, metrics_scores, - full_labels_pred, classifier_config, - classifier, n_features, hps_duration, fit_duration, - pred_duration, class_metric_scores): - self.view_index = view_index - self.classifier_name = classifier_name - self.view_name = view_name - self.metrics_scores = metrics_scores - self.full_labels_pred = full_labels_pred - self.classifier_config = classifier_config - self.clf = classifier - self.n_features = n_features - self.hps_duration = hps_duration - self.fit_duration = fit_duration - self.pred_duration = pred_duration - self.class_metric_scores = class_metric_scores - - def get_classifier_name(self): - return self.classifier_name + "-" + self.view_name - - -def get_accuracy_graph(plotted_data, classifier_name, file_name, - name="Accuracies", bounds=None, bound_name=None, - boosting_bound=None, set="train", zero_to_one=True): # pragma: no cover - if type(name) is not str: - name = " ".join(name.getConfig().strip().split(" ")[:2]) - f, ax = plt.subplots(nrows=1, ncols=1) - if zero_to_one: - ax.set_ylim(bottom=0.0, top=1.0) - ax.set_title(name + " during " + set + " for " + classifier_name) - x = np.arange(len(plotted_data)) - scat = ax.scatter(x, np.array(plotted_data), marker=".") - if bounds: - if boosting_bound: - scat2 = ax.scatter(x, boosting_bound, marker=".") - scat3 = ax.scatter(x, np.array(bounds), marker=".", ) - ax.legend((scat, scat2, 
scat3), - (name, "Boosting bound", bound_name)) - else: - scat2 = ax.scatter(x, np.array(bounds), marker=".", ) - ax.legend((scat, scat2), - (name, bound_name)) - # plt.tight_layout() - else: - ax.legend((scat,), (name,)) - f.savefig(file_name, transparent=True) - plt.close() - - -class MonoviewResultAnalyzer(ResultAnalyser): - - def __init__(self, view_name, classifier_name, shape, classifier, - classification_indices, k_folds, hps_method, metrics_dict, - n_iter, class_label_names, pred, - directory, base_file_name, labels, database_name, nb_cores, duration): - ResultAnalyser.__init__(self, classifier, classification_indices, - k_folds, hps_method, metrics_dict, n_iter, - class_label_names, pred, - directory, base_file_name, labels, - database_name, nb_cores, duration) - self.view_name = view_name - self.classifier_name = classifier_name - self.shape = shape - - def get_base_string(self): - return "Classification on {} for {} with {}.\n\n".format( - self.database_name, self.view_name, self.classifier_name - ) - - def get_view_specific_info(self): - return "\t- View name : {}\t View shape : {}\n".format(self.view_name, - self.shape) \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/__init__.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/__init__.py deleted file mode 100644 index db257abe4c0afa79fa5166cdd037070aecc6a29e..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/__init__.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - -for module in os.listdir(os.path.dirname(os.path.realpath(__file__))): - if module == '__init__.py' or module[-3:] != '.py': - continue - __import__(module[:-3], locals(), globals(), [], 1) -del module -del os - -""" -To be able to add a monoview Classifier to the benchmark, one has to : -Create a .py file named after the classifier -Define a canProbas function returning True or False whether the classifier is able to predict class probabilities -Define a fit function - Input : - DATASET : The data matrix used to fit the classifier - CLASS_LABELS : The labels' array of the training set - NB_CORES : The number of cores the classifier can use to train - kwargs : Any argument specific to the classifier - Output : - classifier : A classifier object, similar to the sk-learn classifier object -Define a ***Search that search hyper parameters for the algorithm. 
Check HP optimization methods to get all the -different functions to provide (returning the parameters in the order of the kwargs dict for the fit function) -Define a getKWARGS function - Input : - KWARGSList : The list of all the arguments as written in the argument parser - Output : - KWARGSDict : a dictionnary of arguments matching the kwargs needed in train -Define a getConfig function that returns a string explaining the algorithm's config using a config dict or list -Add the arguments to configure the classifier in the parser in exec_classif.py -""" diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py deleted file mode 100644 index 88a042ec6d69a94bb5f64619ad5eee5e55f40339..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/adaboost.py +++ /dev/null @@ -1,152 +0,0 @@ -import time -import os - -import numpy as np -from sklearn.ensemble import AdaBoostClassifier -from sklearn.tree import DecisionTreeClassifier - -from .. import metrics -from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier, \ - get_accuracy_graph -from ..utils.base import base_boosting_estimators - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "Adaboost" - - -class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier): - """ - This class implement a Classifier with adaboost algorithm inherit from sklearn - AdaBoostClassifier - - Parameters - ---------- - - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. - - n_estimators : int number of estimators - - base_estimator : - - kwargs : others arguments - - - Attributes - ---------- - param_name : - - classed_params : - - distribs : - - weird_strings : - - plotted_metric : selection of metric to plot - - plotted_metric_name : name of the metric to plot - - step_predictions : - - """ - - def __init__(self, random_state=None, n_estimators=50, - base_estimator=None, base_estimator_config=None, **kwargs): - - base_estimator = BaseMonoviewClassifier.get_base_estimator(self, - base_estimator, - base_estimator_config) - AdaBoostClassifier.__init__(self, - random_state=random_state, - n_estimators=n_estimators, - base_estimator=base_estimator, - algorithm="SAMME" - ) - self.param_names = ["n_estimators", "base_estimator"] - self.classed_params = ["base_estimator"] - self.distribs = [CustomRandint(low=1, high=500), - base_boosting_estimators] - self.weird_strings = {"base_estimator": "class_name"} - self.plotted_metric = metrics.zero_one_loss - self.plotted_metric_name = "zero_one_loss" - self.step_predictions = None - - def fit(self, X, y, sample_weight=None): - """ - Fit adaboost model - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - - y : { array-like, shape (n_samples,) - Target values class labels in classification - - sample_weight : - - Returns - ------- - self : object - Returns self. 
- """ - begin = time.time() - AdaBoostClassifier.fit(self, X, y, sample_weight=sample_weight) - end = time.time() - self.train_time = end - begin - self.train_shape = X.shape - self.base_predictions = np.array( - [estim.predict(X) for estim in self.estimators_]) - self.metrics = np.array([self.plotted_metric.score(pred, y) for pred in - self.staged_predict(X)]) - return self - - def predict(self, X): - """ - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Training vectors, where n_samples is the number of samples - and n_features is the number of features. - For kernel="precomputed", the expected shape of X is - (n_samples, n_samples). - - Returns - ------- - predictions : ndarray of shape (n_samples, ) - The estimated labels. - """ - begin = time.time() - pred = AdaBoostClassifier.predict(self, X) - end = time.time() - self.pred_time = end - begin - self.step_predictions = np.array( - [step_pred for step_pred in self.staged_predict(X)]) - return pred - - def get_interpretation(self, directory, base_file_name, y_test, multi_class=False): # pragma: no cover - interpretString = "" - interpretString += self.get_feature_importance(directory, base_file_name) - interpretString += "\n\n Estimator error | Estimator weight\n" - interpretString += "\n".join( - [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for - error, weight in - zip(self.estimator_errors_, self.estimator_weights_)]) - step_test_metrics = np.array( - [self.plotted_metric.score(y_test, step_pred) for step_pred in - self.step_predictions]) - get_accuracy_graph(step_test_metrics, "Adaboost", - os.path.join(directory, base_file_name +"test_metrics.png"), - self.plotted_metric_name, set="test") - np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"), - step_test_metrics, - delimiter=',') - np.savetxt(os.path.join(directory, base_file_name + "train_metrics.csv"), - self.metrics, delimiter=',') - np.savetxt(os.path.join(directory, base_file_name + "times.csv"), - np.array([self.train_time, self.pred_time]), delimiter=',') - return interpretString diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/additions/SVCClassifier.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/additions/SVCClassifier.py deleted file mode 100644 index 06d6da20b104f2bba4e7efab22429c3b17440f31..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/additions/SVCClassifier.py +++ /dev/null @@ -1,17 +0,0 @@ -from sklearn.svm import SVC - - -class SVCClassifier(SVC): - - def __init__(self, random_state=None, kernel='rbf', C=1.0, degree=3, - **kwargs): - super(SVCClassifier, self).__init__( - C=C, - kernel=kernel, - degree=degree, - probability=True, - max_iter=1000, - random_state=random_state - ) - self.classed_params = [] - self.weird_strings = {} diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/additions/__init__.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/additions/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/decision_tree.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/decision_tree.py deleted file mode 100644 index be9f5e24716345660437eda3eb44476a982af04d..0000000000000000000000000000000000000000 --- 
a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/decision_tree.py +++ /dev/null @@ -1,36 +0,0 @@ -from sklearn.tree import DecisionTreeClassifier - -from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "DecisionTree" - - -class DecisionTree(DecisionTreeClassifier, BaseMonoviewClassifier): - - def __init__(self, random_state=None, max_depth=None, - criterion='gini', splitter='best', **kwargs): - DecisionTreeClassifier.__init__(self, - max_depth=max_depth, - criterion=criterion, - splitter=splitter, - random_state=random_state - ) - self.param_names = ["max_depth", "criterion", "splitter", - 'random_state'] - self.classed_params = [] - self.distribs = [CustomRandint(low=1, high=300), - ["gini", "entropy"], - ["best", "random"], [random_state]] - self.weird_strings = {} - - def get_interpretation(self, directory, base_file_name, y_test, - multiclass=False): - interpretString = "First featrue : \n\t{} <= {}\n".format( - self.tree_.feature[0], - self.tree_.threshold[0]) - interpretString += self.get_feature_importance(directory, base_file_name) - return interpretString diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/gradient_boosting.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/gradient_boosting.py deleted file mode 100644 index 7136370f1c684ead6010e6a9a944da031fdf4779..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/gradient_boosting.py +++ /dev/null @@ -1,96 +0,0 @@ -import time -import os - -import numpy as np -from sklearn.ensemble import GradientBoostingClassifier -from sklearn.tree import DecisionTreeClassifier - -from .. 
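The deleted classifier modules all share the same shape: a module-level `classifier_class_name` and a class that inherits the sklearn estimator together with `BaseMonoviewClassifier`, exposing `param_names`, `classed_params`, `distribs` and `weird_strings` for the random search, as `decision_tree.py` above illustrates. A sketch of what a new wrapper might have looked like, with `ExtraTreesClassifier` as an illustrative choice and plain scipy distributions standing in for the package's `CustomRandint` so the snippet runs on its own:

```python
# Sketch of a new monoview classifier module mirroring the structure of the
# deleted decision_tree.py. ExtraTrees is an illustrative choice; inside the
# package the class would also inherit BaseMonoviewClassifier and use
# CustomRandint from ..monoview.monoview_utils instead of scipy's randint.
from scipy.stats import randint
from sklearn.ensemble import ExtraTreesClassifier

classifier_class_name = "ExtraTrees"


class ExtraTrees(ExtraTreesClassifier):

    def __init__(self, random_state=None, n_estimators=100, max_depth=None,
                 **kwargs):
        ExtraTreesClassifier.__init__(self,
                                      n_estimators=n_estimators,
                                      max_depth=max_depth,
                                      random_state=random_state)
        # Attributes inspected by the platform's hyper-parameter search.
        self.param_names = ["n_estimators", "max_depth", "random_state"]
        self.classed_params = []
        self.distribs = [randint(50, 500), randint(1, 10), [random_state]]
        self.weird_strings = {}
```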
import metrics -from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier, \ - get_accuracy_graph - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "GradientBoosting" - - -class CustomDecisionTreeGB(DecisionTreeClassifier): - def predict(self, X, check_input=True): - y_pred = DecisionTreeClassifier.predict(self, X, - check_input=check_input) - return y_pred.reshape((y_pred.shape[0], 1)).astype(float) - - -class GradientBoosting(GradientBoostingClassifier, BaseMonoviewClassifier): - - def __init__(self, random_state=None, loss="exponential", max_depth=1.0, - n_estimators=100, - init=CustomDecisionTreeGB(max_depth=1), - **kwargs): - GradientBoostingClassifier.__init__(self, - loss=loss, - max_depth=max_depth, - n_estimators=n_estimators, - init=init, - random_state=random_state - ) - self.param_names = ["n_estimators", "max_depth"] - self.classed_params = [] - self.distribs = [CustomRandint(low=50, high=500), - CustomRandint(low=1, high=10),] - self.weird_strings = {} - self.plotted_metric = metrics.zero_one_loss - self.plotted_metric_name = "zero_one_loss" - self.step_predictions = None - - def fit(self, X, y, sample_weight=None, monitor=None): - begin = time.time() - GradientBoostingClassifier.fit(self, X, y, sample_weight=sample_weight) - end = time.time() - self.train_time = end - begin - self.train_shape = X.shape - self.base_predictions = np.array( - [estim[0].predict(X) for estim in self.estimators_]) - self.metrics = np.array( - [self.plotted_metric.score(pred, y) for pred in - self.staged_predict(X)]) - # self.bounds = np.array([np.prod( - # np.sqrt(1 - 4 * np.square(0.5 - self.estimator_errors_[:i + 1]))) for i - # in range(self.estimator_errors_.shape[0])]) - return self - - def predict(self, X): - begin = time.time() - pred = GradientBoostingClassifier.predict(self, X) - end = time.time() - self.pred_time = end - begin - if X.shape != self.train_shape: - self.step_predictions = np.array( - [step_pred for step_pred in self.staged_predict(X)]) - return pred - - def get_interpretation(self, directory, base_file_name, y_test, multi_class=False): - interpretString = "" - if multi_class: - return interpretString - else: - interpretString += self.get_feature_importance(directory, base_file_name) - step_test_metrics = np.array( - [self.plotted_metric.score(y_test, step_pred) for step_pred in - self.step_predictions]) - get_accuracy_graph(step_test_metrics, "AdaboostClassic", - directory + "test_metrics.png", - self.plotted_metric_name, set="test") - get_accuracy_graph(self.metrics, "AdaboostClassic", - directory + "metrics.png", - self.plotted_metric_name) - np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"), step_test_metrics, - delimiter=',') - np.savetxt(os.path.join(directory, base_file_name + "train_metrics.csv"), self.metrics, - delimiter=',') - np.savetxt(os.path.join(directory, base_file_name + "times.csv"), - np.array([self.train_time, self.pred_time]), - delimiter=',') - return interpretString diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/knn.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/knn.py deleted file mode 100644 index f3631bf6b7f281b65b0318028cbbebab89604a10..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/knn.py +++ /dev/null @@ -1,42 +0,0 @@ -from sklearn.neighbors import KNeighborsClassifier - -from 
..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "KNN" - - -class KNN(KNeighborsClassifier, BaseMonoviewClassifier): - """ - Implement extention of KNeighborsClassifier of sklearn - for the usage of the multiview_platform. - - Parameters - ---------- - random_state - n_neighbors - weights - algorithm - p - kwargs - """ - - def __init__(self, random_state=None, n_neighbors=5, - weights='uniform', algorithm='auto', p=2, **kwargs): - KNeighborsClassifier.__init__(self, - n_neighbors=n_neighbors, - weights=weights, - algorithm=algorithm, - p=p - ) - self.param_names = ["n_neighbors", "weights", "algorithm", "p", - "random_state", ] - self.classed_params = [] - self.distribs = [CustomRandint(low=1, high=10), ["uniform", "distance"], - ["auto", "ball_tree", "kd_tree", "brute"], [1, 2], - [random_state]] - self.weird_strings = {} - self.random_state = random_state diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/lasso.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/lasso.py deleted file mode 100644 index c91d2355759867e18375b38d500b79439e23adc2..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/lasso.py +++ /dev/null @@ -1,74 +0,0 @@ -import numpy as np -from sklearn.linear_model import Lasso as LassoSK - -from ..monoview.monoview_utils import CustomRandint, CustomUniform, \ - BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "Lasso" - - -class Lasso(LassoSK, BaseMonoviewClassifier): - """ - - Parameters - ---------- - random_state : - - alpha : float, optional - Constant that multiplies the L1 term. Defaults to 1.0. - ``alpha = 0`` is equivalent to an ordinary least square, solved - by the :class:`LinearRegression` object. For numerical - reasons, using ``alpha = 0`` is with the Lasso object is - not advised - and you should prefer the LinearRegression object. (default( : 10) - - max_iter : int The maximum number of iterations (default : 10) - - warm_start : bool, optional - When set to True, reuse the solution of the previous call to fit as - initialization, otherwise, just erase the previous solution. 
- - kwargs : others arguments - - Attributes - ---------- - param_name : - - classed_params : - - distribs : - - weird_strings : - - """ - - def __init__(self, random_state=None, alpha=1.0, - max_iter=10, warm_start=False, **kwargs): - LassoSK.__init__(self, - alpha=alpha, - max_iter=max_iter, - warm_start=warm_start, - random_state=random_state - ) - self.param_names = ["max_iter", "alpha", "random_state"] - self.classed_params = [] - self.distribs = [CustomRandint(low=1, high=300), - CustomUniform(), [random_state]] - self.weird_strings = {} - - def fit(self, X, y, check_input=True): - neg_y = np.copy(y) - neg_y[np.where(neg_y == 0)] = -1 - LassoSK.fit(self, X, neg_y) - # self.feature_importances_ = self.coef_/np.sum(self.coef_) - return self - - def predict(self, X): - prediction = LassoSK.predict(self, X) - signed = np.sign(prediction) - signed[np.where(signed == -1)] = 0 - return signed \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/random_forest.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/random_forest.py deleted file mode 100644 index ece278a56e54ab88ffc078e25facee452ac54217..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/random_forest.py +++ /dev/null @@ -1,82 +0,0 @@ -from sklearn.ensemble import RandomForestClassifier - -from ..monoview.monoview_utils import CustomRandint, BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "RandomForest" - - -class RandomForest(RandomForestClassifier, BaseMonoviewClassifier): - """RandomForest Classifier Class - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. 
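The deleted `lasso.py` turns sklearn's regression `Lasso` into a binary classifier by remapping the labels from {0, 1} to {-1, +1} before fitting and thresholding the real-valued predictions afterwards. A standalone sketch of that trick on placeholder data, kept separate from the package code:

```python
# The remap-and-threshold trick used by the deleted lasso.py wrapper.
# The dataset and alpha value are placeholders for illustration only.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import Lasso

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
reg = Lasso(alpha=0.01, max_iter=1000)
reg.fit(X, np.where(y == 0, -1, 1))   # train on labels remapped to -1 / +1
pred = np.sign(reg.predict(X))        # real-valued output -> {-1, 0, +1}
pred[pred == -1] = 0                  # back to the platform's {0, 1} labels
```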
- - n_estimators : int (default : 10) number of estimators - - max_depth : int , optional (default : None) maximum of depth - - criterion : criteria (default : 'gini') - - kwargs : others arguments - - - Attributes - ---------- - param_names : - - distribs : - - classed_params : - - weird_strings : - - """ - - def __init__(self, random_state=None, n_estimators=10, - max_depth=None, criterion='gini', **kwargs): - """ - - Parameters - ---------- - random_state - n_estimators - max_depth - criterion - kwargs - """ - RandomForestClassifier.__init__(self, - n_estimators=n_estimators, - max_depth=max_depth, - criterion=criterion, - random_state=random_state - ) - self.param_names = ["n_estimators", "max_depth", "criterion", - "random_state"] - self.classed_params = [] - self.distribs = [CustomRandint(low=1, high=300), - CustomRandint(low=1, high=10), - ["gini", "entropy"], [random_state]] - self.weird_strings = {} - - def get_interpretation(self, directory, base_file_name, y_test, multiclass=False): - """ - - Parameters - ---------- - directory - y_test - - Returns - ------- - string for interpretation interpret_string - """ - interpret_string = "" - interpret_string += self.get_feature_importance(directory, base_file_name) - return interpret_string diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py deleted file mode 100644 index 09c345bae7993576dcc595f10e5accc56f480a83..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/sgd.py +++ /dev/null @@ -1,56 +0,0 @@ -from sklearn.linear_model import SGDClassifier - -from ..monoview.monoview_utils import CustomUniform, BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "SGD" - - -class SGD(SGDClassifier, BaseMonoviewClassifier): - """ - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. 
- - loss : str , (default = "hinge") - penalty : str, (default = "l2") - - alpha : float, (default = 0.0001) - - kwargs : other arguments - - - Attributes - ---------- - param_names : - - distribs : - - classed_params : - - weird_strings : - - """ - - def __init__(self, random_state=None, loss='hinge', - penalty='l2', alpha=0.0001, max_iter=5, tol=None, **kwargs): - SGDClassifier.__init__(self, - loss=loss, - penalty=penalty, - alpha=alpha, - max_iter=5, - tol=None, - random_state=random_state - ) - self.param_names = ["loss", "penalty", "alpha", "random_state"] - self.classed_params = [] - self.distribs = [['log', 'modified_huber'], - ["l1", "l2", "elasticnet"], - CustomUniform(loc=0, state=1), [random_state]] - self.weird_strings = {} diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_linear.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_linear.py deleted file mode 100644 index e5d293a624d2df1b21b8b4702b287c563ecf0c4e..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_linear.py +++ /dev/null @@ -1,36 +0,0 @@ -from multiview_platform.mono_multi_view_classifiers.monoview_classifiers.additions.SVCClassifier import \ - SVCClassifier -from ..monoview.monoview_utils import CustomUniform, BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "SVMLinear" - - -class SVMLinear(SVCClassifier, BaseMonoviewClassifier): - """SVMLinear - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. - - - C : float, optional (default=1.0) - Penalty parameter C of the error term. - - kwargs : others arguments - - """ - - def __init__(self, random_state=None, C=1.0, **kwargs): - SVCClassifier.__init__(self, - C=C, - kernel='linear', - random_state=random_state - ) - self.param_names = ["C", "random_state"] - self.distribs = [CustomUniform(loc=0, state=1), [random_state]] diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_poly.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_poly.py deleted file mode 100644 index d93bdcc352499390b749d42f95f5d2d799b69317..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_poly.py +++ /dev/null @@ -1,50 +0,0 @@ -from multiview_platform.mono_multi_view_classifiers.monoview_classifiers.additions.SVCClassifier import \ - SVCClassifier -from ..monoview.monoview_utils import CustomUniform, CustomRandint, \ - BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "SVMPoly" - - -class SVMPoly(SVCClassifier, BaseMonoviewClassifier): - """ - Class of SVMPoly for SVC Classifier - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. - - - C : float, optional (default=1.0) - Penalty parameter C of the error term. 
- - - degree : - - kwargs : others arguments - - - Attributes - ---------- - - param_names : list of parameters names - - distribs : list of random_state distribution - """ - - def __init__(self, random_state=None, C=1.0, degree=3, **kwargs): - SVCClassifier.__init__(self, - C=C, - kernel='poly', - degree=degree, - random_state=random_state - ) - self.param_names = ["C", "degree", "random_state"] - self.distribs = [CustomUniform(loc=0, state=1), - CustomRandint(low=2, high=30), [random_state]] diff --git a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_rbf.py b/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_rbf.py deleted file mode 100644 index 1af02e4d6e7fffbd35c0b2d0d554006a46b55752..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/monoview_classifiers/svm_rbf.py +++ /dev/null @@ -1,41 +0,0 @@ -from multiview_platform.mono_multi_view_classifiers.monoview_classifiers.additions.SVCClassifier import \ - SVCClassifier -from ..monoview.monoview_utils import CustomUniform, BaseMonoviewClassifier - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - -classifier_class_name = "SVMRBF" - - -class SVMRBF(SVCClassifier, BaseMonoviewClassifier): - """ - class SVMRBF for classifier SVCC - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. - - C : - - kwargs : others arguments - - Attributes - ---------- - - param_names : list of parameters names - - distribs : list of random_state distribution - """ - - def __init__(self, random_state=None, C=1.0, **kwargs): - SVCClassifier.__init__(self, - C=C, - kernel='rbf', - random_state=random_state - ) - self.param_names = ["C", "random_state"] - self.distribs = [CustomUniform(loc=0, state=1), [random_state]] diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview/__init__.py deleted file mode 100644 index a3ab07f2a12aedb7ffa60628a0103c838e9af55b..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# # from Code.mono_multi_view_classifiers.multiview_classifiers import fusion, Mumbo -# from . import ExecMultiview -# # from . import Mumbo -# -# __all__ = ['fusion', 'Mumbo'] diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py deleted file mode 100644 index c89034cf494399fc9cfa2561531192f79c93c2bd..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py +++ /dev/null @@ -1,356 +0,0 @@ -import logging -import os -import os.path -import time - -import h5py -import numpy as np - -from .multiview_utils import MultiviewResult, MultiviewResultAnalyzer -from .. 
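`svm_linear.py`, `svm_poly.py` and `svm_rbf.py` above are thin subclasses of the deleted `SVCClassifier` that only pin the kernel and its specific hyper-parameters. A hypothetical sigmoid-kernel variant in the same pattern, subclassing sklearn's `SVC` directly so the sketch is self-contained (the originals went through the package wrapper, which set `probability=True` and `max_iter=1000`):

```python
# Hypothetical sigmoid-kernel module in the same pattern as the deleted
# svm_* modules; class and module names are illustrative.
from scipy.stats import uniform
from sklearn.svm import SVC

classifier_class_name = "SVMSigmoid"


class SVMSigmoid(SVC):

    def __init__(self, random_state=None, C=1.0, **kwargs):
        SVC.__init__(self, C=C, kernel="sigmoid", probability=True,
                     max_iter=1000, random_state=random_state)
        # Attributes inspected by the platform's hyper-parameter search.
        self.param_names = ["C", "random_state"]
        self.classed_params = []
        self.distribs = [uniform(loc=0, scale=1), [random_state]]
        self.weird_strings = {}
```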
import multiview_classifiers -from ..utils import hyper_parameter_search -from ..utils.multiclass import get_mc_estim -from ..utils.organization import secure_file_path - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def init_constants(kwargs, classification_indices, metrics, - name, nb_cores, k_folds, - dataset_var, directory): - """ - Used to init the constants - Parameters - ---------- - kwargs : - - classification_indices : - - metrics : - - name : - - nb_cores : nint number of cares to execute - - k_folds : - - dataset_var : {array-like} shape (n_samples, n_features) - dataset variable - - Returns - ------- - tuple of (classifier_name, t_start, views_indices, - classifier_config, views, learning_rate) - """ - views = kwargs["view_names"] - views_indices = kwargs["view_indices"] - if metrics is None: - metrics = {"f1_score*":{}} - classifier_name = kwargs["classifier_name"] - classifier_config = kwargs[classifier_name] - learning_rate = len(classification_indices[0]) / float( - (len(classification_indices[0]) + len(classification_indices[1]))) - t_start = time.time() - logging.info("Info\t: Classification - Database : " + str( - name) + " ; Views : " + ", ".join(views) + - " ; Algorithm : " + classifier_name + " ; Cores : " + str( - nb_cores) + ", Train ratio : " + str(learning_rate) + - ", CV on " + str(k_folds.n_splits) + " folds") - - for view_index, view_name in zip(views_indices, views): - logging.info("Info:\t Shape of " + str(view_name) + " :" + str( - dataset_var.get_shape())) - labels = dataset_var.get_labels() - directory = os.path.join(directory, classifier_name) - base_file_name = classifier_name+"-"+dataset_var.get_name()+"-" - output_file_name = os.path.join(directory, base_file_name) - return classifier_name, t_start, views_indices, \ - classifier_config, views, learning_rate, labels, output_file_name,\ - directory, base_file_name, metrics - - -def save_results(string_analysis, images_analysis, output_file_name, - confusion_matrix): # pragma: no cover - """ - Save results in derectory - - Parameters - ---------- - - classifier : classifier class - - labels_dictionary : dict dictionary of labels - - string_analysis : str - - views : - - classifier_module : module of the classifier - - classification_kargs : - - directory : str directory - - learning_rate : - - name : - - images_analysis : - - """ - logging.info(string_analysis) - secure_file_path(output_file_name) - output_text_file = open(output_file_name + 'summary.txt', 'w', encoding="utf-8") - output_text_file.write(string_analysis) - output_text_file.close() - np.savetxt(output_file_name+"confusion_matrix.csv", confusion_matrix, - delimiter=',') - - if images_analysis is not None: - for image_name in images_analysis.keys(): - if os.path.isfile(output_file_name + image_name + ".png"): - for i in range(1, 20): - test_file_name = output_file_name + image_name + "-" + str( - i) + ".png" - if not os.path.isfile(test_file_name): - images_analysis[image_name].savefig(test_file_name, - transparent=True) - break - - images_analysis[image_name].savefig( - output_file_name + image_name + '.png', transparent=True) - - -def exec_multiview_multicore(directory, core_index, name, learning_rate, - nb_folds, - database_type, path, labels_dictionary, - random_state, labels, - hyper_param_search=False, nb_cores=1, metrics=None, - n_iter=30, **arguments): # pragma: no cover - """ - execute multiview process on - - Parameters - ---------- - - directory : indicate the 
directory - - core_index : - - name : name of the data file to perform - - learning_rate : - - nb_folds : - - database_type : - - path : path to the data name - - labels_dictionary - - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. - - labels : - - hyper_param_search : - - nb_cores : in number of cores - - metrics : metric to use - - n_iter : int number of iterations - - arguments : others arguments - - Returns - ------- - exec_multiview on directory, dataset_var, name, learning_rate, nb_folds, 1, - database_type, path, labels_dictionary, - random_state, labels, - hyper_param_search=hyper_param_search, metrics=metrics, - n_iter=n_iter, **arguments - """ - """Used to load an HDF5 dataset_var for each parallel job and execute multiview classification""" - dataset_var = h5py.File(path + name + str(core_index) + ".hdf5", "r") - return exec_multiview(directory, dataset_var, name, learning_rate, nb_folds, - 1, - database_type, path, labels_dictionary, - random_state, labels, - hps_method=hyper_param_search, - metrics=metrics, - n_iter=n_iter, **arguments) - - -def exec_multiview(directory, dataset_var, name, classification_indices, - k_folds, - nb_cores, database_type, path, - labels_dictionary, random_state, labels, - hps_method="None", hps_kwargs={}, metrics=None, - n_iter=30, **kwargs): - """Used to execute multiview classification and result analysis - - Parameters - ---------- - - directory : indicate the directory - - - dataset_var : - - name - - classification_indices - - k_folds - - nb_cores - - database_type - - path - - labels_dictionary : dict dictionary of labels - - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. 
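Side note on the `hps_method` machinery documented here: the candidate configurations it evaluates are drawn from exactly the `param_names`/`distribs` pairs shown in the monoview sketch above, either by calling `.rvs()` on a distribution or by picking from a plain list of allowed values (this mirrors `ConfigGenerator.rvs` further down in this diff). A small runnable illustration of one draw:

```python
import numpy as np
from scipy.stats import randint, uniform

rng = np.random.RandomState(42)
param_names = ["C", "degree", "random_state"]
distribs = [uniform(loc=0, scale=1), randint(low=2, high=30), [42]]

candidate = {}
for name, distrib in zip(param_names, distribs):
    if hasattr(distrib, "rvs"):
        # Continuous or integer distribution: sample it.
        candidate[name] = distrib.rvs(random_state=rng)
    else:
        # Plain list of allowed values: pick one uniformly.
        candidate[name] = distrib[rng.randint(len(distrib))]
print(candidate)  # one sampled configuration dict
```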
- - labels - - hps_method - - metrics - - n_iter : int number of iterations - - kwargs - - Returns - ------- - - ``MultiviewResult`` - """ - - logging.debug("Start:\t Initialize constants") - cl_type, \ - t_start, \ - views_indices, \ - classifier_config, \ - views, \ - learning_rate, \ - labels, \ - output_file_name,\ - directory,\ - base_file_name, \ - metrics = init_constants(kwargs, classification_indices, metrics, name, - nb_cores, k_folds, dataset_var, directory) - logging.debug("Done:\t Initialize constants") - - extraction_time = time.time() - t_start - logging.info("Info:\t Extraction duration " + str(extraction_time) + "s") - - logging.debug("Start:\t Getting train/test split") - learning_indices, validation_indices = classification_indices - logging.debug("Done:\t Getting train/test split") - - logging.debug("Start:\t Getting classifiers modules") - # classifierPackage = getattr(multiview_classifiers, - # CL_type) # Permet d'appeler un module avec une string - classifier_module = getattr(multiview_classifiers, cl_type) - classifier_name = classifier_module.classifier_class_name - # classifierClass = getattr(classifierModule, CL_type + "Class") - logging.debug("Done:\t Getting classifiers modules") - - logging.debug("Start:\t Optimizing hyperparameters") - hps_beg = time.monotonic() - if hps_method != "None": - hps_method_class = getattr(hyper_parameter_search, hps_method) - estimator = getattr(classifier_module, classifier_name)( - random_state=random_state, - **classifier_config) - estimator = get_mc_estim(estimator, random_state, - multiview=True, - y=dataset_var.get_labels()[learning_indices]) - hps = hps_method_class(estimator, scoring=metrics, cv=k_folds, - random_state=random_state, framework="multiview", - n_jobs=nb_cores, - learning_indices=learning_indices, - view_indices=views_indices, **hps_kwargs) - hps.fit(dataset_var, dataset_var.get_labels(), ) - classifier_config = hps.get_best_params() - hps.gen_report(output_file_name) - # classifier_config = hyper_parameter_search.search_best_settings( - # dataset_var, dataset_var.get_labels(), classifier_module, - # classifier_name, - # metrics[0], learning_indices, k_folds, random_state, - # output_file_name, nb_cores=nb_cores, views_indices=views_indices, - # searching_tool=hps_method, n_iter=n_iter, - # classifier_config=classifier_config) - hps_duration = time.monotonic() - hps_beg - classifier = get_mc_estim( - getattr(classifier_module, classifier_name)(random_state=random_state, - **classifier_config), - random_state, multiview=True, - y=dataset_var.get_labels()) - logging.debug("Done:\t Optimizing hyperparameters") - logging.debug("Start:\t Fitting classifier") - fit_beg = time.monotonic() - classifier.fit(dataset_var, dataset_var.get_labels(), - train_indices=learning_indices, - view_indices=views_indices) - fit_duration = time.monotonic() - fit_beg - logging.debug("Done:\t Fitting classifier") - - logging.debug("Start:\t Predicting") - train_pred = classifier.predict(dataset_var, - example_indices=learning_indices, - view_indices=views_indices) - pred_beg = time.monotonic() - test_pred = classifier.predict(dataset_var, - example_indices=validation_indices, - view_indices=views_indices) - pred_duration = time.monotonic() - pred_beg - full_pred = np.zeros(dataset_var.get_labels().shape, dtype=int) - 100 - full_pred[learning_indices] = train_pred - full_pred[validation_indices] = test_pred - logging.info("Done:\t Pertidcting") - - whole_duration = time.time() - t_start - logging.info( - "Info:\t Classification duration 
" + str(extraction_time) + "s") - - # TODO: get better cltype - - logging.info("Start:\t Result Analysis for " + cl_type) - times = (extraction_time, whole_duration) - result_analyzer = MultiviewResultAnalyzer(view_names=views, - classifier=classifier, - classification_indices=classification_indices, - k_folds=k_folds, - hps_method=hps_method, - metrics_dict=metrics, - n_iter=n_iter, - class_label_names=list(labels_dictionary.values()), - pred=full_pred, - directory=directory, - base_file_name=base_file_name, - labels=labels, - database_name=dataset_var.get_name(), - nb_cores=nb_cores, - duration=whole_duration) - string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ - confusion_matrix = result_analyzer.analyze() - logging.info("Done:\t Result Analysis for " + cl_type) - - logging.debug("Start:\t Saving preds") - save_results(string_analysis, images_analysis, output_file_name, confusion_matrix) - logging.debug("Start:\t Saving preds") - - return MultiviewResult(cl_type, classifier_config, metrics_scores, - full_pred, hps_duration, fit_duration, - pred_duration, class_metrics_scores) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py deleted file mode 100644 index 9ad93b6c55774f57f89028560bd8f82de9e801d3..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py +++ /dev/null @@ -1,197 +0,0 @@ -from abc import abstractmethod - -import numpy as np - -from .. import monoview_classifiers -from ..utils.base import BaseClassifier, ResultAnalyser -from ..utils.dataset import RAMDataset, get_examples_views_indices - - -# class FakeEstimator(): -# -# def predict(self, X, example_indices=None, view_indices=None): -# return np.zeros(example_indices.shape[0]) - - -class BaseMultiviewClassifier(BaseClassifier): - """ - BaseMultiviewClassifier base of Multiview classifiers - - Parameters - ---------- - random_state : int seed, RandomState instance, or None (default=None) - The seed of the pseudo random number multiview_generator to use when - shuffling the data. 
- """ - - def __init__(self, random_state): - - self.random_state = random_state - self.short_name = self.__module__.split(".")[-1] - self.weird_strings = {} - self.used_views = None - - @abstractmethod - def fit(self, X, y, train_indices=None, view_indices=None): # pragma: no cover - pass - - @abstractmethod - def predict(self, X, example_indices=None, view_indices=None): # pragma: no cover - pass - - def _check_views(self, view_indices): # pragma: no cover - if self.used_views is not None and not np.array_equal(np.sort(self.used_views), np.sort(view_indices)): - raise ValueError('Used {} views to fit, and trying to predict on {}'.format(self.used_views, view_indices)) - - # def to_str(self, param_name): - # if param_name in self.weird_strings: - # string = "" - # if "class_name" in self.weird_strings[param_name]: - # string += self.get_params()[param_name].__class__.__name__ - # if "config" in self.weird_strings[param_name]: - # string += "( with " + self.get_params()[ - # param_name].params_to_string() + ")" - # else: - # string += self.weird_strings[param_name]( - # self.get_params()[param_name]) - # return string - # else: - # return str(self.get_params()[param_name]) - - def accepts_multi_class(self, random_state, n_samples=10, dim=2, - n_classes=3, n_views=2): - if int(n_samples / n_classes) < 1: - raise ValueError( - "n_samples ({}) / n_classes ({}) must be over 1".format( - n_samples, - n_classes)) - fake_mc_X = RAMDataset( - views=[random_state.randint(low=0, high=101, - size=(n_samples, dim)) - for i in range(n_views)], - labels=[class_index - for _ in range(int(n_samples / n_classes)) - for class_index in range(n_classes)], - are_sparse=False, - name="mc_dset", - labels_names=[str(class_index) for class_index in range(n_classes)], - view_names=["V0", "V1"], - ) - - fake_mc_y = [class_index - for _ in range(int(n_samples / n_classes)) - for class_index in range(n_classes)] - fake_mc_y += [0 for _ in range(n_samples % n_classes)] - fake_mc_y = np.asarray(fake_mc_y) - try: - self.fit(fake_mc_X, fake_mc_y) - return True - except ValueError: - return False - - -class ConfigGenerator(): - - def __init__(self, classifier_names): - self.distribs = {} - for classifier_name in classifier_names: - classifier_class = get_monoview_classifier(classifier_name) - self.distribs[classifier_name] = dict((param_name, param_distrib) - for param_name, param_distrib - in - zip( - classifier_class().param_names, - classifier_class().distribs) - if - param_name != "random_state") - - def rvs(self, random_state=None): - config_sample = {} - for classifier_name, classifier_config in self.distribs.items(): - config_sample[classifier_name] = {} - for param_name, param_distrib in classifier_config.items(): - if hasattr(param_distrib, "rvs"): - config_sample[classifier_name][ - param_name] = param_distrib.rvs( - random_state=random_state) - else: - config_sample[classifier_name][ - param_name] = param_distrib[ - random_state.randint(len(param_distrib))] - return config_sample - - -def get_available_monoview_classifiers(need_probas=False): - available_classifiers = [module_name - for module_name in dir(monoview_classifiers) - if not ( - module_name.startswith("__") or module_name == "additions")] - if need_probas: - proba_classifiers = [] - for module_name in available_classifiers: - module = getattr(monoview_classifiers, module_name) - classifier_class = getattr(module, module.classifier_class_name)() - proba_prediction = getattr(classifier_class, "predict_proba", None) - if callable(proba_prediction): - 
proba_classifiers.append(module_name) - available_classifiers = proba_classifiers - return available_classifiers - - -def get_monoview_classifier(classifier_name, multiclass=False): - classifier_module = getattr(monoview_classifiers, classifier_name) - classifier_class = getattr(classifier_module, - classifier_module.classifier_class_name) - return classifier_class - - -from .. import multiview_classifiers - - -class MultiviewResult(object): - def __init__(self, classifier_name, classifier_config, - metrics_scores, full_labels, hps_duration, fit_duration, - pred_duration, class_metric_scores): - self.classifier_name = classifier_name - self.classifier_config = classifier_config - self.metrics_scores = metrics_scores - self.full_labels_pred = full_labels - self.hps_duration = hps_duration - self.fit_duration = fit_duration - self.pred_duration = pred_duration - self.class_metric_scores = class_metric_scores - - def get_classifier_name(self): - try: - multiview_classifier_module = getattr(multiview_classifiers, - self.classifier_name) - multiview_classifier = getattr(multiview_classifier_module, - multiview_classifier_module.classifier_class_name)( - 42, **self.classifier_config) - return multiview_classifier.short_name - except: - return self.classifier_name - - -class MultiviewResultAnalyzer(ResultAnalyser): - - def __init__(self, view_names, classifier, classification_indices, k_folds, - hps_method, metrics_dict, n_iter, class_label_names, - pred, directory, base_file_name, labels, - database_name, nb_cores, duration): - if hps_method.endswith("equiv"): - n_iter = n_iter*len(view_names) - ResultAnalyser.__init__(self, classifier, classification_indices, k_folds, - hps_method, metrics_dict, n_iter, class_label_names, - pred, directory, - base_file_name, labels, database_name, - nb_cores, duration) - self.classifier_name = classifier.short_name - self.view_names = view_names - - def get_base_string(self, ): - return "Multiview classification on {} with {}\n\n".format(self.database_name, - self.classifier_name) - - def get_view_specific_info(self): - return "\t- Views : " + ', '.join(self.view_names) + "\n" \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/profile b/multiview_platform/mono_multi_view_classifiers/multiview/profile deleted file mode 100644 index 40a016510edec99a8c0e78e9ba4bf248d41b8c62..0000000000000000000000000000000000000000 Binary files a/multiview_platform/mono_multi_view_classifiers/multiview/profile and /dev/null differ diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/__init__.py deleted file mode 100644 index 6e242133fa45a01a2506f423a543c742390259be..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -import os - -for module in os.listdir(os.path.dirname(os.path.realpath(__file__))): - if module == '__init__.py' or module[ - -4:] == '.pyc' or module == '__pycache__' or module[ - -3:] != '.py': - continue - __import__(module[:-3], locals(), globals(), [], 1) -del module -del os diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/__init__.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git 
a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/diversity_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/diversity_utils.py deleted file mode 100644 index a49845191d950fa26026d7d5945ba5853275f199..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/diversity_utils.py +++ /dev/null @@ -1,200 +0,0 @@ -import itertools -import math - -import numpy as np - -from .fusion_utils import BaseFusionClassifier -from ...multiview.multiview_utils import ConfigGenerator, \ - get_available_monoview_classifiers, \ - BaseMultiviewClassifier -from ...utils.dataset import get_examples_views_indices - - -class DiversityFusionClassifier(BaseMultiviewClassifier, - BaseFusionClassifier): - """This is the base class for all the diversity fusion based classifiers.""" - - def __init__(self, random_state=None, classifier_names=None, - monoview_estimators=None, classifier_configs=None): - """Used to init the instances""" - BaseMultiviewClassifier.__init__(self, random_state) - if classifier_names is None: - classifier_names = get_available_monoview_classifiers() - self.classifier_names = classifier_names - self.param_names = ["classifier_configs"] - self.distribs = [ConfigGenerator(get_available_monoview_classifiers())] - self.monoview_estimators = monoview_estimators - self.classifier_configs = classifier_configs - - def fit(self, X, y, train_indices=None, view_indices=None): - train_indices, view_indices = get_examples_views_indices(X, - train_indices, - view_indices) - self.used_views = view_indices - # TODO : Finer analysis, may support a bit of mutliclass - if np.unique(y[train_indices]).shape[0] > 2: - raise ValueError( - "Multiclass not supported, classes used : {}".format( - np.unique(y[train_indices]))) - if self.monoview_estimators is None: - self.monoview_estimators = [] - for classifier_idx, classifier_name in enumerate( - self.classifier_names): - self.monoview_estimators.append([]) - for idx, view_idx in enumerate(view_indices): - estimator = self.init_monoview_estimator(classifier_name, - self.classifier_configs) - estimator.fit(X.get_v(view_idx, train_indices), - y[train_indices]) - self.monoview_estimators[classifier_idx].append(estimator) - else: - pass # TODO - self.choose_combination(X, y, train_indices, view_indices) - return self - - def predict(self, X, example_indices=None, view_indices=None): - """Just a weighted majority vote""" - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - self._check_views(view_indices) - nb_class = X.get_nb_class() - if nb_class > 2: - nb_class = 3 - votes = np.zeros((len(example_indices), nb_class), dtype=float) - monoview_predictions = [ - monoview_estimator.predict(X.get_v(view_idx, example_indices)) - for view_idx, monoview_estimator - in zip(view_indices, self.monoview_estimators)] - for idx, example_index in enumerate(example_indices): - for monoview_estimator_index, monoview_prediciton in enumerate( - monoview_predictions): - if int(monoview_prediciton[idx]) == -100: - votes[idx, 2] += 1 - else: - votes[idx, int(monoview_prediciton[idx])] += 1 - predicted_labels = np.argmax(votes, axis=1) - return predicted_labels - - def get_classifiers_decisions(self, X, view_indices, examples_indices): - classifiers_decisions = np.zeros((len(self.monoview_estimators), - len(view_indices), - len(examples_indices))) - for estimator_idx, estimator in 
enumerate(self.monoview_estimators): - for idx, view_index in enumerate(view_indices): - classifiers_decisions[estimator_idx, idx, :] = estimator[ - idx].predict(X.get_v(view_index, examples_indices)) - return classifiers_decisions - - def init_combinations(self, X, example_indices, view_indices): - classifiers_decisions = self.get_classifiers_decisions(X, view_indices, - example_indices) - nb_classifiers, nb_views, n_examples = classifiers_decisions.shape - combinations = itertools.combinations_with_replacement( - range(nb_classifiers), - nb_views) - nb_combinations = int( - math.factorial(nb_classifiers + nb_views - 1) / math.factorial( - nb_views) / math.factorial( - nb_classifiers - 1)) - div_measure = np.zeros(nb_combinations) - combis = np.zeros((nb_combinations, nb_views), dtype=int) - return combinations, combis, div_measure, classifiers_decisions, nb_views - - -class GlobalDiversityFusionClassifier(DiversityFusionClassifier): - - def choose_combination(self, X, y, examples_indices, view_indices): - combinations, combis, div_measure, classifiers_decisions, nb_views = self.init_combinations( - X, examples_indices, view_indices) - for combinationsIndex, combination in enumerate(combinations): - combis[combinationsIndex] = combination - div_measure[combinationsIndex] = self.diversity_measure( - classifiers_decisions, - combination, - y[examples_indices]) - best_combi_index = np.argmax(div_measure) - best_combination = combis[best_combi_index] - self.monoview_estimators = [ - self.monoview_estimators[classifier_index][view_index] - for view_index, classifier_index - in enumerate(best_combination)] - - -class CoupleDiversityFusionClassifier(DiversityFusionClassifier): - - def choose_combination(self, X, y, examples_indices, view_indices): - combinations, combis, div_measure, classifiers_decisions, nb_views = self.init_combinations( - X, examples_indices, view_indices) - for combinations_index, combination in enumerate(combinations): - combis[combinations_index] = combination - combi_with_view = [(viewIndex, combiIndex) for viewIndex, combiIndex - in - enumerate(combination)] - binomes = itertools.combinations(combi_with_view, 2) - nb_binomes = int( - math.factorial(nb_views) / 2 / math.factorial(nb_views - 2)) - couple_diversities = np.zeros(nb_binomes) - for binome_index, binome in enumerate(binomes): - (view_index_1, classifier_index_1), ( - view_index_2, classifier_index_2) = binome - couple_diversity = np.mean( - self.diversity_measure( - classifiers_decisions[classifier_index_1, view_index_1], - classifiers_decisions[classifier_index_2, view_index_2], - y[examples_indices]) - ) - couple_diversities[binome_index] = couple_diversity - div_measure[combinations_index] = np.mean(couple_diversities) - best_combi_index = np.argmax(div_measure) - best_combination = combis[best_combi_index] - self.monoview_estimators = [ - self.monoview_estimators[classifier_index][view_index] - for view_index, classifier_index - in enumerate(best_combination)] - -# -# def CQ_div_measure(classifiersNames, classifiersDecisions, measurement, -# foldsGroudTruth): -# """ -# This function is used to measure a pseudo-CQ measurement based on the minCq algorithm. -# It's a mix between couple_div_measure and global_div_measure that uses multiple measurements. 
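For reference, the search space that both `choose_combination` implementations iterate over is a multiset choice: one monoview classifier index per view, drawn with replacement, and its size matches the closed-form factorial expression computed in `init_combinations`. A quick runnable check:

```python
import itertools
import math

nb_classifiers, nb_views = 4, 3
combinations = list(itertools.combinations_with_replacement(
    range(nb_classifiers), nb_views))
# Closed form used in init_combinations: (c + v - 1)! / v! / (c - 1)!
nb_combinations = math.factorial(nb_classifiers + nb_views - 1) \
    // math.factorial(nb_views) // math.factorial(nb_classifiers - 1)
assert len(combinations) == nb_combinations == 20
```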
-# """ -# nbViews, nbClassifiers, nbFolds, foldsLen = classifiersDecisions.shape -# combinations = itertools.combinations_with_replacement(range(nbClassifiers), -# nbViews) -# nbCombinations = int( -# math.factorial(nbClassifiers + nbViews - 1) / math.factorial( -# nbViews) / math.factorial(nbClassifiers - 1)) -# div_measure = np.zeros(nbCombinations) -# combis = np.zeros((nbCombinations, nbViews), dtype=int) -# -# for combinationsIndex, combination in enumerate(combinations): -# combis[combinationsIndex] = combination -# combiWithView = [(viewIndex, combiIndex) for viewIndex, combiIndex in -# enumerate(combination)] -# binomes = itertools.combinations(combiWithView, 2) -# nbBinomes = int( -# math.factorial(nbViews) / 2 / math.factorial(nbViews - 2)) -# disagreement = np.zeros(nbBinomes) -# div_measure[combinationsIndex] = measurement[1](classifiersDecisions, -# combination, -# foldsGroudTruth, -# foldsLen) -# for binomeIndex, binome in enumerate(binomes): -# (viewIndex1, classifierIndex1), ( -# viewIndex2, classifierIndex2) = binome -# nbDisagree = np.sum(measurement[0]( -# classifiersDecisions[viewIndex1, classifierIndex1], -# classifiersDecisions[viewIndex2, classifierIndex2], -# foldsGroudTruth) -# , axis=1) / float(foldsLen) -# disagreement[binomeIndex] = np.mean(nbDisagree) -# div_measure[combinationsIndex] /= float(np.mean(disagreement)) -# bestCombiIndex = np.argmin(div_measure) -# bestCombination = combis[bestCombiIndex] -# -# return [classifiersNames[viewIndex][index] for viewIndex, index in -# enumerate(bestCombination)], div_measure[ -# bestCombiIndex] -# diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/fusion_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/fusion_utils.py deleted file mode 100644 index 29447d15b9ce9e45c5f3365b342f1b6fbfe07b92..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/fusion_utils.py +++ /dev/null @@ -1,36 +0,0 @@ -import inspect - -from ...multiview.multiview_utils import get_monoview_classifier -from ...utils.multiclass import get_mc_estim - - -class BaseFusionClassifier(): - - def init_monoview_estimator(self, classifier_name, classifier_config, - classifier_index=None, multiclass=False): - if classifier_index is not None: - if classifier_config is not None : - classifier_configs = classifier_config - else: - classifier_configs = None - else: - classifier_configs = classifier_config - if classifier_configs is not None and classifier_name in classifier_configs: - if 'random_state' in inspect.getfullargspec( - get_monoview_classifier(classifier_name).__init__).args: - estimator = get_monoview_classifier(classifier_name)( - random_state=self.random_state, - **classifier_configs[classifier_name]) - else: - estimator = get_monoview_classifier(classifier_name)( - **classifier_configs[classifier_name]) - else: - if 'random_state' in inspect.getfullargspec( - get_monoview_classifier(classifier_name).__init__).args: - estimator = get_monoview_classifier(classifier_name)( - random_state=self.random_state) - else: - estimator = get_monoview_classifier(classifier_name)() - - return get_mc_estim(estimator, random_state=self.random_state, - multiview=False, multiclass=multiclass) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/jumbo_fusion_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/jumbo_fusion_utils.py deleted file 
mode 100644 index e9cbac4c770a826183d713d691f7bcee25225cbe..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/jumbo_fusion_utils.py +++ /dev/null @@ -1,83 +0,0 @@ -import numpy as np - -from .late_fusion_utils import LateFusionClassifier -from ...monoview.monoview_utils import CustomRandint -from ...utils.dataset import get_examples_views_indices - - -class BaseJumboFusion(LateFusionClassifier): - - def __init__(self, random_state, classifiers_names=None, - classifier_configs=None, - nb_cores=1, weights=None, nb_monoview_per_view=1, rs=None): - LateFusionClassifier.__init__(self, random_state, - classifiers_names=classifiers_names, - classifier_configs=classifier_configs, - nb_cores=nb_cores, weights=weights, - rs=rs) - self.param_names += ["nb_monoview_per_view", ] - self.distribs += [CustomRandint(1, 10)] - self.nb_monoview_per_view = nb_monoview_per_view - - def set_params(self, nb_monoview_per_view=1, **params): - self.nb_monoview_per_view = nb_monoview_per_view - LateFusionClassifier.set_params(self, **params) - - def predict(self, X, example_indices=None, view_indices=None): - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - self._check_views(view_indices) - monoview_decisions = self.predict_monoview(X, - example_indices=example_indices, - view_indices=view_indices) - return self.aggregation_estimator.predict(monoview_decisions) - - def fit(self, X, y, train_indices=None, view_indices=None): - train_indices, view_indices = get_examples_views_indices(X, - train_indices, - view_indices) - self.used_views = view_indices - self.init_classifiers(len(view_indices), - nb_monoview_per_view=self.nb_monoview_per_view) - self.fit_monoview_estimators(X, y, train_indices=train_indices, - view_indices=view_indices) - monoview_decisions = self.predict_monoview(X, - example_indices=train_indices, - view_indices=view_indices) - self.aggregation_estimator.fit(monoview_decisions, y[train_indices]) - return self - - def fit_monoview_estimators(self, X, y, train_indices=None, - view_indices=None): - if np.unique(y).shape[0] > 2: - multiclass = True - else: - multiclass = False - self.monoview_estimators = [ - [self.init_monoview_estimator(classifier_name, - self.classifier_configs[ - classifier_index], - multiclass=multiclass) - for classifier_index, classifier_name - in enumerate(self.classifiers_names)] - for _ in view_indices] - - self.monoview_estimators = [[estimator.fit( - X.get_v(view_indices[idx], train_indices), y[train_indices]) - for estimator in view_estimators] - for idx, view_estimators in - enumerate(self.monoview_estimators)] - return self - - def predict_monoview(self, X, example_indices=None, view_indices=None): - monoview_decisions = np.zeros((len(example_indices), - len(view_indices) * len( - self.classifiers_names))) - for idx, view_estimators in enumerate(self.monoview_estimators): - for estimator_index, estimator in enumerate(view_estimators): - monoview_decisions[:, len( - self.classifiers_names) * idx + estimator_index] = estimator.predict( - X.get_v(view_indices[idx], - example_indices)) - return monoview_decisions diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/late_fusion_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/late_fusion_utils.py deleted file mode 100644 index 0916f76fb8fa74c2287e6bde40a38f63cdf9743a..0000000000000000000000000000000000000000 --- 
a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/late_fusion_utils.py +++ /dev/null @@ -1,178 +0,0 @@ -import numpy as np - -from .fusion_utils import BaseFusionClassifier -from ...multiview.multiview_utils import BaseMultiviewClassifier, \ - get_available_monoview_classifiers, ConfigGenerator -from ...utils.dataset import get_examples_views_indices - - -class ClassifierDistribution: - - def __init__(self, seed=42, available_classifiers=None): - self.random_state = np.random.RandomState(seed) - self.available_classifiers = available_classifiers - - def draw(self, nb_view, rs=None): - if rs is not None: - self.random_state.seed(rs) - return self.random_state.choice(self.available_classifiers, - size=nb_view, replace=True) - - -class ClassifierCombinator: - - def __init__(self, need_probas=False): - self.available_classifiers = get_available_monoview_classifiers( - need_probas) - - def rvs(self, random_state=None): - return ClassifierDistribution(seed=random_state.randint(1), - available_classifiers=self.available_classifiers) - - -class ConfigDistribution: - - def __init__(self, seed=42, available_classifiers=None): - self.random_state = np.random.RandomState(seed) - self.config_generator = ConfigGenerator(available_classifiers) - - def draw(self, nb_view, rs=None): - if rs is not None: - self.random_state.seed(rs) - config_samples = [self.config_generator.rvs(self.random_state) - for _ in range(nb_view)] - return config_samples - - -class MultipleConfigGenerator: - - def __init__(self, ): - self.available_classifiers = get_available_monoview_classifiers() - - def rvs(self, random_state=None): - return ConfigDistribution(seed=random_state.randint(1), - available_classifiers=self.available_classifiers) - - -class WeightDistribution: - - def __init__(self, seed=42, distribution_type="uniform"): - self.random_state = np.random.RandomState(seed) - self.distribution_type = distribution_type - - def draw(self, nb_view): - if self.distribution_type == "uniform": - return self.random_state.random_sample(nb_view) - - -class WeightsGenerator: - - def __init__(self, distibution_type="uniform"): - self.distribution_type = distibution_type - - def rvs(self, random_state=None): - return WeightDistribution(seed=random_state.randint(1), - distribution_type=self.distribution_type) - - -class LateFusionClassifier(BaseMultiviewClassifier, BaseFusionClassifier): - - def __init__(self, random_state=None, classifiers_names=None, - classifier_configs=None, nb_cores=1, weights=None, - rs=None): - BaseMultiviewClassifier.__init__(self, random_state) - self.classifiers_names = classifiers_names - self.classifier_configs = classifier_configs - self.nb_cores = nb_cores - self.weights = weights - self.rs = rs - self.param_names = ["classifiers_names", "classifier_configs", - "weights", "rs"] - self.distribs = [ClassifierCombinator(need_probas=self.need_probas), - MultipleConfigGenerator(), - WeightsGenerator(), - np.arange(1000)] - - def fit(self, X, y, train_indices=None, view_indices=None): - train_indices, view_indices = get_examples_views_indices(X, - train_indices, - view_indices) - self.used_views = view_indices - if np.unique(y).shape[0] > 2: - multiclass = True - else: - multiclass = False - self.init_params(len(view_indices), multiclass) - if np.unique(y[train_indices]).shape[0] > 2: - raise ValueError("Multiclass not supported") - self.monoview_estimators = [ - monoview_estimator.fit(X.get_v(view_index, train_indices), - y[train_indices]) - for view_index, monoview_estimator - 
in zip(view_indices, - self.monoview_estimators)] - return self - - def init_params(self, nb_view, mutliclass=False): - if self.weights is None: - self.weights = np.ones(nb_view) / nb_view - elif isinstance(self.weights, WeightDistribution): - self.weights = self.weights.draw(nb_view) - else: - self.weights = self.weights / np.sum(self.weights) - - self.init_classifiers(nb_view) - - self.monoview_estimators = [ - self.init_monoview_estimator(classifier_name, - self.classifier_configs[ - classifier_index], - classifier_index=classifier_index, - multiclass=mutliclass) - for classifier_index, classifier_name - in enumerate(self.classifiers_names)] - - def init_classifiers(self, nb_view, nb_monoview_per_view=None): - if nb_monoview_per_view is not None: - nb_clfs = nb_monoview_per_view - else: - nb_clfs = nb_view - - if isinstance(self.classifiers_names, ClassifierDistribution): - self.classifiers_names = self.classifiers_names.draw(nb_clfs, - self.rs) - elif self.classifiers_names is None: - self.classifiers_names = ["decision_tree" for _ in range(nb_clfs)] - elif isinstance(self.classifiers_names, str): - self.classifiers_names = [self.classifiers_names - for _ in range(nb_clfs)] - - if isinstance(self.classifier_configs, ConfigDistribution): - self.classifier_configs = [{classifier_name : config[classifier_name]} for config, classifier_name in zip(self.classifier_configs.draw(nb_clfs, - self.rs), self.classifiers_names)] - elif isinstance(self.classifier_configs, dict): - self.classifier_configs = [ - {classifier_name: self.classifier_configs[classifier_name]} for - classifier_name in self.classifiers_names] - elif self.classifier_configs is None: - self.classifier_configs = [None for _ in range(nb_clfs)] - - # def verif_clf_views(self, classifier_names, nb_view): - # if classifier_names is None: - # if nb_view is None: - # raise AttributeError(self.__class__.__name__+" must have either classifier_names or nb_views provided.") - # else: - # self.classifiers_names = self.get_classifiers(get_available_monoview_classifiers(), nb_view) - # else: - # if nb_view is None: - # self.classifiers_names = classifier_names - # else: - # if len(classifier_names)==nb_view: - # self.classifiers_names = classifier_names - # else: - # warnings.warn("nb_view and classifier_names not matching, choosing nb_view random classifiers in classifier_names.", UserWarning) - # self.classifiers_names = self.get_classifiers(classifier_names, nb_view) - - def get_classifiers(self, classifiers_names, nb_choices): - return self.random_state.choice(classifiers_names, size=nb_choices, - replace=True) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py deleted file mode 100644 index 5fbd4d56aeb6ae4b5bec4f6c8be8e25f24473c44..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/additions/utils.py +++ /dev/null @@ -1,64 +0,0 @@ -import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin - - -def get_names(classed_list): - return np.array([object_.__class__.__name__ for object_ in classed_list]) - - -# class BaseMultiviewClassifier(BaseEstimator, ClassifierMixin): -# -# def __init__(self, random_state): -# self.random_state = random_state -# -# def genBestParams(self, detector): -# return dict((param_name, detector.best_params_[param_name]) -# for param_name in self.param_names) -# -# def genParamsFromDetector(self, 
detector): -# if self.classed_params: -# classed_dict = dict((classed_param, get_names( -# detector.cv_results_["param_" + classed_param])) -# for classed_param in self.classed_params) -# if self.param_names: -# return [(param_name, -# np.array(detector.cv_results_["param_" + param_name])) -# if param_name not in self.classed_params else ( -# param_name, classed_dict[param_name]) -# for param_name in self.param_names] -# else: -# return [()] -# -# def genDistribs(self): -# return dict((param_name, distrib) for param_name, distrib in -# zip(self.param_names, self.distribs)) -# -# def getConfig(self): -# if self.param_names: -# return "\n\t\t- " + self.__class__.__name__ + "with " + ", ".join( -# [param_name + " : " + self.to_str(param_name) for param_name in -# self.param_names]) -# else: -# return "\n\t\t- " + self.__class__.__name__ + "with no config." -# -# def to_str(self, param_name): -# if param_name in self.weird_strings: -# if self.weird_strings[param_name] == "class_name": -# return self.get_params()[param_name].__class__.__name__ -# else: -# return self.weird_strings[param_name]( -# self.get_params()[param_name]) -# else: -# return str(self.get_params()[param_name]) -# -# def get_interpretation(self): -# return "No detailed interpretation function" - -# -# def get_train_views_indices(dataset, train_indices, view_indices, ): -# """This function is used to get all the examples indices and view indices if needed""" -# if view_indices is None: -# view_indices = np.arange(dataset.nb_view) -# if train_indices is None: -# train_indices = range(dataset.get_nb_examples()) -# return train_indices, view_indices diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/bayesian_inference_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/bayesian_inference_fusion.py deleted file mode 100644 index b1cd5f9e6ea962cbffdbf5fa98bfea6e092ce9c0..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/bayesian_inference_fusion.py +++ /dev/null @@ -1,39 +0,0 @@ -import numpy as np - -from ..multiview_classifiers.additions.late_fusion_utils import \ - LateFusionClassifier -from ..utils.dataset import get_examples_views_indices - -classifier_class_name = "BayesianInferenceClassifier" - - -class BayesianInferenceClassifier(LateFusionClassifier): - def __init__(self, random_state, classifiers_names=None, - classifier_configs=None, nb_cores=1, weights=None, - rs=None): - self.need_probas = True - LateFusionClassifier.__init__(self, random_state=random_state, - classifiers_names=classifiers_names, - classifier_configs=classifier_configs, - nb_cores=nb_cores, - weights=weights, - rs=rs) - - def predict(self, X, example_indices=None, view_indices=None): - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - self._check_views(view_indices) - if sum(self.weights) != 1.0: - self.weights = self.weights / sum(self.weights) - - view_scores = [] - for index, view_index in enumerate(view_indices): - view_scores.append(np.power( - self.monoview_estimators[index].predict_proba( - X.get_v(view_index, - example_indices)), - self.weights[index])) - view_scores = np.array(view_scores) - predicted_labels = np.argmax(np.prod(view_scores, axis=0), axis=1) - return predicted_labels diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/difficulty_fusion.py 
b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/difficulty_fusion.py deleted file mode 100644 index 0c66e5619ba5091576808f9919583ab165c47f2f..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/difficulty_fusion.py +++ /dev/null @@ -1,28 +0,0 @@ -import numpy as np - -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions.diversity_utils import \ - GlobalDiversityFusionClassifier - -classifier_class_name = "DifficultyFusion" - - -class DifficultyFusion(GlobalDiversityFusionClassifier): - - def diversity_measure(self, classifiers_decisions, combination, y): - _, nb_view, nb_examples = classifiers_decisions.shape - scores = np.zeros((nb_view, nb_examples), dtype=int) - for view_index, classifier_index in enumerate(combination): - scores[view_index, :] = np.logical_not( - np.logical_xor(classifiers_decisions[classifier_index, - view_index], - y) - ) - # Table of the nuber of views that succeeded for each example : - difficulty_scores = np.sum(scores, axis=0) - - difficulty_score = np.var( - np.array([ - np.sum((difficulty_scores == view_index)) - for view_index in range(len(combination) + 1)]) - ) - return difficulty_score diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/disagree_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/disagree_fusion.py deleted file mode 100644 index cee032a878b8ba9e062654f685317f193607b014..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/disagree_fusion.py +++ /dev/null @@ -1,14 +0,0 @@ -import numpy as np - -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions.diversity_utils import \ - CoupleDiversityFusionClassifier - -classifier_class_name = "DisagreeFusion" - - -class DisagreeFusion(CoupleDiversityFusionClassifier): - - def diversity_measure(self, first_classifier_decision, - second_classifier_decision, _): - return np.logical_xor(first_classifier_decision, - second_classifier_decision) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/double_fault_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/double_fault_fusion.py deleted file mode 100644 index 12eb6b64c39a1606f950e24e3a2e30e35fee10b9..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/double_fault_fusion.py +++ /dev/null @@ -1,14 +0,0 @@ -import numpy as np - -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions.diversity_utils import \ - CoupleDiversityFusionClassifier - -classifier_class_name = "DoubleFaultFusion" - - -class DoubleFaultFusion(CoupleDiversityFusionClassifier): - - def diversity_measure(self, first_classifier_decision, - second_classifier_decision, y): - return np.logical_and(np.logical_xor(first_classifier_decision, y), - np.logical_xor(second_classifier_decision, y)) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/entropy_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/entropy_fusion.py deleted file mode 100644 index 3c3d5aef7c6453540e06083b37bba0f1935ae62b..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/entropy_fusion.py +++ /dev/null @@ -1,26 +0,0 @@ -import numpy as np - -from 
multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions.diversity_utils import \ - GlobalDiversityFusionClassifier - -classifier_class_name = "EntropyFusion" - - -class EntropyFusion(GlobalDiversityFusionClassifier): - - def diversity_measure(self, classifiers_decisions, combination, y): - _, nb_view, nb_examples = classifiers_decisions.shape - scores = np.zeros((nb_view, nb_examples), dtype=int) - for view_index, classifier_index in enumerate(combination): - scores[view_index] = np.logical_not( - np.logical_xor( - classifiers_decisions[classifier_index, view_index], - y) - ) - entropy_scores = np.sum(scores, axis=0) - nb_view_matrix = np.zeros((nb_examples), - dtype=int) + nb_view - entropy_scores - entropy_score = np.mean( - np.minimum(entropy_scores, nb_view_matrix).astype(float) / ( - nb_view - int(nb_view / 2))) - return entropy_score diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/majority_voting_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/majority_voting_fusion.py deleted file mode 100644 index 53a255c764f79c8e68271caba38539dea019c774..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/majority_voting_fusion.py +++ /dev/null @@ -1,55 +0,0 @@ -import numpy as np - -from ..multiview_classifiers.additions.late_fusion_utils import \ - LateFusionClassifier -from ..utils.dataset import get_examples_views_indices - -classifier_class_name = "MajorityVoting" - - -class VotingIndecision(Exception): - pass - - -class MajorityVoting(LateFusionClassifier): - def __init__(self, random_state, classifiers_names=None, - classifier_configs=None, weights=None, nb_cores=1, rs=None): - self.need_probas = False - LateFusionClassifier.__init__(self, random_state=random_state, - classifiers_names=classifiers_names, - classifier_configs=classifier_configs, - nb_cores=nb_cores, - weights=weights, - rs=rs) - - def predict(self, X, example_indices=None, view_indices=None): - examples_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - self._check_views(view_indices) - n_examples = len(examples_indices) - votes = np.zeros((n_examples, X.get_nb_class(example_indices)), - dtype=float) - monoview_decisions = np.zeros((len(examples_indices), X.nb_view), - dtype=int) - for index, view_index in enumerate(view_indices): - monoview_decisions[:, index] = self.monoview_estimators[ - index].predict( - X.get_v(view_index, examples_indices)) - for example_index in range(n_examples): - for view_index, feature_classification in enumerate( - monoview_decisions[example_index, :]): - votes[example_index, feature_classification] += self.weights[ - view_index] - nb_maximum = len( - np.where(votes[example_index] == max(votes[example_index]))[0]) - if nb_maximum == X.nb_view: - raise VotingIndecision( - "Majority voting can't decide, each classifier has voted for a different class") - - predicted_labels = np.argmax(votes, axis=1) - # Can be upgraded by restarting a new classification process if - # there are multiple maximums ?: - # while nbMaximum>1: - # relearn with only the classes that have a maximum number of vote - return predicted_labels diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/svm_jumbo_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/svm_jumbo_fusion.py deleted file mode 100644 index 
d9a2e38d21a9be49690ca372616ebde60a438f55..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/svm_jumbo_fusion.py +++ /dev/null @@ -1,36 +0,0 @@ -from sklearn.svm import SVC - -from .additions.jumbo_fusion_utils import BaseJumboFusion -from ..monoview.monoview_utils import CustomUniform, CustomRandint - -classifier_class_name = "SVMJumboFusion" - - -class SVMJumboFusion(BaseJumboFusion): - - def __init__(self, random_state=None, classifiers_names=None, - classifier_configs=None, nb_cores=1, weights=None, - nb_monoview_per_view=1, C=1.0, kernel="rbf", degree=2, - rs=None): - self.need_probas = False - BaseJumboFusion.__init__(self, random_state, - classifiers_names=classifiers_names, - classifier_configs=classifier_configs, - nb_cores=nb_cores, weights=weights, - nb_monoview_per_view=nb_monoview_per_view, - rs=rs) - self.param_names += ["C", "kernel", "degree"] - self.distribs += [CustomUniform(), ["rbf", "poly", "linear"], - CustomRandint(2, 5)] - self.aggregation_estimator = SVC(C=C, kernel=kernel, degree=degree) - self.C = C - self.kernel = kernel - self.degree = degree - - def set_params(self, C=1.0, kernel="rbf", degree=1, **params): - super(SVMJumboFusion, self).set_params(**params) - self.C = C - self.degree = degree - self.kernel = kernel - self.aggregation_estimator.set_params(C=C, kernel=kernel, degree=degree) - return self diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py deleted file mode 100644 index 6635119f14390c1dddbe4dd14ccf0184615aad77..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_early_fusion.py +++ /dev/null @@ -1,118 +0,0 @@ -import numpy as np - -from multiview_platform.mono_multi_view_classifiers import monoview_classifiers -from .additions.fusion_utils import BaseFusionClassifier -from ..multiview.multiview_utils import get_available_monoview_classifiers, \ - BaseMultiviewClassifier, ConfigGenerator -from ..utils.dataset import get_examples_views_indices -from ..utils.multiclass import get_mc_estim, MultiClassWrapper - -# from ..utils.dataset import get_v - -classifier_class_name = "WeightedLinearEarlyFusion" - - -class WeightedLinearEarlyFusion(BaseMultiviewClassifier, BaseFusionClassifier): - """ - WeightedLinearEarlyFusion - - Parameters - ---------- - random_state - view_weights - monoview_classifier_name - monoview_classifier_config - - Attributes - ---------- - """ - - def __init__(self, random_state=None, view_weights=None, - monoview_classifier_name="decision_tree", - monoview_classifier_config={}): - BaseMultiviewClassifier.__init__(self, random_state=random_state) - self.view_weights = view_weights - self.monoview_classifier_name = monoview_classifier_name - self.short_name = "early_fusion" - if monoview_classifier_name in monoview_classifier_config: - self.monoview_classifier_config = monoview_classifier_config[ - monoview_classifier_name] - self.monoview_classifier_config = monoview_classifier_config - # monoview_classifier_module = getattr(monoview_classifiers, - # self.monoview_classifier_name) - # monoview_classifier_class = getattr(monoview_classifier_module, - # monoview_classifier_module.classifier_class_name) - self.monoview_classifier = self.init_monoview_estimator(monoview_classifier_name, monoview_classifier_config) - 
self.param_names = ["monoview_classifier_name", - "monoview_classifier_config"] - self.distribs = [get_available_monoview_classifiers(), - ConfigGenerator(get_available_monoview_classifiers())] - self.classed_params = [] - self.weird_strings = {} - - def set_params(self, monoview_classifier_name="decision_tree", - monoview_classifier_config={}, **params): - self.monoview_classifier_name = monoview_classifier_name - self.monoview_classifier = self.init_monoview_estimator( - monoview_classifier_name, - monoview_classifier_config) - self.monoview_classifier_config = self.monoview_classifier.get_params() - self.short_name = "early_fusion" - return self - - def get_params(self, deep=True): - return {"random_state": self.random_state, - "view_weights": self.view_weights, - "monoview_classifier_name": self.monoview_classifier_name, - "monoview_classifier_config": self.monoview_classifier_config} - - def fit(self, X, y, train_indices=None, view_indices=None): - train_indices, X = self.transform_data_to_monoview(X, train_indices, - view_indices) - self.used_views = view_indices - if np.unique(y[train_indices]).shape[0] > 2 and \ - not (isinstance(self.monoview_classifier, MultiClassWrapper)): - self.monoview_classifier = get_mc_estim(self.monoview_classifier, - self.random_state, - multiview=False, - y=y[train_indices]) - self.monoview_classifier.fit(X, y[train_indices]) - self.monoview_classifier_config = self.monoview_classifier.get_params() - return self - - def predict(self, X, example_indices=None, view_indices=None): - _, X = self.transform_data_to_monoview(X, example_indices, view_indices) - self._check_views(self.view_indices) - predicted_labels = self.monoview_classifier.predict(X) - return predicted_labels - - def transform_data_to_monoview(self, dataset, example_indices, - view_indices): - """Here, we extract the data from the HDF5 dataset file and store all - the concatenated views in one variable""" - example_indices, self.view_indices = get_examples_views_indices(dataset, - example_indices, - view_indices) - if self.view_weights is None: - self.view_weights = np.ones(len(self.view_indices), dtype=float) - else: - self.view_weights = np.array(self.view_weights) - self.view_weights /= float(np.sum(self.view_weights)) - - X = self.hdf5_to_monoview(dataset, example_indices) - return example_indices, X - - def hdf5_to_monoview(self, dataset, examples): - """Here, we concatenate the views for the asked examples """ - monoview_data = np.concatenate( - [dataset.get_v(view_idx, examples) - for view_weight, (index, view_idx) - in zip(self.view_weights, enumerate(self.view_indices))] - , axis=1) - return monoview_data - - # def set_monoview_classifier_config(self, monoview_classifier_name, monoview_classifier_config): - # if monoview_classifier_name in monoview_classifier_config: - # self.monoview_classifier.set_params(**monoview_classifier_config[monoview_classifier_name]) - # else: - # self.monoview_classifier.set_params(**monoview_classifier_config) diff --git a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_late_fusion.py b/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_late_fusion.py deleted file mode 100644 index 403791ceec03ef3c18e9152a996bc5a39d41bd54..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/multiview_classifiers/weighted_linear_late_fusion.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np - -from ..multiview_classifiers.additions.late_fusion_utils import 
\ - LateFusionClassifier -from ..utils.dataset import get_examples_views_indices - -classifier_class_name = "WeightedLinearLateFusion" - - -class WeightedLinearLateFusion(LateFusionClassifier): - def __init__(self, random_state, classifiers_names=None, - classifier_configs=None, weights=None, nb_cores=1, rs=None): - self.need_probas = True - LateFusionClassifier.__init__(self, random_state=random_state, - classifiers_names=classifiers_names, - classifier_configs=classifier_configs, - nb_cores=nb_cores, weights=weights, rs=rs) - - def predict(self, X, example_indices=None, view_indices=None): - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - self._check_views(view_indices) - view_scores = [] - for index, viewIndex in enumerate(view_indices): - view_scores.append( - np.array(self.monoview_estimators[index].predict_proba( - X.get_v(viewIndex, example_indices))) * self.weights[index]) - view_scores = np.array(view_scores) - predicted_labels = np.argmax(np.sum(view_scores, axis=0), axis=1) - return predicted_labels diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/__init__.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/duration_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/duration_analysis.py deleted file mode 100644 index fb3a539cf99d1c16132d838fb7c3ecf81f3c41e9..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/duration_analysis.py +++ /dev/null @@ -1,47 +0,0 @@ -import os -import plotly -import pandas as pd - - -def get_duration(results): - df = pd.DataFrame(columns=["hps", "fit", "pred"], ) - for classifier_result in results: - df.at[classifier_result.get_classifier_name(), - "hps"] = classifier_result.hps_duration - df.at[classifier_result.get_classifier_name(), - "fit"] = classifier_result.fit_duration - df.at[classifier_result.get_classifier_name(), - "pred"] = classifier_result.pred_duration - return df - -def plot_durations(durations, directory, database_name, durations_stds=None): # pragma: no cover - file_name = os.path.join(directory, database_name + "-durations") - durations.to_csv(file_name+"_dataframe.csv") - fig = plotly.graph_objs.Figure() - if durations_stds is None: - durations_stds = pd.DataFrame(0, durations.index, durations.columns) - else: - durations_stds.to_csv(file_name+"_stds_dataframe.csv") - fig.add_trace(plotly.graph_objs.Bar(name='Hyper-parameter Optimization', - x=durations.index, - y=durations['hps'], - error_y=dict(type='data', - array=durations_stds["hps"]), - marker_color="grey")) - fig.add_trace(plotly.graph_objs.Bar(name='Fit (on train set)', - x=durations.index, - y=durations['fit'], - error_y=dict(type='data', - array=durations_stds["fit"]), - marker_color="black")) - fig.add_trace(plotly.graph_objs.Bar(name='Prediction (on test set)', - x=durations.index, - y=durations['pred'], - error_y=dict(type='data', - array=durations_stds["pred"]), - marker_color="lightgrey")) - fig.update_layout(title="Durations for each classfier", - yaxis_title="Duration (s)") - fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', - plot_bgcolor='rgba(0,0,0,0)') - plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False) \ No newline at end of file diff --git 
a/multiview_platform/mono_multi_view_classifiers/result_analysis/error_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/error_analysis.py deleted file mode 100644 index 97aa6baa7ad3f1b6902c69eb4287a4005660f78e..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/error_analysis.py +++ /dev/null @@ -1,289 +0,0 @@ -# Import built-in modules -import logging -import os - -import matplotlib as mpl -# Import third party modules -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import plotly -from matplotlib.patches import Patch - -# Import own Modules - - -def get_example_errors(groud_truth, results): - r"""Used to get for each classifier and each example whether the classifier - has misclassified the example or not. - - Parameters - ---------- - ground_truth : numpy array of 0, 1 and -100 (if multiclass) - The array with the real labels of the examples - results : list of MonoviewResult and MultiviewResults objects - A list containing all the resluts for all the mono- & multi-view - experimentations. - - Returns - ------- - example_errors : dict of np.array - For each classifier, has an entry with a `np.array` over the examples, - with a 1 if the examples was - well-classified, a 0 if not and if it's multiclass classification, a - -100 if the examples was not seen during - the one versus one classification. - """ - example_errors = {} - - for classifier_result in results: - error_on_examples = np.equal(classifier_result.full_labels_pred, - groud_truth).astype(int) - unseen_examples = np.where(groud_truth == -100)[0] - error_on_examples[unseen_examples] = -100 - example_errors[ - classifier_result.get_classifier_name()] = error_on_examples - return example_errors - - -def publish_example_errors(example_errors, directory, databaseName, - labels_names, example_ids, labels): # pragma: no cover - logging.debug("Start:\t Label analysis figure generation") - - base_file_name = os.path.join(directory, databaseName + "-" ) - - nb_classifiers, nb_examples, classifiers_names, \ - data_2d, error_on_examples = gen_error_data(example_errors) - - np.savetxt(base_file_name + "2D_plot_data.csv", data_2d, delimiter=",") - np.savetxt(base_file_name + "bar_plot_data.csv", error_on_examples, - delimiter=",") - - plot_2d(data_2d, classifiers_names, nb_classifiers, base_file_name, - example_ids=example_ids, labels=labels) - - plot_errors_bar(error_on_examples, nb_examples, - base_file_name, example_ids=example_ids) - - logging.debug("Done:\t Label analysis figures generation") - - -def publish_all_example_errors(iter_results, directory, - stats_iter, - example_ids, labels): # pragma: no cover - logging.debug( - "Start:\t Global label analysis figure generation") - - nb_examples, nb_classifiers, data, \ - error_on_examples, classifier_names = gen_error_data_glob(iter_results, - stats_iter) - - np.savetxt(os.path.join(directory, "clf_errors.csv"), data, delimiter=",") - np.savetxt(os.path.join(directory, "example_errors.csv"), error_on_examples, - delimiter=",") - - plot_2d(data, classifier_names, nb_classifiers, - os.path.join(directory, ""), stats_iter=stats_iter, - example_ids=example_ids, labels=labels) - plot_errors_bar(error_on_examples, nb_examples, os.path.join(directory, ""), - example_ids=example_ids) - - logging.debug( - "Done:\t Global label analysis figures generation") - - -def gen_error_data(example_errors): - r"""Used to format the error data in order to plot it efficiently. 
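The `get_example_errors` helper removed just above boils every classifier's predictions down to a per-example correctness vector, with `-100` kept as a sentinel for examples never seen during one-versus-one multiclass runs. A minimal standalone sketch of that reduction (the function and variable names here are illustrative, not the platform's API):

```python
import numpy as np


def example_errors(ground_truth, predictions_per_clf):
    """For each classifier: 1 where it was right, 0 where it was wrong,
    and -100 where the example was never seen (ground truth == -100)."""
    errors = {}
    unseen = np.where(ground_truth == -100)[0]
    for clf_name, preds in predictions_per_clf.items():
        correct = np.equal(preds, ground_truth).astype(int)
        correct[unseen] = -100
        errors[clf_name] = correct
    return errors


y = np.array([0, 1, 1, -100, 0])
preds = {"decision_tree": np.array([0, 1, 0, 1, 0]),
         "adaboost": np.array([1, 1, 1, 0, 0])}
print(example_errors(y, preds))
# {'decision_tree': array([1, 1, 0, -100, 1]), 'adaboost': array([0, 1, 1, -100, 1])}
```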
The - data is saves in a `.csv` file. - - Parameters - ---------- - example_errors : dict of dicts of np.arrays - A dictionary conatining all the useful data. Organized as : - `example_errors[<classifier_name>]["error_on_examples"]` is a np.array - of ints with a - - 1 if the classifier `<classifier_name>` classifier well the example, - - 0 if it fail to classify the example, - - -100 if it did not classify the example (multiclass one versus one). - - Returns - ------- - nbClassifiers : int - Number of different classifiers. - nbExamples : int - NUmber of examples. - nbCopies : int - The number of times the data is copied (classifier wise) in order for - the figure to be more readable. - classifiers_names : list of strs - The names fo the classifiers. - data : np.array of shape `(nbClassifiers, nbExamples)` - A matrix with zeros where the classifier failed to classifiy the - example, ones where it classified it well - and -100 if the example was not classified. - error_on_examples : np.array of shape `(nbExamples,)` - An array counting how many classifiers failed to classifiy each - examples. - """ - nb_classifiers = len(example_errors) - nb_examples = len(list(example_errors.values())[0]) - classifiers_names = list(example_errors.keys()) - - data_2d = np.zeros((nb_examples, nb_classifiers)) - for classifierIndex, (classifier_name, error_on_examples) in enumerate( - example_errors.items()): - data_2d[:, classifierIndex] = error_on_examples - error_on_examples = np.sum(data_2d, axis=1) / nb_classifiers - return nb_classifiers, nb_examples, classifiers_names, data_2d, error_on_examples - - -def gen_error_data_glob(iter_results, stats_iter): - nb_examples = next(iter(iter_results.values())).shape[0] - nb_classifiers = len(iter_results) - data = np.zeros((nb_examples, nb_classifiers), dtype=int) - classifier_names = [] - for clf_index, (classifier_name, error_data) in enumerate( - iter_results.items()): - data[:, clf_index] = error_data - classifier_names.append(classifier_name) - error_on_examples = np.sum(data, axis=1) / ( - nb_classifiers * stats_iter) - return nb_examples, nb_classifiers, data, error_on_examples, \ - classifier_names - - -def plot_2d(data, classifiers_names, nb_classifiers, file_name, labels=None, - stats_iter=1, use_plotly=True, example_ids=None): # pragma: no cover - r"""Used to generate a 2D plot of the errors. - - Parameters - ---------- - data : np.array of shape `(nbClassifiers, nbExamples)` - A matrix with zeros where the classifier failed to classifiy the example, ones where it classified it well - and -100 if the example was not classified. - classifiers_names : list of str - The names of the classifiers. - nb_classifiers : int - The number of classifiers. - file_name : str - The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) - minSize : int, optinal, default: 10 - The minimum width and height of the figure. - width_denominator : float, optional, default: 1.0 - To obtain the image width, the number of classifiers will be divided by this number. - height_denominator : float, optional, default: 1.0 - To obtain the image width, the number of examples will be divided by this number. - stats_iter : int, optional, default: 1 - The number of statistical iterations realized. 
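`gen_error_data`, also part of the removed module, stacks those per-classifier vectors into an examples-by-classifiers matrix and derives a per-example agreement score; note that summing the 1s yields the proportion of classifiers that classified each example correctly, even though the removed code stores it under the name `error_on_examples`. A possible re-implementation of the reshaping, with illustrative names:

```python
import numpy as np


def stack_error_data(example_errors):
    """Stack per-classifier 0/1 vectors into an (n_examples, n_classifiers)
    matrix and compute, per example, the share of classifiers that got it right."""
    classifier_names = list(example_errors.keys())
    nb_classifiers = len(classifier_names)
    nb_examples = len(next(iter(example_errors.values())))
    data_2d = np.zeros((nb_examples, nb_classifiers))
    for col, name in enumerate(classifier_names):
        data_2d[:, col] = example_errors[name]
    agreement = data_2d.sum(axis=1) / nb_classifiers
    return classifier_names, data_2d, agreement


errors = {"decision_tree": np.array([1, 1, 0, 1]),
          "adaboost": np.array([0, 1, 1, 1])}
names, data_2d, agreement = stack_error_data(errors)
print(agreement)  # [0.5 1.  0.5 1. ]
```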
- - Returns - ------- - """ - fig, ax = plt.subplots(nrows=1, ncols=1, ) - label_index_list = np.concatenate([np.where(labels == i)[0] for i in - np.unique( - labels)]) - cmap, norm = iter_cmap(stats_iter) - cax = plt.imshow(data[np.flip(label_index_list), :], cmap=cmap, norm=norm, - aspect='auto') - plt.title('Errors depending on the classifier') - ticks = np.arange(0, nb_classifiers, 1) - tick_labels = classifiers_names - plt.xticks(ticks, tick_labels, rotation="vertical") - plt.yticks([], []) - plt.ylabel("Examples") - cbar = fig.colorbar(cax, ticks=[-100 * stats_iter / 2, 0, stats_iter]) - cbar.ax.set_yticklabels(['Unseen', 'Always Wrong', 'Always Right']) - - fig.savefig(file_name + "error_analysis_2D.png", bbox_inches="tight", - transparent=True) - plt.close() - ### The following part is used to generate an interactive graph. - if use_plotly: - # [np.where(labels==i)[0] for i in np.unique(labels)] - hover_text = [[example_ids[example_index] + " failed " + str( - stats_iter - data[ - example_index, classifier_index]) + " time(s), labelled " + str( - labels[example_index]) - for classifier_index in range(data.shape[1])] - for example_index in range(data.shape[0])] - fig = plotly.graph_objs.Figure() - fig.add_trace(plotly.graph_objs.Heatmap( - x=list(classifiers_names), - y=[example_ids[label_ind] for label_ind in label_index_list], - z=data[label_index_list, :], - text=[hover_text[label_ind] for label_ind in label_index_list], - hoverinfo=["y", "x", "text"], - colorscale="Greys", - colorbar=dict(tickvals=[0, stats_iter], - ticktext=["Always Wrong", "Always Right"]), - reversescale=True), ) - fig.update_yaxes(title_text="Examples", showticklabels=True) - fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', - plot_bgcolor='rgba(0,0,0,0)') - fig.update_xaxes(showticklabels=True, ) - plotly.offline.plot(fig, filename=file_name + "error_analysis_2D.html", - auto_open=False) - del fig - - -def plot_errors_bar(error_on_examples, nb_examples, file_name, - use_plotly=True, example_ids=None): # pragma: no cover - r"""Used to generate a barplot of the muber of classifiers that failed to classify each examples - - Parameters - ---------- - error_on_examples : np.array of shape `(nbExamples,)` - An array counting how many classifiers failed to classifiy each examples. - classifiers_names : list of str - The names of the classifiers. - nb_classifiers : int - The number of classifiers. - nb_examples : int - The number of examples. - file_name : str - The name of the file in which the figure will be saved ("error_analysis_2D.png" will be added at the end) - - Returns - ------- - """ - fig, ax = plt.subplots() - x = np.arange(nb_examples) - plt.bar(x, 1-error_on_examples) - plt.title("Number of classifiers that failed to classify each example") - fig.savefig(file_name + "error_analysis_bar.png", transparent=True) - plt.close() - if use_plotly: - fig = plotly.graph_objs.Figure([plotly.graph_objs.Bar(x=example_ids, y=1-error_on_examples)]) - fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', - plot_bgcolor='rgba(0,0,0,0)') - plotly.offline.plot(fig, filename=file_name + "error_analysis_bar.html", - auto_open=False) - - - - -def iter_cmap(statsIter): # pragma: no cover - r"""Used to generate a colormap that will have a tick for each iteration : the whiter the better. - - Parameters - ---------- - statsIter : int - The number of statistical iterations. - - Returns - ------- - cmap : matplotlib.colors.ListedColorMap object - The colormap. 
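`plot_errors_bar`, deleted above, renders the share of classifiers that failed on each example both as a static matplotlib figure and as an interactive plotly page. A trimmed-down sketch of the interactive variant only, reusing the `agreement` vector produced by the previous sketch (the file name and inputs are placeholders):

```python
import numpy as np
import plotly.graph_objs as go
import plotly.offline


def bar_of_failures(agreement, example_ids, out_html="error_analysis_bar.html"):
    """One bar per example; 1 - agreement is the share of classifiers that
    failed on it (assuming no -100 sentinels remain in the data)."""
    fig = go.Figure([go.Bar(x=list(example_ids), y=1 - np.asarray(agreement))])
    fig.update_layout(paper_bgcolor="rgba(0,0,0,0)",
                      plot_bgcolor="rgba(0,0,0,0)")
    plotly.offline.plot(fig, filename=out_html, auto_open=False)


bar_of_failures(np.array([0.5, 1.0, 0.25]), ["ex_0", "ex_1", "ex_2"])
```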
- norm : matplotlib.colors.BoundaryNorm object - The bounds for the colormap. - """ - cmapList = ["red", "0.0"] + [str(float((i + 1)) / statsIter) for i in - range(statsIter)] - cmap = mpl.colors.ListedColormap(cmapList) - bounds = [-100 * statsIter - 0.5, -0.5] - for i in range(statsIter): - bounds.append(i + 0.5) - bounds.append(statsIter + 0.5) - norm = mpl.colors.BoundaryNorm(bounds, cmap.N) - return cmap, norm diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/execution.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/execution.py deleted file mode 100644 index e620a9340b47b05760706f72cab16ae208eeb053..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/execution.py +++ /dev/null @@ -1,247 +0,0 @@ -import logging -import pandas as pd - -from .tracebacks_analysis import save_failed, publish_tracebacks -from .duration_analysis import plot_durations, get_duration -from .metric_analysis import get_metrics_scores, publish_metrics_graphs, publish_all_metrics_scores -from .error_analysis import get_example_errors, publish_example_errors, publish_all_example_errors -from .feature_importances import get_feature_importances, publish_feature_importances - -def analyze(results, stats_iter, benchmark_argument_dictionaries, - metrics, directory, example_ids, labels): # pragma: no cover - """Used to analyze the results of the previous benchmarks""" - data_base_name = benchmark_argument_dictionaries[0]["args"]["name"] - - results_means_std, iter_results, flagged_failed, label_names = analyze_iterations( - results, benchmark_argument_dictionaries, - stats_iter, metrics, example_ids, labels) - if flagged_failed: - save_failed(flagged_failed, directory) - - if stats_iter > 1: - results_means_std = analyze_all( - iter_results, stats_iter, directory, - data_base_name, example_ids, label_names) - return results_means_std - - -def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, - metrics, example_ids, labels): - r"""Used to extract and format the results of the different - experimentations performed. - - Parameters - ---------- - results : list - The result list returned by the benchmark execution function. For each - executed benchmark, contains - a flag & a result element. - The flag is a way to identify to which benchmark the results belong, - formatted this way : - `flag = iter_index, [classifierPositive, classifierNegative]` with - - `iter_index` the index of the statistical iteration - - `[classifierPositive, classifierNegative]` the indices of the labels - considered positive and negative - by the classifier (mainly useful for one versus one multiclass - classification). - benchmark_argument_dictionaries : list of dicts - The list of all the arguments passed to the benchmark executing - functions. - statsIter : int - The number of statistical iterations. - metrics : list of lists - THe list containing the metrics and their configuration. - - Returns - ------- - results : list of dicts of dicts - The list contains a dictionary for each statistical iteration. This - dictionary contains a dictionary for each - label combination, regrouping the scores for each metrics and the - information useful to plot errors on examples. 
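`iter_cmap`, whose body appears a little above, builds a discrete colormap with red for the unseen sentinel and one grey level per statistical iteration, pairing a `ListedColormap` with a `BoundaryNorm`. A self-contained sketch of that construction; the value passed for `stats_iter` is arbitrary:

```python
from matplotlib.colors import BoundaryNorm, ListedColormap


def iteration_colormap(stats_iter):
    """Red for the unseen sentinel, then stats_iter + 1 grey levels going
    from black (always wrong) to white (always right)."""
    colors = ["red", "0.0"] + [str((i + 1) / stats_iter) for i in range(stats_iter)]
    cmap = ListedColormap(colors)
    bounds = [-100 * stats_iter - 0.5, -0.5] + [i + 0.5 for i in range(stats_iter + 1)]
    norm = BoundaryNorm(bounds, cmap.N)
    return cmap, norm


cmap, norm = iteration_colormap(stats_iter=5)
```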
- """ - logging.debug("Start:\t Analyzing all results") - iter_results = {"metrics_scores": [i for i in range(stats_iter)], - "class_metrics_scores": [i for i in range(stats_iter)], - "example_errors": [i for i in range(stats_iter)], - "feature_importances": [i for i in range(stats_iter)], - "durations":[i for i in range(stats_iter)]} - flagged_tracebacks_list = [] - fig_errors = [] - for iter_index, result, tracebacks in results: - arguments = get_arguments(benchmark_argument_dictionaries, iter_index) - labels_names = list(arguments["labels_dictionary"].values()) - - metrics_scores, class_metric_scores = get_metrics_scores(metrics, result, labels_names) - example_errors = get_example_errors(labels, result) - feature_importances = get_feature_importances(result) - durations = get_duration(result) - directory = arguments["directory"] - - database_name = arguments["args"]["name"] - - - flagged_tracebacks_list += publish_tracebacks(directory, database_name, - labels_names, tracebacks, - iter_index) - res = publish_metrics_graphs(metrics_scores, directory, database_name, - labels_names, class_metric_scores) - publish_example_errors(example_errors, directory, database_name, - labels_names, example_ids, labels) - publish_feature_importances(feature_importances, directory, - database_name) - plot_durations(durations, directory, database_name) - - iter_results["metrics_scores"][iter_index] = metrics_scores - iter_results["class_metrics_scores"][iter_index] = class_metric_scores - iter_results["example_errors"][iter_index] = example_errors - iter_results["feature_importances"][iter_index] = feature_importances - iter_results["labels"] = labels - iter_results["durations"][iter_index] = durations - - logging.debug("Done:\t Analyzing all results") - - return res, iter_results, flagged_tracebacks_list, labels_names - - -def analyze_all(iter_results, stats_iter, directory, data_base_name, - example_ids, label_names): # pragma: no cover - """Used to format the results in order to plot the mean results on - the iterations""" - metrics_analysis, class_metrics_analysis, error_analysis, feature_importances, \ - feature_importances_stds, labels, duration_means, \ - duration_stds = format_previous_results(iter_results) - - results = publish_all_metrics_scores(metrics_analysis, class_metrics_analysis, - directory, - data_base_name, stats_iter, label_names) - publish_all_example_errors(error_analysis, directory, stats_iter, - example_ids, labels) - publish_feature_importances(feature_importances, directory, - data_base_name, feature_importances_stds) - plot_durations(duration_means, directory, data_base_name, duration_stds) - return results - -def get_arguments(benchmark_argument_dictionaries, iter_index): - r"""Used to get the arguments passed to the benchmark executing function - corresponding to the flag of an - experimentation. - - Parameters - ---------- - flag : list - The needed experimentation's flag. - benchmark_argument_dictionaries : list of dicts - The list of all the arguments passed to the benchmark executing - functions. - - Returns - ------- - benchmark_argument_dictionary : dict - All the arguments passed to the benchmark executing function for the - needed experimentation. 
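`get_arguments`, declared just above, merely returns the benchmark argument dictionary whose `flag` entry matches the current iteration index. A minimal equivalent; the dictionary keys follow the removed code, but raising on a missing flag is an addition of this sketch (the original falls through and returns `None`):

```python
def find_arguments(benchmark_argument_dictionaries, iter_index):
    """Return the argument dictionary registered for this statistical iteration."""
    for arguments in benchmark_argument_dictionaries:
        if arguments["flag"] == iter_index:
            return arguments
    raise ValueError("No arguments found for iteration {}".format(iter_index))


args = find_arguments([{"flag": 0, "args": {"name": "digits_doc"}},
                       {"flag": 1, "args": {"name": "digits_doc"}}],
                      iter_index=1)
```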
- """ - for benchmark_argument_dictionary in benchmark_argument_dictionaries: - if benchmark_argument_dictionary["flag"] == iter_index: - return benchmark_argument_dictionary - - -def format_previous_results(iter_results_lists): - """ - Formats each statistical iteration's result into a mean/std analysis for - the metrics and adds the errors of each statistical iteration. - - Parameters - ---------- - iter_results_lists : The raw results, for each statistical iteration i - contains - - biclass_results[i]["metrics_scores"] is a dictionary with a - pd.dataframe for each metrics - - biclass_results[i]["example_errors"], a dicaitonary with a np.array - for each classifier. - - Returns - ------- - metrics_analysis : The mean and std dataframes for each metrics - - error_analysis : A dictionary containing the added errors - arrays for each classifier - - """ - metrics_analysis = {} - class_metrics_analysis = {} - feature_importances_analysis = {} - feature_importances_stds = {} - - metric_concat_dict = {} - for iter_index, metrics_score in enumerate( - iter_results_lists["metrics_scores"]): - for metric_name, dataframe in metrics_score.items(): - if metric_name not in metric_concat_dict: - metric_concat_dict[metric_name] = dataframe - else: - metric_concat_dict[metric_name] = pd.concat( - [metric_concat_dict[metric_name], dataframe]) - - for metric_name, dataframe in metric_concat_dict.items(): - metrics_analysis[metric_name] = {} - metrics_analysis[metric_name][ - "mean"] = dataframe.groupby(dataframe.index).mean() - metrics_analysis[metric_name][ - "std"] = dataframe.groupby(dataframe.index).std(ddof=0) - - class_metric_concat_dict = {} - for iter_index, class_metrics_score in enumerate( - iter_results_lists["class_metrics_scores"]): - for metric_name, dataframe in class_metrics_score.items(): - if metric_name not in class_metric_concat_dict: - class_metric_concat_dict[metric_name] = dataframe - else: - class_metric_concat_dict[metric_name] = pd.concat( - [class_metric_concat_dict[metric_name], dataframe]) - - for metric_name, dataframe in class_metric_concat_dict.items(): - class_metrics_analysis[metric_name] = {} - class_metrics_analysis[metric_name][ - "mean"] = dataframe.groupby(dataframe.index).mean() - class_metrics_analysis[metric_name][ - "std"] = dataframe.groupby(dataframe.index).std(ddof=0) - - durations_df_concat = pd.DataFrame(dtype=float) - for iter_index, durations_df in enumerate(iter_results_lists["durations"]): - durations_df_concat = pd.concat((durations_df_concat, durations_df), - axis=1) - durations_df_concat = durations_df_concat.astype(float) - grouped_df = durations_df_concat.groupby(durations_df_concat.columns, axis=1) - duration_means = grouped_df.mean() - duration_stds = grouped_df.std() - - importance_concat_dict = {} - for iter_index, view_feature_importances in enumerate( - iter_results_lists["feature_importances"]): - for view_name, feature_importances in view_feature_importances.items(): - if view_name not in importance_concat_dict: - importance_concat_dict[view_name] = feature_importances - else: - importance_concat_dict[view_name] = pd.concat( - [importance_concat_dict[view_name], feature_importances]) - - for view_name, dataframe in importance_concat_dict.items(): - feature_importances_analysis[view_name] = dataframe.groupby( - dataframe.index).mean() - - feature_importances_stds[view_name] = dataframe.groupby( - dataframe.index).std(ddof=0) - - added_example_errors = {} - for example_errors in iter_results_lists["example_errors"]: - for classifier_name, 
errors in example_errors.items(): - if classifier_name not in added_example_errors: - added_example_errors[classifier_name] = errors - else: - added_example_errors[classifier_name] += errors - error_analysis = added_example_errors - return metrics_analysis, class_metrics_analysis ,error_analysis, \ - feature_importances_analysis, \ - feature_importances_stds, iter_results_lists["labels"], \ - duration_means, duration_stds diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/feature_importances.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/feature_importances.py deleted file mode 100644 index 459f664fb6231161e0e75a10ed3009e0dd27950c..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/feature_importances.py +++ /dev/null @@ -1,84 +0,0 @@ -import os -import plotly -import pandas as pd -import numpy as np - -from ..monoview.monoview_utils import MonoviewResult - - -def get_feature_importances(result, feature_names=None): - r"""Extracts the feature importance from the monoview results and stores - them in a dictionnary : - feature_importance[view_name] is a pandas.DataFrame of size n_feature*n_clf - containing a score of importance for each feature. - - Parameters - ---------- - result : list of results - - Returns - ------- - feature_importances : dict of pd.DataFrame - The dictionary containing all the feature importance for each view as - pandas DataFrames - """ - feature_importances = {} - for classifier_result in result: - if isinstance(classifier_result, MonoviewResult): - if classifier_result.view_name not in feature_importances: - feature_importances[classifier_result.view_name] = pd.DataFrame( - index=feature_names) - if hasattr(classifier_result.clf, 'feature_importances_'): - feature_importances[classifier_result.view_name][ - classifier_result.classifier_name] = classifier_result.clf.feature_importances_ - else: - feature_importances[classifier_result.view_name][ - classifier_result.classifier_name] = np.zeros( - classifier_result.n_features) - return feature_importances - -def publish_feature_importances(feature_importances, directory, database_name, - feature_stds=None): # pragma: no cover - for view_name, feature_importance in feature_importances.items(): - if not os.path.exists(os.path.join(directory, "feature_importances")): - os.mkdir(os.path.join(directory, "feature_importances")) - file_name = os.path.join(directory, "feature_importances", - database_name + "-" + view_name - + "-feature_importances") - if feature_stds is not None: - feature_std = feature_stds[view_name] - feature_std.to_csv(file_name + "_dataframe_stds.csv") - else: - feature_std = pd.DataFrame(data=np.zeros(feature_importance.shape), - index=feature_importance.index, - columns=feature_importance.columns) - plot_feature_importances(file_name, feature_importance, feature_std) - - -def plot_feature_importances(file_name, feature_importance, feature_std): # pragma: no cover - feature_importance.to_csv(file_name + "_dataframe.csv") - hover_text = [["-Feature :" + str(feature_name) + - "<br>-Classifier : " + classifier_name + - "<br>-Importance : " + str( - feature_importance.loc[feature_name][classifier_name]) + - "<br>-STD : " + str( - feature_std.loc[feature_name][classifier_name]) - for classifier_name in list(feature_importance.columns)] - for feature_name in list(feature_importance.index)] - fig = plotly.graph_objs.Figure(data=plotly.graph_objs.Heatmap( - x=list(feature_importance.columns), - 
y=list(feature_importance.index), - z=feature_importance.values, - text=hover_text, - hoverinfo=["text"], - colorscale="Greys", - reversescale=False)) - fig.update_layout( - xaxis={"showgrid": False, "showticklabels": False, "ticks": ''}, - yaxis={"showgrid": False, "showticklabels": False, "ticks": ''}) - fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', - plot_bgcolor='rgba(0,0,0,0)') - plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False) - - del fig - diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/metric_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/metric_analysis.py deleted file mode 100644 index fff1e36511fd9ac1952ce6af6b5d7e801ff0728b..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/metric_analysis.py +++ /dev/null @@ -1,393 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -import os -import pandas as pd -import plotly -import logging - -from ..utils.organization import secure_file_path - - -def get_metrics_scores(metrics, results, label_names): - r"""Used to extract metrics scores in case of classification - - Parameters - ---------- - metrics : dict - The metrics names with configuration metrics[i][0] = name of metric i - results : list of MonoviewResult and MultiviewResults objects - A list containing all the results for all the monoview experimentations. - - Returns - ------- - metricsScores : dict of dict of list - Regroups all the scores for each metrics for each classifier and for - the train and test sets. - organized as : - -`metricScores[metric_name]["classifiers_names"]` is a list of all the - classifiers available for this metric, - -`metricScores[metric_name]["train_scores"]` is a list of all the - available classifiers scores on the train set, - -`metricScores[metric_name]["test_scores"]` is a list of all the - available classifiers scores on the test set. - """ - classifier_names = [] - classifier_names = [classifier_result.get_classifier_name() - for classifier_result in results - if classifier_result.get_classifier_name() - not in classifier_names] - metrics_scores = dict((metric, pd.DataFrame(data=np.zeros((2, - len( - classifier_names))), - index=["train", "test"], - columns=classifier_names)) - for metric in metrics.keys()) - class_metric_scores = dict((metric, pd.DataFrame( - index=pd.MultiIndex.from_product([["train", "test"], label_names]), - columns=classifier_names, dtype=float)) - for metric in metrics) - - for metric in metrics.keys(): - for classifier_result in results: - metrics_scores[metric].loc[ - "train", classifier_result.get_classifier_name()] = \ - classifier_result.metrics_scores[metric][0] - metrics_scores[metric].loc[ - "test", classifier_result.get_classifier_name()] = \ - classifier_result.metrics_scores[metric][1] - for label_index, label_name in enumerate(label_names): - class_metric_scores[metric].loc[( - "train", label_name),classifier_result.get_classifier_name()] = \ - classifier_result.class_metric_scores[metric][0][label_index] - class_metric_scores[metric].loc[( - "test", label_name), classifier_result.get_classifier_name()] = \ - classifier_result.class_metric_scores[metric][1][label_index] - - return metrics_scores, class_metric_scores - - -def publish_metrics_graphs(metrics_scores, directory, database_name, - labels_names, class_metric_scores): # pragma: no cover - r"""Used to sort the results (names and both scores) in descending test - score order. 
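`get_metrics_scores`, part of the metric-analysis module removed here, gathers every classifier's train and test score into one DataFrame per metric, plus a per-class DataFrame indexed by a (set, label) MultiIndex. A reduced sketch of that table layout; classifier names, labels and score values are invented for the example:

```python
import numpy as np
import pandas as pd

classifier_names = ["decision_tree", "adaboost"]
label_names = ["healthy", "sick"]

# Train/test scores for one metric, one column per classifier.
scores = pd.DataFrame(np.zeros((2, len(classifier_names))),
                      index=["train", "test"], columns=classifier_names)
scores.loc["train", "decision_tree"] = 0.92
scores.loc["test", "decision_tree"] = 0.81

# Per-class scores, indexed by (set, label) pairs.
class_scores = pd.DataFrame(
    index=pd.MultiIndex.from_product([["train", "test"], label_names]),
    columns=classifier_names, dtype=float)
class_scores.loc[("test", "healthy"), "decision_tree"] = 0.84
print(class_scores)
```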
- - Parameters - ---------- - metrics_scores : dict of dicts of lists or np.arrays - Keys : The names of the metrics. - Values : The scores and names of each classifier . - directory : str - The path to the directory where the figures will be saved. - database_name : str - The name of the database on which the experiments where conducted. - labels_names : list of strs - The name corresponding to each numerical label. - - Returns - ------- - results - """ - results = [] - for metric_name in metrics_scores.keys(): - logging.debug( - "Start:\t Score graph generation for " + metric_name) - train_scores, test_scores, classifier_names, \ - file_name, nb_results, results,\ - class_test_scores = init_plot(results, metric_name, - metrics_scores[metric_name], - directory, - database_name, - class_metric_scores[metric_name]) - - plot_metric_scores(train_scores, test_scores, classifier_names, - nb_results, metric_name, file_name, - tag=" " + " vs ".join(labels_names)) - - class_file_name = os.path.join(directory, database_name + "-" - + metric_name+"-class") - plot_class_metric_scores(class_test_scores, class_file_name, - labels_names, classifier_names, metric_name) - logging.debug( - "Done:\t Score graph generation for " + metric_name) - return results - - -def publish_all_metrics_scores(iter_results, class_iter_results, directory, - data_base_name, stats_iter, label_names, - min_size=10): # pragma: no cover - results = [] - secure_file_path(os.path.join(directory, "a")) - - for metric_name, scores in iter_results.items(): - train = np.array(scores["mean"].loc["train"]) - test = np.array(scores["mean"].loc["test"]) - classifier_names = np.array(scores["mean"].columns) - train_std = np.array(scores["std"].loc["train"]) - test_std = np.array(scores["std"].loc["test"]) - - file_name = os.path.join(directory, data_base_name + "-mean_on_" + str( - stats_iter) + "_iter-" + metric_name) - nb_results = classifier_names.shape[0] - - plot_metric_scores(train, test, classifier_names, nb_results, - metric_name, file_name, tag=" averaged", - train_STDs=train_std, test_STDs=test_std) - results += [[classifier_name, metric_name, test_mean, test_std] - for classifier_name, test_mean, test_std - in zip(classifier_names, test, test_std)] - - for metric_name, scores in class_iter_results.items(): - test = np.array([np.array(scores["mean"].iloc[i, :]) for i in range(scores["mean"].shape[0]) if scores["mean"].iloc[i, :].name[0]=='test']) - classifier_names = np.array(scores["mean"].columns) - test_std = np.array([np.array(scores["std"].iloc[i, :]) for i in range(scores["std"].shape[0]) if scores["std"].iloc[i, :].name[0]=='test']) - - file_name = os.path.join(directory, data_base_name + "-mean_on_" + str( - stats_iter) + "_iter-" + metric_name+"-class") - - plot_class_metric_scores(test, file_name, label_names, classifier_names, metric_name, stds=test_std, tag="averaged") - return results - -def init_plot(results, metric_name, metric_dataframe, - directory, database_name, class_metric_scores): - train = np.array(metric_dataframe.loc["train"]) - test = np.array(metric_dataframe.loc["test"]) - class_test = np.array(class_metric_scores.loc["test"]) - classifier_names = np.array(metric_dataframe.columns) - - nb_results = metric_dataframe.shape[1] - - file_name = os.path.join(directory, database_name + "-" + metric_name) - - results += [[classifiers_name, metric_name, test_mean, test_std, class_mean] - for classifiers_name, test_mean, class_mean, test_std in - zip(classifier_names, test, np.transpose(class_test), - 
np.zeros(len(test)))] - return train, test, classifier_names, file_name, nb_results, results, \ - class_test - - -def plot_metric_scores(train_scores, test_scores, names, nb_results, - metric_name, - file_name, - tag="", train_STDs=None, test_STDs=None, - use_plotly=True): # pragma: no cover - r"""Used to plot and save the score barplot for a specific metric. - - Parameters - ---------- - train_scores : list or np.array of floats - The scores of each classifier on the training set. - test_scores : list or np.array of floats - The scores of each classifier on the testing set. - names : list or np.array of strs - The names of all the classifiers. - nb_results: int - The number of classifiers to plot. - metric_name : str - The plotted metric's name - file_name : str - The name of the file where the figure will be saved. - tag : str - Some text to personalize the title, must start with a whitespace. - train_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the training set. - test_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the testing set. - - Returns - ------- - """ - - figKW, barWidth = get_fig_size(nb_results) - - names, train_scores, test_scores, train_STDs, test_STDs = sort_by_test_score( - train_scores, test_scores, names, - train_STDs, test_STDs) - - f, ax = plt.subplots(nrows=1, ncols=1, **figKW) - ax.set_title(metric_name + "\n" + tag + " scores for each classifier") - - rects = ax.bar(range(nb_results), test_scores, barWidth, color="0.1", - yerr=test_STDs) - rect2 = ax.bar(np.arange(nb_results) + barWidth, train_scores, barWidth, - color="0.8", yerr=train_STDs) - autolabel(rects, ax, set=1, std=test_STDs) - autolabel(rect2, ax, set=2, std=train_STDs) - ax.legend((rects[0], rect2[0]), ('Test', 'Train')) - ax.set_ylim(-0.1, 1.1) - ax.set_xticks(np.arange(nb_results) + barWidth / 2) - ax.set_xticklabels(names, rotation="vertical") - - try: - plt.tight_layout() - except: - pass - f.savefig(file_name + '.png', transparent=True) - plt.close() - import pandas as pd - if train_STDs is None: - dataframe = pd.DataFrame(np.transpose(np.concatenate(( - train_scores.reshape((train_scores.shape[0], 1)), - test_scores.reshape((train_scores.shape[0], 1))), axis=1)), - columns=names, index=["Train", "Test"]) - else: - dataframe = pd.DataFrame(np.transpose(np.concatenate(( - train_scores.reshape((train_scores.shape[0], 1)), - train_STDs.reshape((train_scores.shape[0], 1)), - test_scores.reshape((train_scores.shape[0], 1)), - test_STDs.reshape((train_scores.shape[0], 1))), axis=1)), - columns=names, index=["Train", "Train STD", "Test", "Test STD"]) - dataframe.to_csv(file_name + ".csv") - if use_plotly: - fig = plotly.graph_objs.Figure() - fig.add_trace(plotly.graph_objs.Bar( - name='Train', - x=names, y=train_scores, - error_y=dict(type='data', array=train_STDs), - marker_color="lightgrey", - )) - fig.add_trace(plotly.graph_objs.Bar( - name='Test', - x=names, y=test_scores, - error_y=dict(type='data', array=test_STDs), - marker_color="black", - )) - - fig.update_layout( - title=metric_name + "<br>" + tag + " scores for each classifier") - fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', - plot_bgcolor='rgba(0,0,0,0)') - plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False) - del fig - - -def plot_class_metric_scores(class_test_scores, class_file_name, - labels_names, classifier_names, metric_name, - stds=None, tag=""): # pragma: no cover - fig = 
plotly.graph_objs.Figure() - for lab_index, scores in enumerate(class_test_scores): - if stds is None: - std = None - else: - std = stds[lab_index] - fig.add_trace(plotly.graph_objs.Bar( - name=labels_names[lab_index], - x=classifier_names, y=scores, - error_y=dict(type='data', array=std), - )) - fig.update_layout( - title=metric_name + "<br>" + tag + " scores for each classifier") - fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', - plot_bgcolor='rgba(0,0,0,0)') - plotly.offline.plot(fig, filename=class_file_name + ".html", auto_open=False) - del fig - - -def get_fig_size(nb_results, min_size=15, multiplier=1.0, bar_width=0.35): - r"""Used to get the image size to save the figure and the bar width, depending on the number of scores to plot. - - Parameters - ---------- - nb_results : int - The number of couple of bar to plot. - min_size : int - The minimum size of the image, if there are few classifiers to plot. - multiplier : float - The ratio between the image size and the number of classifiers. - bar_width : float - The width of the bars in the figure. Mainly here to centralize bar_width. - - Returns - ------- - fig_kwargs : dict of arguments - The argument restraining the size of the figure, usable directly in the `subplots` function of - `matplotlib.pyplot`. - bar_width : float - The width of the bars in the figure. Mainly here to centralize bar_width. - """ - size = nb_results * multiplier - if size < min_size: - size = min_size - fig_kwargs = {"figsize": (size, size / 3)} - return fig_kwargs, bar_width - - -def autolabel(rects, ax, set=1, std=None): # pragma: no cover - r"""Used to print the score below the bars. - - Parameters - ---------- - rects : pyplot bar object - THe bars. - ax : pyplot ax object - The ax. - set : integer - 1 means the test scores, anything else means the train score - std: None or array - The standard deviations in the case of statsIter results. - - Returns - ------- - """ - if set == 1: - text_height = -0.05 - weight = "bold" - else: - text_height = -0.07 - weight = "normal" - for rectIndex, rect in enumerate(rects): - height = rect.get_height() - if std is not None: - ax.text(rect.get_x() + rect.get_width() / 2., text_height, - "%.2f" % height + u'\u00B1' + "%.2f" % std[rectIndex], - weight=weight, - ha='center', va='bottom', size="x-small") - else: - ax.text(rect.get_x() + rect.get_width() / 2., text_height, - "%.2f" % height, weight=weight, - ha='center', va='bottom', size="small") - - -def sort_by_test_score(train_scores, test_scores, names, train_STDs=None, - test_STDs=None): - r"""Used to sort the results (names and both scores) in descending test score order. - - Parameters - ---------- - train_scores : np.array of floats - The scores of each classifier on the training set. - test_scores : np.array of floats - The scores of each classifier on the testing set. - names : np.array of strs - The names of all the classifiers. - train_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the training set. - test_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the testing set. - - Returns - ------- - sorted_names : np.array of strs - The names of all the classifiers, sorted in descending test score order. - sorted_train_scores : np.array of floats - The scores of each classifier on the training set, sorted in descending test score order. 
- sorted_test_scores : np.array of floats - The scores of each classifier on the testing set, sorted in descending test score order. - sorted_train_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the training set, - sorted in descending test score order. - sorted_test_STDs : np.array of floats or None - The array containing the standard deviations for the averaged scores on the testing set, - sorted in descending test score order. - """ - sorted_indices = np.argsort(test_scores) - sorted_test_scores = test_scores[sorted_indices] - sorted_train_scores = train_scores[sorted_indices] - sorted_names = names[sorted_indices] - if train_STDs is not None and test_STDs is not None: - sorted_train_STDs = train_STDs[sorted_indices] - sorted_test_STDs = test_STDs[sorted_indices] - else: - sorted_train_STDs = None - sorted_test_STDs = None - return sorted_names, sorted_train_scores, sorted_test_scores, sorted_train_STDs, sorted_test_STDs \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/noise_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/noise_analysis.py deleted file mode 100644 index b4fc81215d5b50564d98108262a332adf617932c..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/noise_analysis.py +++ /dev/null @@ -1,56 +0,0 @@ -# -# import numpy as np -# import pandas as pd -# import matplotlib.pyplot as plt -# import os -# from matplotlib.patches import Patch -# -# -# def plot_results_noise(directory, noise_results, metric_to_plot, name, -# width=0.1): -# avail_colors = ["tab:blue", "tab:orange", "tab:brown", "tab:gray", -# "tab:olive", "tab:red", ] -# colors = {} -# lengend_patches = [] -# noise_levels = np.array([noise_level for noise_level, _ in noise_results]) -# df = pd.DataFrame( -# columns=['noise_level', 'classifier_name', 'mean_score', 'score_std'], ) -# if len(noise_results) > 1: -# width = np.min(np.diff(noise_levels)) -# for noise_level, noise_result in noise_results: -# classifiers_names, meaned_metrics, metric_stds = [], [], [] -# for noise_result in noise_result: -# classifier_name = noise_result[0].split("-")[0] -# if noise_result[1] is metric_to_plot: -# classifiers_names.append(classifier_name) -# meaned_metrics.append(noise_result[2]) -# metric_stds.append(noise_result[3]) -# if classifier_name not in colors: -# try: -# colors[classifier_name] = avail_colors.pop(0) -# except IndexError: -# colors[classifier_name] = "k" -# classifiers_names, meaned_metrics, metric_stds = np.array( -# classifiers_names), np.array(meaned_metrics), np.array(metric_stds) -# sorted_indices = np.argsort(-meaned_metrics) -# for index in sorted_indices: -# row = pd.DataFrame( -# {'noise_level': noise_level, -# 'classifier_name': classifiers_names[index], -# 'mean_score': meaned_metrics[index], -# 'score_std': metric_stds[index]}, index=[0]) -# df = pd.concat([df, row]) -# plt.bar(noise_level, meaned_metrics[index], yerr=metric_stds[index], -# width=0.5 * width, label=classifiers_names[index], -# color=colors[classifiers_names[index]]) -# for classifier_name, color in colors.items(): -# lengend_patches.append(Patch(facecolor=color, label=classifier_name)) -# plt.legend(handles=lengend_patches, loc='lower center', -# bbox_to_anchor=(0.5, 1.05), ncol=2) -# plt.ylabel(metric_to_plot) -# plt.title(name) -# plt.xticks(noise_levels) -# plt.xlabel("Noise level") -# plt.savefig(os.path.join(directory, name + 
"_noise_analysis.png")) -# plt.close() -# df.to_csv(os.path.join(directory, name + "_noise_analysis.csv")) diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis/tracebacks_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis/tracebacks_analysis.py deleted file mode 100644 index 329a27f6fe98c23b94b1053847c7482165d970d4..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis/tracebacks_analysis.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - - -def publish_tracebacks(directory, database_name, labels_names, tracebacks, - iter_index): - if tracebacks: - with open(os.path.join(directory, database_name + - "-iter" + str(iter_index) + - "-tacebacks.txt"), - "w") as traceback_file: - failed_list = save_dict_to_text(tracebacks, traceback_file) - flagged_list = [_ + "-iter" + str(iter_index) for _ in failed_list] - else: - flagged_list = {} - return flagged_list - - -def save_dict_to_text(dictionnary, output_file): - # TODO : smarter way must exist - output_file.write("Failed algorithms : \n\t" + ",\n\t".join( - dictionnary.keys()) + ".\n\n\n") - for key, value in dictionnary.items(): - output_file.write(key) - output_file.write("\n\n") - output_file.write(value) - output_file.write("\n\n\n") - return dictionnary.keys() - - -def save_failed(failed_list, directory): - with open(os.path.join(directory, "failed_algorithms.txt"), - "w") as failed_file: - failed_file.write( - "The following algorithms sent an error, the tracebacks are stored " - "in the coressponding directory :\n") - failed_file.write(", \n".join(failed_list) + ".") diff --git a/multiview_platform/mono_multi_view_classifiers/utils/__init__.py b/multiview_platform/mono_multi_view_classifiers/utils/__init__.py deleted file mode 100644 index e0473b520b385389e967e567261bdb95a360aa37..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . 
import dataset, execution, hyper_parameter_search, transformations diff --git a/multiview_platform/mono_multi_view_classifiers/utils/base.py b/multiview_platform/mono_multi_view_classifiers/utils/base.py deleted file mode 100644 index 34894b5a6892d47b1a3843c55f7dfc30b84e97b2..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/base.py +++ /dev/null @@ -1,377 +0,0 @@ -import numpy as np -from sklearn.base import BaseEstimator -from abc import abstractmethod -from datetime import timedelta as hms -from tabulate import tabulate -from sklearn.metrics import confusion_matrix as confusion -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier - -from multiview_platform.mono_multi_view_classifiers import metrics - - -class BaseClassifier(BaseEstimator, ): - - def gen_best_params(self, detector): - """ - return best parameters of detector - Parameters - ---------- - detector : - - Returns - ------- - best param : dictionary with param name as key and best parameters - value - """ - return dict( - (param_name, detector.best_params_[param_name]) for param_name in - self.param_names) - - def gen_params_from_detector(self, detector): - if self.classed_params: - classed_dict = dict((classed_param, get_names( - detector.cv_results_["param_" + classed_param])) - for classed_param in self.classed_params) - if self.param_names: - return [(param_name, - np.array(detector.cv_results_["param_" + param_name])) - if param_name not in self.classed_params else ( - param_name, classed_dict[param_name]) - for param_name in self.param_names] - else: - return [()] - - def gen_distribs(self): - return dict((param_name, distrib) for param_name, distrib in - zip(self.param_names, self.distribs)) - - def params_to_string(self): - """ - Formats the parameters of the classifier as a string - """ - return ", ".join( - [param_name + " : " + self.to_str(param_name) for param_name in - self.param_names]) - - def get_config(self): - """ - Generates a string to containing all the information about the - classifier's configuration - """ - if self.param_names: - return self.__class__.__name__ + " with " + self.params_to_string() - else: - return self.__class__.__name__ + " with no config." 
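The `BaseClassifier` helpers above rely on each classifier declaring `param_names` and `distribs`, from which hyper-parameter distributions and a readable configuration string are derived. A toy class showing how those two attributes are typically consumed (the class, its parameters and values are invented; this is not one of the platform's classifiers):

```python
class ToyClassifier:
    """Minimal stand-in mimicking the param_names / distribs convention."""

    def __init__(self, max_depth=3, criterion="gini"):
        self.max_depth = max_depth
        self.criterion = criterion
        self.param_names = ["max_depth", "criterion"]
        self.distribs = [[1, 2, 3, 5, 10], ["gini", "entropy"]]

    def get_params(self):
        return {"max_depth": self.max_depth, "criterion": self.criterion}

    def gen_distribs(self):
        # Pair each tunable parameter name with its search distribution.
        return dict(zip(self.param_names, self.distribs))

    def get_config(self):
        params = ", ".join("{} : {}".format(name, self.get_params()[name])
                           for name in self.param_names)
        return "{} with {}".format(self.__class__.__name__, params)


clf = ToyClassifier(max_depth=5)
print(clf.gen_distribs())  # {'max_depth': [1, 2, 3, 5, 10], 'criterion': ['gini', 'entropy']}
print(clf.get_config())    # ToyClassifier with max_depth : 5, criterion : gini
```

Keeping the tunable parameter names and their search distributions in one declaration is what lets the hyper-parameter search and the report generation share the same source of truth.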
- - def get_base_estimator(self, base_estimator, estimator_config): - if estimator_config is None: - estimator_config = {} - if base_estimator is None: - return DecisionTreeClassifier(**estimator_config) - if isinstance(base_estimator, str): # pragma: no cover - if base_estimator == "DecisionTreeClassifier": - return DecisionTreeClassifier(**estimator_config) - elif base_estimator == "AdaboostClassifier": - return AdaBoostClassifier(**estimator_config) - elif base_estimator == "RandomForestClassifier": - return RandomForestClassifier(**estimator_config) - else: - raise ValueError('Base estimator string {} does not match an available classifier.'.format(base_estimator)) - elif isinstance(base_estimator, BaseEstimator): - return base_estimator.set_params(**estimator_config) - else: - raise ValueError('base_estimator must be either a string or a BaseEstimator child class, it is {}'.format(type(base_estimator))) - - - def to_str(self, param_name): - """ - Formats a parameter into a string - """ - if param_name in self.weird_strings: - string = "" - if "class_name" in self.weird_strings[param_name]: - string += self.get_params()[param_name].__class__.__name__ - if "config" in self.weird_strings[param_name]: - string += "( with " + self.get_params()[ - param_name].params_to_string() + ")" - else: - return str(self.get_params()[param_name]) - - def get_interpretation(self, directory, base_file_name, y_test, - multi_class=False): - """ - Base method that returns an empty string if there is not interpretation - method in the classifier's module - """ - return "" - - def accepts_multi_class(self, random_state, n_samples=10, dim=2, - n_classes=3): - """ - Base function to test if the classifier accepts a multiclass task. - It is highly recommended to overwrite it with a simple method that - returns True or False in the classifier's module, as it will speed up - the benchmark - """ - if int(n_samples / n_classes) < 1: - raise ValueError( - "n_samples ({}) / n_class ({}) must be over 1".format( - n_samples, - n_classes)) - # if hasattr(self, "accepts_mutli_class"): - # return self.accepts_multi_class - fake_mc_X = random_state.randint(low=0, high=101, - size=(n_samples, dim)) - fake_mc_y = [class_index - for _ in range(int(n_samples / n_classes)) - for class_index in range(n_classes)] - fake_mc_y += [0 for _ in range(n_samples % n_classes)] - fake_mc_y = np.asarray(fake_mc_y) - try: - self.fit(fake_mc_X, fake_mc_y) - # self.predict(fake_mc_X) - return True - except ValueError: - return False - - -def get_names(classed_list): - return np.array([object_.__class__.__name__ for object_ in classed_list]) - - -def get_metric(metrics_dict): - """ - Fetches the metric module in the metrics package - """ - for metric_name, metric_kwargs in metrics_dict.items(): - if metric_name.endswith("*"): - princ_metric_name = metric_name[:-1] - princ_metric_kwargs = metric_kwargs - metric_module = getattr(metrics, princ_metric_name) - return metric_module, princ_metric_kwargs - - -class ResultAnalyser(): - """ - A shared result analysis tool for mono and multiview classifiers. - The main utility of this class is to generate a txt file summarizing - the results and possible interpretation for the classifier. 
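`get_base_estimator`, removed above, accepts either nothing, a class-name string, or an already-built estimator, and always returns a configured scikit-learn estimator. A standalone version of that dispatch; the error messages mirror the removed code, while the name-to-class mapping is a presentational choice of this sketch:

```python
from sklearn.base import BaseEstimator
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


def build_base_estimator(base_estimator=None, estimator_config=None):
    """Resolve None, a string or an estimator instance into a configured estimator."""
    estimator_config = estimator_config or {}
    if base_estimator is None:
        return DecisionTreeClassifier(**estimator_config)
    if isinstance(base_estimator, str):
        available = {"DecisionTreeClassifier": DecisionTreeClassifier,
                     "AdaboostClassifier": AdaBoostClassifier,
                     "RandomForestClassifier": RandomForestClassifier}
        if base_estimator not in available:
            raise ValueError("Base estimator string {} does not match an "
                             "available classifier.".format(base_estimator))
        return available[base_estimator](**estimator_config)
    if isinstance(base_estimator, BaseEstimator):
        return base_estimator.set_params(**estimator_config)
    raise ValueError("base_estimator must be either a string or a BaseEstimator "
                     "child class, it is {}".format(type(base_estimator)))


clf = build_base_estimator("RandomForestClassifier", {"n_estimators": 50})
```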
- """ - - def __init__(self, classifier, classification_indices, k_folds, - hps_method, metrics_dict, n_iter, class_label_names, - pred, directory, base_file_name, labels, - database_name, nb_cores, duration): - """ - - Parameters - ---------- - classifier: estimator used for classification - - classification_indices: list of indices for train test sets - - k_folds: the sklearn StratifiedkFolds object - - hps_method: string naming the hyper-parameter search method - - metrics_dict: list of the metrics to compute on the results - - n_iter: number of HPS iterations - - class_label_names: list of the names of the labels - - train_pred: classifier's prediction on the training set - - test_pred: classifier's prediction on the testing set - - directory: directory where to save the result analysis - - labels: the full labels array (Y in sklearn) - - database_name: the name of the database - - nb_cores: number of cores/threads use for the classification - - duration: duration of the classification - """ - self.classifier = classifier - self.train_indices, self.test_indices = classification_indices - self.k_folds = k_folds - self.hps_method = hps_method - self.metrics_dict = metrics_dict - self.n_iter = n_iter - self.class_label_names = class_label_names - self.pred = pred - self.directory = directory - self.base_file_name = base_file_name - self.labels = labels - self.string_analysis = "" - self.database_name = database_name - self.nb_cores = nb_cores - self.duration = duration - self.metric_scores = {} - self.class_metric_scores = {} - - def get_all_metrics_scores(self, ): - """ - Get the scores for all the metrics in the list - Returns - ------- - """ - for metric, metric_args in self.metrics_dict.items(): - class_train_scores, class_test_scores, train_score, test_score\ - = self.get_metric_score(metric, metric_args) - self.class_metric_scores[metric] = (class_train_scores, - class_test_scores) - self.metric_scores[metric] = (train_score, test_score) - - def get_metric_score(self, metric, metric_kwargs): - """ - Get the train and test scores for a specific metric and its arguments - - Parameters - ---------- - - metric : name of the metric, must be implemented in metrics - - metric_kwargs : the dictionary containing the arguments for the metric. 
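`get_metric_score`, whose signature and docstring appear just above (its body follows), masks the train and test indices per label value before scoring each subset, which is how the per-class scores are produced. A compact sketch of that masking, using scikit-learn's `accuracy_score` as a stand-in for the platform's metric modules:

```python
import numpy as np
from sklearn.metrics import accuracy_score


def per_class_scores(labels, pred, train_indices, test_indices, metric=accuracy_score):
    """Score each label value separately on the train and the test split."""
    class_train, class_test = [], []
    for label_value in np.unique(labels):
        train_mask = train_indices[np.where(labels[train_indices] == label_value)[0]]
        test_mask = test_indices[np.where(labels[test_indices] == label_value)[0]]
        class_train.append(metric(labels[train_mask], pred[train_mask]))
        class_test.append(metric(labels[test_mask], pred[test_mask]))
    return class_train, class_test


labels = np.array([0, 0, 1, 1, 0, 1])
pred = np.array([0, 1, 1, 1, 0, 0])
train_idx, test_idx = np.array([0, 1, 2]), np.array([3, 4, 5])
print(per_class_scores(labels, pred, train_idx, test_idx))  # ([0.5, 1.0], [1.0, 0.5])
```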
- - Returns - ------- - train_score, test_score - """ - if not metric.endswith("*"): - metric_module = getattr(metrics, metric) - else: - metric_module = getattr(metrics, metric[:-1]) - class_train_scores = [] - class_test_scores = [] - for label_value in np.unique(self.labels): - train_example_indices = self.train_indices[np.where(self.labels[self.train_indices]==label_value)[0]] - test_example_indices = self.test_indices[np.where(self.labels[self.test_indices] == label_value)[0]] - class_train_scores.append(metric_module.score(y_true=self.labels[train_example_indices], - y_pred=self.pred[train_example_indices], - **metric_kwargs)) - class_test_scores.append(metric_module.score(y_true=self.labels[test_example_indices], - y_pred=self.pred[test_example_indices], - **metric_kwargs)) - train_score = metric_module.score(y_true=self.labels[self.train_indices], - y_pred=self.pred[self.train_indices], - **metric_kwargs) - test_score = metric_module.score(y_true=self.labels[self.test_indices], - y_pred=self.pred[self.test_indices], - **metric_kwargs) - return class_train_scores, class_test_scores, train_score, test_score - - def print_metric_score(self,): - """ - Generates a string, formatting the metrics configuration and scores - - Parameters - ---------- - metric_scores : dictionary of train_score, test_score for each metric - - metric_list : list of metrics - - Returns - ------- - metric_score_string string formatting all metric results - """ - metric_score_string = "\n\n" - for metric, metric_kwargs in self.metrics_dict.items(): - if metric.endswith("*"): - metric_module = getattr(metrics, metric[:-1]) - else: - metric_module = getattr(metrics, metric) - metric_score_string += "\tFor {} : ".format(metric_module.get_config( - **metric_kwargs)) - metric_score_string += "\n\t\t- Score on train : {}".format(self.metric_scores[metric][0]) - metric_score_string += "\n\t\t- Score on test : {}".format(self.metric_scores[metric][1]) - metric_score_string += "\n\n" - metric_score_string += "Test set confusion matrix : \n\n" - self.confusion_matrix = confusion(y_true=self.labels[self.test_indices], y_pred=self.pred[self.test_indices]) - formatted_conf = [[label_name]+list(row) for label_name, row in zip(self.class_label_names, self.confusion_matrix)] - metric_score_string+=tabulate(formatted_conf, headers= ['']+self.class_label_names, tablefmt='fancy_grid') - metric_score_string += "\n\n" - return metric_score_string - - @abstractmethod - def get_view_specific_info(self): # pragma: no cover - pass - - @abstractmethod - def get_base_string(self): # pragma: no cover - pass - - def get_db_config_string(self,): - """ - Generates a string, formatting all the information on the database - - Parameters - ---------- - - Returns - ------- - db_config_string string, formatting all the information on the database - """ - learning_ratio = len(self.train_indices) / ( - len(self.train_indices) + len(self.test_indices)) - db_config_string = "Database configuration : \n" - db_config_string += "\t- Database name : {}\n".format(self.database_name) - db_config_string += self.get_view_specific_info() - db_config_string += "\t- Learning Rate : {}\n".format(learning_ratio) - db_config_string += "\t- Labels used : " + ", ".join( - self.class_label_names) + "\n" - db_config_string += "\t- Number of cross validation folds : {}\n\n".format(self.k_folds.n_splits) - return db_config_string - - def get_classifier_config_string(self, ): - """ - Formats the information about the classifier and its configuration - - Returns - ------- 
- A string explaining the classifier's configuration - """ - classifier_config_string = "Classifier configuration : \n" - classifier_config_string += "\t- " + self.classifier.get_config()+ "\n" - classifier_config_string += "\t- Executed on {} core(s) \n".format( - self.nb_cores) - - if self.hps_method.startswith('randomized_search'): - classifier_config_string += "\t- Got configuration using randomized search with {} iterations \n" .format(self.n_iter) - return classifier_config_string - - def analyze(self, ): - """ - Main function used in the monoview and multiview classification scripts - - Returns - ------- - string_analysis : a string that will be stored in the log and in a txt - file - image_analysis : a list of images to save - metric_scores : a dictionary of {metric: (train_score, test_score)} - used in later analysis. - """ - string_analysis = self.get_base_string() - string_analysis += self.get_db_config_string() - string_analysis += self.get_classifier_config_string() - self.get_all_metrics_scores() - string_analysis += self.print_metric_score() - string_analysis += "\n\n Classification took {}".format(hms(seconds=int(self.duration))) - string_analysis += "\n\n Classifier Interpretation : \n" - string_analysis += self.classifier.get_interpretation( - self.directory, self.base_file_name, - self.labels[self.test_indices]) - image_analysis = {} - return string_analysis, image_analysis, self.metric_scores, \ - self.class_metric_scores, self.confusion_matrix - - -base_boosting_estimators = [DecisionTreeClassifier(max_depth=1), - DecisionTreeClassifier(max_depth=2), - DecisionTreeClassifier(max_depth=3), - DecisionTreeClassifier(max_depth=4), - DecisionTreeClassifier(max_depth=5), ] \ No newline at end of file diff --git a/multiview_platform/mono_multi_view_classifiers/utils/configuration.py b/multiview_platform/mono_multi_view_classifiers/utils/configuration.py deleted file mode 100644 index fcd62c6d94ef3f24dec3dc80aa7a992400b7fa67..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/configuration.py +++ /dev/null @@ -1,96 +0,0 @@ -import os - -import yaml - - -def get_the_args(path_to_config_file="../config_files/config.yml"): - """ - The function for extracting the args for a '.yml' file. 
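`get_the_args`, introduced here, reads the benchmark configuration from a YAML file and completes it with defaults; the removed code does so by routing the parsed dictionary through `pass_default_config`'s keyword defaults. A compact equivalent using PyYAML and a plain dictionary merge, with example default values:

```python
import yaml

DEFAULTS = {"log": True, "nb_cores": 1, "stats_iter": 2,
            "metrics": {"accuracy_score": {}, "f1_score": {}}}


def load_config(path_to_config_file):
    """Read a YAML configuration file and fill in missing keys with defaults."""
    with open(path_to_config_file, "r") as stream:
        yaml_config = yaml.safe_load(stream) or {}
    config = dict(DEFAULTS)
    config.update(yaml_config)
    return config
```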
- - Parameters - ---------- - path_to_config_file : str, path to the yml file containing the configuration - - Returns - ------- - yaml_config : dict, the dictionary conaining the configuration for the - benchmark - - """ - with open(path_to_config_file, 'r') as stream: - yaml_config = yaml.safe_load(stream) - return pass_default_config(**yaml_config) - - -def pass_default_config(log=True, - name=["plausible", ], - label="_", - file_type=".hdf5", - views=None, - pathf="../data/", - nice=0, - random_state=42, - nb_cores=1, - full=True, - debug=False, - add_noise=False, - noise_std=0.0, - res_dir="../results/", - track_tracebacks=True, - split=0.49, - nb_folds=5, - nb_class=None, - classes=None, - type=["multiview", ], - algos_monoview=["all"], - algos_multiview=["svm_jumbo_fusion", ], - stats_iter=2, - metrics={"accuracy_score":{}, "f1_score":{}}, - metric_princ="accuracy_score", - hps_type="Random", - hps_iter=1, - hps_kwargs={'n_iter':10, "equivalent_draws":True}, - **kwargs): - """ - - :param log: - :param name: - :param label: - :param file_type: - :param views: - :param pathf: - :param nice: - :param random_state: - :param nb_cores: - :param full: - :param debug: - :param add_noise: - :param noise_std: - :param res_dir: - :param track_tracebacks: - :param split: - :param nb_folds: - :param nb_class: - :param classes: - :param type: - :param algos_monoview: - :param algos_multiview: - :param stats_iter: - :param metrics: - :param metric_princ: - :param hps_type: - :param hps_iter: - :return: - """ - args = dict( - (key, value) for key, value in locals().items() if key != "kwargs") - args = dict(args, **kwargs) - return args - - -def save_config(directory, arguments): - """ - Saves the config file in the result directory. - """ - with open(os.path.join(directory, "config_file.yml"), "w") as stream: - yaml.dump(arguments, stream) diff --git a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py deleted file mode 100644 index 00ea3aadd0a4237d32e85464042fd0cacb09abbc..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py +++ /dev/null @@ -1,769 +0,0 @@ -import logging -import os -import select -import sys -from abc import abstractmethod - -import h5py -import numpy as np -from scipy import sparse - -from .organization import secure_file_path - -class Dataset(): - - @abstractmethod - def get_nb_examples(self): # pragma: no cover - pass - - @abstractmethod - def get_v(self, view_index, example_indices=None): # pragma: no cover - pass - - @abstractmethod - def get_label_names(self, example_indices=None): # pragma: no cover - pass - - @abstractmethod - def get_labels(self, example_indices=None): # pragma: no cover - pass - - @abstractmethod - def filter(self, labels, label_names, example_indices, view_names, - path=None): # pragma: no cover - pass - - def init_example_indices(self, example_indices=None): - """If no example indices are provided, selects all the examples.""" - if example_indices is None: - return range(self.get_nb_examples()) - else: - return example_indices - - def get_shape(self, view_index=0, example_indices=None): - """ - Gets the shape of the needed view on the asked examples - - Parameters - ---------- - view_index : int - The index of the view to extract - example_indices : numpy.ndarray - The array containing the indices of the examples to extract. 
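Editor's note: the configuration helpers above amount to a YAML round-trip, loading the user file, overlaying it on the defaults, and dumping the merged dictionary next to the results. A hedged, self-contained sketch of that flow (file names and the chosen default keys are illustrative, not the platform's fixed paths):

```python
import yaml

defaults = {"log": True, "nb_cores": 1, "stats_iter": 2}   # a few of the defaults listed above

with open("config.yml", "r") as stream:
    user_config = yaml.safe_load(stream) or {}

merged = {**defaults, **user_config}      # user-provided values override the defaults

with open("saved_config.yml", "w") as stream:
    yaml.dump(merged, stream)
```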
- - Returns - ------- - Tuple containing the shape - - """ - return self.get_v(view_index, example_indices=example_indices).shape - - def to_numpy_array(self, example_indices=None, view_indices=None): - """ - To concatenate the needed views in one big numpy array while saving the - limits of each view in a list, to be able to retrieve them later. - - Parameters - ---------- - example_indices : array like, - The indices of the examples to extract from the dataset - - view_indices : array like, - The indices of the view to concatenate in the numpy array - - Returns - ------- - concat_views : numpy array, - The numpy array containing all the needed views. - - view_limits : list of int - The limits of each slice used to extract the views. - - """ - view_limits = [0] - for view_index in view_indices: - view_data = self.get_v(view_index, example_indices=example_indices) - nb_features = view_data.shape[1] - view_limits.append(view_limits[-1] + nb_features) - concat_views = np.concatenate([self.get_v(view_index, - example_indices=example_indices) - for view_index in view_indices], axis=1) - return concat_views, view_limits - - def select_labels(self, selected_label_names): - selected_labels = [self.get_label_names().index(label_name.decode()) - if isinstance(label_name, bytes) - else self.get_label_names().index(label_name) - for label_name in selected_label_names] - selected_indices = np.array([index - for index, label in - enumerate(self.get_labels()) - if label in selected_labels]) - labels = np.array([selected_labels.index(self.get_labels()[idx]) - for idx in selected_indices]) - return labels, selected_label_names, selected_indices - - def select_views_and_labels(self, nb_labels=None, - selected_label_names=None, random_state=None, - view_names=None, path_for_new="../data/"): - if view_names is None and selected_label_names is None and nb_labels is None: # pragma: no cover - pass - else: - selected_label_names = self.check_selected_label_names(nb_labels, - selected_label_names, - random_state) - labels, label_names, example_indices = self.select_labels( - selected_label_names) - self.filter(labels, label_names, example_indices, view_names, - path_for_new) - labels_dictionary = dict( - (labelIndex, labelName) for labelIndex, labelName in - enumerate(self.get_label_names())) - return labels_dictionary - - def check_selected_label_names(self, nb_labels=None, - selected_label_names=None, - random_state=np.random.RandomState(42)): - if selected_label_names is None or nb_labels is None or len( - selected_label_names) < nb_labels: - if selected_label_names is None: - nb_labels_to_add = nb_labels - selected_label_names = [] - elif nb_labels is not None: - nb_labels_to_add = nb_labels - len(selected_label_names) - else: - nb_labels_to_add = 0 - labels_names_to_choose = [available_label_name - for available_label_name - in self.get_label_names() - if available_label_name - not in selected_label_names] - added_labels_names = random_state.choice(labels_names_to_choose, - nb_labels_to_add, - replace=False) - selected_label_names = list(selected_label_names) + list( - added_labels_names) - elif len(selected_label_names) > nb_labels: - selected_label_names = list( - random_state.choice(selected_label_names, nb_labels, - replace=False)) - - return selected_label_names - - -class RAMDataset(Dataset): - - def __init__(self, views=None, labels=None, are_sparse=False, - view_names=None, labels_names=None, example_ids=None, - name=None): - self.saved_on_disk = False - self.views = views - self.labels = 
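Editor's note: `to_numpy_array` above concatenates the selected views column-wise while recording the running column offsets, so each view can be sliced back out of the big array later. A toy standalone version of that bookkeeping:

```python
import numpy as np

views = [np.random.rand(10, 3), np.random.rand(10, 5)]   # two toy views on the same examples

view_limits = [0]
for view in views:
    view_limits.append(view_limits[-1] + view.shape[1])  # cumulative feature counts

concat_views = np.concatenate(views, axis=1)
print(concat_views.shape, view_limits)                   # (10, 8) [0, 3, 8]
```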
np.asarray(labels) - if isinstance(are_sparse, bool): # pragma: no cover - self.are_sparse = [are_sparse for _ in range(len(views))] - else: - self.are_sparse = are_sparse - self.view_names = view_names - self.labels_names = labels_names - self.example_ids = example_ids - self.view_dict = dict((view_name, view_ind) - for view_name, view_ind - in zip(view_names, range(len(views)))) - self.name = name - self.nb_view = len(self.views) - self.is_temp = False - - def get_view_name(self, view_idx): - return self.view_names[view_idx] - - def init_attrs(self): - """ - Used to init the two attributes that are modified when self.dataset - changes - - Returns - ------- - - """ - - self.nb_view = len(self.views) - self.view_dict = dict((view_ind, self.view_names[view_ind]) - for view_ind in range(self.nb_view)) - - def get_nb_examples(self): - return self.views[0].shape[0] - - def get_label_names(self, example_indices=None, decode=True): - selected_labels = self.get_labels(example_indices) - if decode: - return [label_name.encode("utf-8") - for label, label_name in enumerate(self.labels_names) - if label in selected_labels] - else: - return [label_name.encode("utf-8") - for label, label_name in enumerate(self.labels_names) - if label in selected_labels] - - def get_labels(self, example_indices=None): - example_indices = self.init_example_indices(example_indices) - return self.labels[example_indices] - - def get_v(self, view_index, example_indices=None): - example_indices = self.init_example_indices(example_indices) - if type(example_indices) is int: - return self.views[view_index][example_indices, :] - else: - example_indices = np.asarray(example_indices) - # sorted_indices = np.argsort(example_indices) - # example_indices = example_indices[sorted_indices] - if not self.are_sparse[view_index]: - return self.views[view_index][ - example_indices, :] - else: # pragma: no cover - # TODO Sparse support - pass - - def get_nb_class(self, example_indices=None): - """Gets the number of class of the dataset""" - example_indices = self.init_example_indices(example_indices) - return len(np.unique(self.labels[example_indices])) - - def filter(self, labels, label_names, example_indices, view_names, - path=None): - if self.example_ids is not None: - self.example_ids = self.example_ids[example_indices] - self.labels = self.labels[example_indices] - self.labels_names = [name for lab_index, name - in enumerate(self.labels_names) - if lab_index in np.unique(self.labels)] - self.labels = np.array( - [np.where(label == np.unique(self.labels))[0] for label in - self.labels]) - self.view_names = view_names - new_views = [] - for new_view_ind, view_name in enumerate(self.view_names): - new_views.append( - self.views[self.view_dict[view_name]][example_indices, :]) - self.views = new_views - self.view_dict = dict((view_name, view_ind) - for view_ind, view_name - in enumerate(self.view_names)) - self.nb_view = len(self.views) - - def get_view_dict(self): - return self.view_dict - - def get_name(self): - return self.name - - -class HDF5Dataset(Dataset): - """ - Class of Dataset - - This class is used to encapsulate the multiview dataset while keeping it stored on the disk instead of in RAM. - - - Parameters - ---------- - views : list of numpy arrays or None - The list containing each view of the dataset as a numpy array of shape - (nb examples, nb features). - - labels : numpy array or None - The labels for the multiview dataset, of shape (nb examples, ). 
- - are_sparse : list of bool, or None - The list of boolean telling if each view is sparse or not. - - file_name : str, or None - The name of the hdf5 file that will be created to store the multiview - dataset. - - view_names : list of str, or None - The name of each view. - - path : str, or None - The path where the hdf5 dataset file will be stored - - hdf5_file : h5py.File object, or None - If not None, the dataset will be imported directly from this file. - - labels_names : list of str, or None - The name for each unique value of the labels given in labels. - - is_temp : bool - Used if a temporary dataset has to be used by the benchmark. - - Attributes - ---------- - dataset : h5py.File object - The h5py file pbject that points to the hdf5 dataset on the disk. - - nb_view : int - The number of views in the dataset. - - view_dict : dict - The dictionnary with the name of each view as the keys and their indices - as values - """ - - # The following methods use hdf5 - - def __init__(self, views=None, labels=None, are_sparse=False, - file_name="dataset.hdf5", view_names=None, path="", - hdf5_file=None, labels_names=None, is_temp=False, - example_ids=None, ): - self.is_temp = False - if hdf5_file is not None: - self.dataset = hdf5_file - self.init_attrs() - else: - secure_file_path(os.path.join(path, file_name)) - dataset_file = h5py.File(os.path.join(path, file_name), "w") - if view_names is None: - view_names = ["View" + str(index) for index in - range(len(views))] - if isinstance(are_sparse, bool): # pragma: no cover - are_sparse = [are_sparse for _ in views] - for view_index, (view_name, view, is_sparse) in enumerate( - zip(view_names, views, are_sparse)): - view_dataset = dataset_file.create_dataset( - "View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = is_sparse - labels_dataset = dataset_file.create_dataset("Labels", - shape=labels.shape, - data=labels) - if labels_names is None: - labels_names = [str(index) for index in np.unique(labels)] - labels_dataset.attrs["names"] = [label_name.encode() - if not isinstance(label_name, - bytes) - else label_name - for label_name in labels_names] - meta_data_grp = dataset_file.create_group("Metadata") - meta_data_grp.attrs["nbView"] = len(views) - meta_data_grp.attrs["nbClass"] = len(np.unique(labels)) - meta_data_grp.attrs["datasetLength"] = len(labels) - dataset_file.close() - self.update_hdf5_dataset(os.path.join(path, file_name)) - if example_ids is not None: - example_ids = [example_id if not is_just_number(example_id) - else "ID_" + example_id for example_id in - example_ids] - self.example_ids = example_ids - else: - self.example_ids = ["ID_" + str(i) - for i in range(labels.shape[0])] - - def get_v(self, view_index, example_indices=None): - r""" Extract the view and returns a numpy.ndarray containing the description - of the examples specified in example_indices - - Parameters - ---------- - view_index : int - The index of the view to extract - example_indices : numpy.ndarray - The array containing the indices of the examples to extract. 
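Editor's note: the constructor above lays the dataset out on disk as one "View<i>" dataset per view (with name/sparse attributes), a "Labels" dataset carrying the encoded label names, and a "Metadata" group holding the counts. A sketch of that layout written directly with h5py, using a throwaway file name and toy data:

```python
import h5py
import numpy as np

views = [np.random.rand(6, 4), np.random.rand(6, 2)]
labels = np.array([0, 1, 0, 1, 1, 0])

with h5py.File("example_dataset.hdf5", "w") as dataset_file:
    for view_index, view in enumerate(views):
        view_dset = dataset_file.create_dataset("View" + str(view_index), data=view)
        view_dset.attrs["name"] = "ViewNumber" + str(view_index)
        view_dset.attrs["sparse"] = False
    labels_dset = dataset_file.create_dataset("Labels", data=labels)
    labels_dset.attrs["names"] = [b"No", b"Yes"]
    meta = dataset_file.create_group("Metadata")
    meta.attrs["nbView"] = len(views)
    meta.attrs["nbClass"] = len(np.unique(labels))
    meta.attrs["datasetLength"] = len(labels)
```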
- - Returns - ------- - A numpy.ndarray containing the view data for the needed examples - """ - example_indices = self.init_example_indices(example_indices) - if type(example_indices) is int: - return self.dataset["View" + str(view_index)][example_indices, :] - else: - example_indices = np.array(example_indices) - # sorted_indices = np.argsort(example_indices) - # example_indices = example_indices[sorted_indices] - - if not self.dataset["View" + str(view_index)].attrs["sparse"]: - return self.dataset["View" + str(view_index)][()][ - example_indices, :] # [np.argsort(sorted_indices), :] - else: # pragma: no cover - # Work in progress - pass - - def get_view_name(self, view_idx): - """ - Method to get a view's name from its index. - - Parameters - ---------- - view_idx : int - The index of the view in the dataset - - Returns - ------- - The view's name. - - """ - return self.dataset["View" + str(view_idx)].attrs["name"] - - def init_attrs(self): - """ - Used to init the attributes that are modified when self.dataset - changes - - Returns - ------- - - """ - self.nb_view = self.dataset["Metadata"].attrs["nbView"] - self.view_dict = self.get_view_dict() - if "example_ids" in self.dataset["Metadata"].keys(): - self.example_ids = [example_id.decode() - if not is_just_number(example_id.decode()) - else "ID_" + example_id.decode() - for example_id in - self.dataset["Metadata"]["example_ids"]] - else: - self.example_ids = ["ID_"+str(i) for i in - range(self.dataset["Labels"].shape[0])] - - def get_nb_examples(self): - """ - Used to get the number of examples available in hte dataset - - Returns - ------- - - """ - return self.dataset["Metadata"].attrs["datasetLength"] - - def get_view_dict(self): - """ - Returns the dictionary with view indices as keys and their corresponding - names as values - """ - view_dict = {} - for view_index in range(self.nb_view): - view_dict[self.dataset["View" + str(view_index)].attrs[ - "name"]] = view_index - return view_dict - - def get_label_names(self, decode=True, example_indices=None): - """ - Used to get the list of the label names for the given set of examples - - Parameters - ---------- - decode : bool - If True, will decode the label names before listing them - - example_indices : numpy.ndarray - The array containing the indices of the needed examples - - Returns - ------- - - """ - selected_labels = self.get_labels(example_indices) - if decode: - return [label_name.decode("utf-8") - for label, label_name in - enumerate(self.dataset["Labels"].attrs["names"]) - if label in selected_labels] - else: - return [label_name - for label, label_name in - enumerate(self.dataset["Labels"].attrs["names"]) - if label in selected_labels] - - def get_nb_class(self, example_indices=None): - """ - Gets the number of classes of the dataset for the asked examples - - Parameters - ---------- - example_indices : numpy.ndarray - The array containing the indices of the examples to extract. - - Returns - ------- - int : The number of classes - - """ - example_indices = self.init_example_indices(example_indices) - return len(np.unique(self.dataset["Labels"][()][example_indices])) - - def get_labels(self, example_indices=None): - """Gets the label array for the asked examples - - Parameters - ---------- - example_indices : numpy.ndarray - The array containing the indices of the examples to extract. 
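Editor's note: reading that layout back mirrors the getters above: the metadata attributes give the counts, "Labels" gives the label array and its encoded names, and each "View<i>" dataset can be sliced by example indices. A plain h5py sketch, assuming the illustrative file written in the previous example:

```python
import h5py
import numpy as np

with h5py.File("example_dataset.hdf5", "r") as dataset_file:
    nb_view = dataset_file["Metadata"].attrs["nbView"]
    labels = dataset_file["Labels"][()]
    label_names = [name.decode("utf-8")
                   for name in dataset_file["Labels"].attrs["names"]]
    example_indices = np.array([0, 2, 4])
    first_view_subset = dataset_file["View0"][()][example_indices, :]
```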
- - Returns - ------- - numpy.ndarray containing the labels of the asked examples""" - example_indices = self.init_example_indices(example_indices) - return self.dataset["Labels"][()][example_indices] - - def rm(self): # pragma: no cover - """ - Method used to delete the dataset file on the disk if the dataset is - temporary. - - Returns - ------- - - """ - filename = self.dataset.filename - self.dataset.close() - if self.is_temp: - os.remove(filename) - - - def copy_view(self, target_dataset=None, source_view_name=None, - target_view_index=None, example_indices=None): - example_indices = self.init_example_indices(example_indices) - new_d_set = target_dataset.create_dataset( - "View" + str(target_view_index), - data=self.get_v(self.view_dict[source_view_name], - example_indices=example_indices)) - for key, value in self.dataset[ - "View" + str(self.view_dict[source_view_name])].attrs.items(): - new_d_set.attrs[key] = value - - def init_view_names(self, view_names=None): - if view_names is None: - return [key for key in self.get_view_dict().keys()] - else: - return view_names - - def update_hdf5_dataset(self, path): - if hasattr(self, 'dataset'): - self.dataset.close() - self.dataset = h5py.File(path, 'r') - self.is_temp = True - self.init_attrs() - - def filter(self, labels, label_names, example_indices, view_names, - path=None): - dataset_file_path = os.path.join(path, - self.get_name() + "_temp_filter.hdf5") - new_dataset_file = h5py.File(dataset_file_path, "w") - self.dataset.copy("Metadata", new_dataset_file) - if "example_ids" in self.dataset["Metadata"].keys(): - del new_dataset_file["Metadata"]["example_ids"] - ex_ids = new_dataset_file["Metadata"].create_dataset("example_ids", - data=np.array( - self.example_ids)[ - example_indices].astype( - np.dtype( - "S100"))) - else: - new_dataset_file["Metadata"].create_dataset("example_ids", - ( - len(self.example_ids),), - data=np.array( - self.example_ids).astype( - np.dtype("S100")), - dtype=np.dtype("S100")) - new_dataset_file["Metadata"].attrs["datasetLength"] = len( - example_indices) - new_dataset_file["Metadata"].attrs["nbClass"] = np.unique(labels) - new_dataset_file.create_dataset("Labels", data=labels) - new_dataset_file["Labels"].attrs["names"] = [label_name.encode() - if not isinstance( - label_name, bytes) - else label_name - for label_name in - label_names] - view_names = self.init_view_names(view_names) - new_dataset_file["Metadata"].attrs["nbView"] = len(view_names) - for new_index, view_name in enumerate(view_names): - self.copy_view(target_dataset=new_dataset_file, - source_view_name=view_name, - target_view_index=new_index, - example_indices=example_indices) - new_dataset_file.close() - self.update_hdf5_dataset(dataset_file_path) - - def add_gaussian_noise(self, random_state, path, - noise_std=0.15): - """In this function, we add a guaussian noise centered in 0 with specified - std to each view, according to it's range (the noise will be - mutliplied by this range) and we crop the noisy signal according to the - view's attributes limits. 
- This is done by creating a new dataset, to keep clean data.""" - noisy_dataset = h5py.File(path + self.get_name() + "_noised.hdf5", "w") - self.dataset.copy("Metadata", noisy_dataset) - self.dataset.copy("Labels", noisy_dataset) - for view_index in range(self.nb_view): - self.copy_view(target_dataset=noisy_dataset, - source_view_name=self.get_view_name(view_index), - target_view_index=view_index) - for view_index in range(noisy_dataset["Metadata"].attrs["nbView"]): - view_key = "View" + str(view_index) - view_dset = noisy_dataset[view_key] - view_limits = self.dataset[ - "Metadata/View" + str(view_index) + "_limits"][()] - view_ranges = view_limits[:, 1] - view_limits[:, 0] - normal_dist = random_state.normal(0, noise_std, view_dset[()].shape) - noise = normal_dist * view_ranges - noised_data = view_dset[()] + noise - noised_data = np.where(noised_data < view_limits[:, 0], - view_limits[:, 0], noised_data) - noised_data = np.where(noised_data > view_limits[:, 1], - view_limits[:, 1], noised_data) - noisy_dataset[view_key][...] = noised_data - noisy_dataset_path = noisy_dataset.filename - noisy_dataset.close() - self.update_hdf5_dataset(noisy_dataset_path) - - # The following methods are hdf5 free - - def get_name(self): - """Ony works if there are not multiple dots in the files name""" - return self.dataset.filename.split('/')[-1].split('.')[0] - - -def is_just_number(string): - try: - float(string) - return True - except ValueError: - return False - - -def datasets_already_exist(pathF, name, nbCores): - """Used to check if it's necessary to copy datasets""" - allDatasetExist = True - for coreIndex in range(nbCores): - allDatasetExist *= os.path.isfile(os.path.join( - pathF, name + str(coreIndex) + ".hdf5")) - return allDatasetExist - - -def extract_subset(matrix, used_indices): - """Used to extract a subset of a matrix even if it's sparse WIP""" - # if sparse.issparse(matrix): - # new_indptr = np.zeros(len(used_indices) + 1, dtype=int) - # oldindptr = matrix.indptr - # for exampleIndexIndex, exampleIndex in enumerate(used_indices): - # new_indptr[exampleIndexIndex + 1] = new_indptr[ - # exampleIndexIndex] + ( - # oldindptr[ - # exampleIndex + 1] - - # oldindptr[exampleIndex]) - # new_data = np.ones(new_indptr[-1], dtype=bool) - # new_indices = np.zeros(new_indptr[-1], dtype=int) - # old_indices = matrix.indices - # for exampleIndexIndex, exampleIndex in enumerate(used_indices): - # new_indices[new_indptr[exampleIndexIndex]:new_indptr[ - # exampleIndexIndex + 1]] = old_indices[ - # oldindptr[exampleIndex]: - # oldindptr[exampleIndex + 1]] - # return sparse.csr_matrix((new_data, new_indices, new_indptr), - # shape=(len(used_indices), matrix.shape[1])) - # else: - return matrix[used_indices] - - -def init_multiple_datasets(path_f, name, nb_cores): # pragma: no cover - r"""Used to create copies of the dataset if multicore computation is used. - - This is a temporary solution to fix the sharing memory issue with HDF5 datasets. - - Parameters - ---------- - path_f : string - Path to the original dataset directory - name : string - Name of the dataset - nb_cores : int - The number of threads that the benchmark can use - - Returns - ------- - datasetFiles : None - Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. 
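Editor's note: `add_gaussian_noise` above scales centred Gaussian noise by each feature's range and then clips the result back inside the stored limits. A toy in-memory rerun of that arithmetic, with limits derived from the toy view itself:

```python
import numpy as np

random_state = np.random.RandomState(42)
view = random_state.rand(5, 3)
view_limits = np.column_stack((view.min(axis=0), view.max(axis=0)))  # per-feature [min, max]
view_ranges = view_limits[:, 1] - view_limits[:, 0]

noise = random_state.normal(0, 0.15, view.shape) * view_ranges
noised = view + noise
noised = np.where(noised < view_limits[:, 0], view_limits[:, 0], noised)
noised = np.where(noised > view_limits[:, 1], view_limits[:, 1], noised)
```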
- """ - if nb_cores > 1: - if datasets_already_exist(path_f, name, nb_cores): - logging.debug( - "Info:\t Enough copies of the dataset are already available") - pass - else: - if os.path.getsize(os.path.join(path_f, name + ".hdf5")) * nb_cores / float(1024) / 1000 / 1000 > 0.1: - logging.debug("Start:\t Creating " + str( - nb_cores) + " temporary datasets for multiprocessing") - logging.warning( - " WARNING : /!\ This may use a lot of HDD storage space : " + - str(os.path.getsize(os.path.join(path_f, name + ".hdf5")) * nb_cores / float( - 1024) / 1000 / 1000) + " Gbytes /!\ ") - confirmation = confirm() - if not confirmation: - sys.exit(0) - else: - pass - else: - pass - dataset_files = copy_hdf5(path_f, name, nb_cores) - logging.debug("Start:\t Creating datasets for multiprocessing") - return dataset_files - - -def copy_hdf5(pathF, name, nbCores): - """Used to copy a HDF5 database in case of multicore computing""" - datasetFile = h5py.File(pathF + name + ".hdf5", "r") - for coreIndex in range(nbCores): - newDataSet = h5py.File(pathF + name + str(coreIndex) + ".hdf5", "w") - for dataset in datasetFile: - datasetFile.copy("/" + dataset, newDataSet["/"]) - newDataSet.close() - - -def delete_HDF5(benchmarkArgumentsDictionaries, nbCores, dataset): - """Used to delete temporary copies at the end of the benchmark""" - if nbCores > 1: - logging.debug("Start:\t Deleting " + str( - nbCores) + " temporary datasets for multiprocessing") - args = benchmarkArgumentsDictionaries[0]["args"] - logging.debug("Start:\t Deleting datasets for multiprocessing") - - for coreIndex in range(nbCores): - os.remove(args["pathf"] + args["name"] + str(coreIndex) + ".hdf5") - if dataset.is_temp: - dataset.rm() - - -def confirm(resp=True, timeout=15): # pragma: no cover - """Used to process answer""" - ans = input_(timeout) - if not ans: - return resp - if ans not in ['y', 'Y', 'n', 'N']: - print('please enter y or n.') - if ans == 'y' or ans == 'Y': - return True - if ans == 'n' or ans == 'N': - return False - - -def input_(timeout=15): # pragma: no cover - """used as a UI to stop if too much HDD space will be used""" - logging.warning("You have " + str( - timeout) + " seconds to stop the dataset copy by typing n") - i, o, e = select.select([sys.stdin], [], [], timeout) - if i: - return sys.stdin.readline().strip() - else: - return "y" - - -def get_examples_views_indices(dataset, examples_indices, view_indices, ): - """This function is used to get all the examples indices and view indices if needed""" - if view_indices is None: - view_indices = np.arange(dataset.nb_view) - if examples_indices is None: - examples_indices = np.arange(dataset.get_nb_examples()) - return examples_indices, view_indices diff --git a/multiview_platform/mono_multi_view_classifiers/utils/execution.py b/multiview_platform/mono_multi_view_classifiers/utils/execution.py deleted file mode 100644 index 3570bb2b685a9fe0e2cdded10f367177ad046a85..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/execution.py +++ /dev/null @@ -1,426 +0,0 @@ -import argparse -import logging -import os -import pickle -import time - -import numpy as np -import sklearn - -from . 
import get_multiview_db as DB -from ..utils.configuration import save_config - - -def parse_the_args(arguments): - """Used to parse the args entered by the user""" - - parser = argparse.ArgumentParser( - description='This file is used to benchmark the scores fo multiple ' - 'classification algorithm on multiview data.', - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@') - - groupStandard = parser.add_argument_group('Standard arguments') - groupStandard.add_argument('--config_path', metavar='STRING', - action='store', - help='Path to the hdf5 dataset or database ' - 'folder (default: %(default)s)', - default='../config_files/config.yml') - args = parser.parse_args(arguments) - return args - - -def init_random_state(random_state_arg, directory): - r""" - Used to init a random state. - If no random state is specified, it will generate a 'random' seed. - If the `randomSateArg` is a string containing only numbers, it will be converted in - an int to generate a seed. - If the `randomSateArg` is a string with letters, it must be a path to a pickled random - state file that will be loaded. - The function will also pickle the new random state in a file tobe able to retrieve it later. - Tested - - - Parameters - ---------- - random_state_arg : None or string - See function description. - directory : string - Path to the results directory. - - Returns - ------- - random_state : numpy.random.RandomState object - This random state will be used all along the benchmark . - """ - - if random_state_arg is None: - random_state = np.random.RandomState(random_state_arg) - else: - try: - seed = int(random_state_arg) - random_state = np.random.RandomState(seed) - except ValueError: - file_name = random_state_arg - with open(file_name, 'rb') as handle: - random_state = pickle.load(handle) - with open(os.path.join(directory, "random_state.pickle"), "wb") as handle: - pickle.dump(random_state, handle) - return random_state - - -def init_stats_iter_random_states(stats_iter, random_state): - r""" - Used to initialize multiple random states if needed because of multiple statistical iteration of the same benchmark - - Parameters - ---------- - stats_iter : int - Number of statistical iterations of the same benchmark done (with a different random state). - random_state : numpy.random.RandomState object - The random state of the whole experimentation, that will be used to generate the ones for each - statistical iteration. - - Returns - ------- - stats_iter_random_states : list of numpy.random.RandomState objects - Multiple random states, one for each sattistical iteration of the same benchmark. - """ - if stats_iter > 1: - stats_iter_random_states = [ - np.random.RandomState(random_state.randint(5000)) for _ in - range(stats_iter)] - else: - stats_iter_random_states = [random_state] - return stats_iter_random_states - - -def get_database_function(name, type_var): - r"""Used to get the right database extraction function according to the type of database and it's name - - Parameters - ---------- - name : string - Name of the database. 
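Editor's note: `init_random_state` above accepts either nothing, an integer-like string, or a path to a pickled RandomState, and always pickles the state it ends up using so the run can be reproduced. A compact sketch of that decision logic (the function name and file names are illustrative):

```python
import os
import pickle
import numpy as np

def make_random_state(random_state_arg, directory):
    if random_state_arg is None:
        random_state = np.random.RandomState()
    else:
        try:
            random_state = np.random.RandomState(int(random_state_arg))
        except ValueError:
            # not a number: treat the argument as a path to a pickled RandomState
            with open(random_state_arg, "rb") as handle:
                random_state = pickle.load(handle)
    with open(os.path.join(directory, "random_state.pickle"), "wb") as handle:
        pickle.dump(random_state, handle)
    return random_state
```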
- type_var : string - type of dataset hdf5 or csv - - Returns - ------- - getDatabase : function - The function that will be used to extract the database - """ - if name not in ["fake", "plausible"]: - get_database = getattr(DB, "get_classic_db_" + type_var[1:]) - else: - get_database = getattr(DB, "get_" + name + "_db_" + type_var[1:]) - return get_database - - -def init_log_file(name, views, cl_type, log, debug, label, - result_directory, args): - r"""Used to init the directory where the preds will be stored and the log file. - - First this function will check if the result directory already exists (only one per minute is allowed). - - If the the result directory name is available, it is created, and the logfile is initiated. - - Parameters - ---------- - name : string - Name of the database. - views : list of strings - List of the view names that will be used in the benchmark. - cl_type : list of strings - Type of benchmark that will be made . - log : bool - Whether to show the log file in console or hide it. - debug : bool - for debug option - label : str for label - - result_directory : str name of the result directory - - add_noise : bool for add noise - - noise_std : level of std noise - - Returns - ------- - results_directory : string - Reference to the main results directory for the benchmark. - """ - if views is None: - views = [] - # result_directory = os.path.join(os.path.dirname( - # os.path.dirname(os.path.dirname(os.path.realpath(__file__)))), - # result_directory) - if debug: - result_directory = os.path.join(result_directory, name, - "debug_started_" + time.strftime( - "%Y_%m_%d-%H_%M_%S") + "_" + label) - else: - result_directory = os.path.join(result_directory, name, - "started_" + time.strftime( - "%Y_%m_%d-%H_%M") + "_" + label) - log_file_name = time.strftime("%Y_%m_%d-%H_%M") + "-" + ''.join( - cl_type) + "-" + "_".join(views) + "-" + name + "-LOG.log" - if os.path.exists(result_directory): # pragma: no cover - raise NameError("The result dir already exists, wait 1 min and retry") - log_file_path = os.path.join(result_directory, log_file_name) - os.makedirs(os.path.dirname(log_file_path)) - logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', - filename=log_file_path, level=logging.DEBUG, - filemode='w') - if log: - logging.getLogger().addHandler(logging.StreamHandler()) - save_config(result_directory, args) - return result_directory - - -def gen_splits(labels, split_ratio, stats_iter_random_states): - r"""Used to _gen the train/test splits using one or multiple random states. - - Parameters - ---------- - labels : numpy.ndarray - Name of the database. - split_ratio : float - The ratio of examples between train and test set. - stats_iter_random_states : list of numpy.random.RandomState - The random states for each statistical iteration. - - Returns - ------- - splits : list of lists of numpy.ndarray - For each statistical iteration a couple of numpy.ndarrays is stored with the indices for the training set and - the ones of the testing set. 
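Editor's note: `init_log_file` above builds a time-stamped result directory and points the root logger at a log file inside it, optionally echoing to the console. A minimal sketch of that setup with illustrative directory and dataset names:

```python
import logging
import os
import time

result_directory = os.path.join("examples/results", "digits_doc",
                                "started_" + time.strftime("%Y_%m_%d-%H_%M") + "_example_0")
os.makedirs(result_directory)              # fails if the directory already exists
log_file_path = os.path.join(result_directory,
                             time.strftime("%Y_%m_%d-%H_%M") + "-LOG.log")
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    filename=log_file_path, level=logging.DEBUG, filemode='w')
logging.getLogger().addHandler(logging.StreamHandler())  # also show the log in the console
```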
- """ - indices = np.arange(len(labels)) - splits = [] - for random_state in stats_iter_random_states: - folds_obj = sklearn.model_selection.StratifiedShuffleSplit(n_splits=1, - random_state=random_state, - test_size=split_ratio) - folds = folds_obj.split(indices, labels) - for fold in folds: - train_fold, test_fold = fold - train_indices = indices[train_fold] - test_indices = indices[test_fold] - splits.append([train_indices, test_indices]) - - return splits - - -def gen_k_folds(stats_iter, nb_folds, stats_iter_random_states): - r"""Used to generate folds indices for cross validation for each statistical iteration. - - Parameters - ---------- - stats_iter : integer - Number of statistical iterations of the benchmark. - nb_folds : integer - The number of cross-validation folds for the benchmark. - stats_iter_random_states : list of numpy.random.RandomState - The random states for each statistical iteration. - - Returns - ------- - folds_list : list of list of sklearn.model_selection.StratifiedKFold - For each statistical iteration a Kfold stratified (keeping the ratio between classes in each fold). - """ - if stats_iter > 1: - folds_list = [] - for random_state in stats_iter_random_states: - folds_list.append( - sklearn.model_selection.StratifiedKFold(n_splits=nb_folds, - random_state=random_state, - shuffle=True)) - else: - if isinstance(stats_iter_random_states, list): - stats_iter_random_states = stats_iter_random_states[0] - folds_list = [sklearn.model_selection.StratifiedKFold(n_splits=nb_folds, - random_state=stats_iter_random_states, - shuffle=True)] - return folds_list - - -def init_views(dataset_var, arg_views): - r"""Used to return the views names that will be used by the - benchmark, their indices and all the views names. - - Parameters - ---------- - dataset_var : HDF5 dataset file - The full dataset that wil be used by the benchmark. - arg_views : list of strings - The views that will be used by the benchmark (arg). - - Returns - ------- - views : list of strings - Names of the views that will be used by the benchmark. - view_indices : list of ints - The list of the indices of the view that will be used in the benchmark (according to the dataset). - all_views : list of strings - Names of all the available views in the dataset. - """ - nb_view = dataset_var.nb_view - if arg_views is not None: - allowed_views = arg_views - all_views = [str(dataset_var.get_view_name(view_index)) - if type(dataset_var.get_view_name(view_index)) != bytes - else dataset_var.get_view_name(view_index).decode("utf-8") - for view_index in range(nb_view)] - views = [] - views_indices = [] - for view_index in range(nb_view): - view_name = dataset_var.get_view_name(view_index) - if type(view_name) == bytes: - view_name = view_name.decode("utf-8") - if view_name in allowed_views: - views.append(view_name) - views_indices.append(view_index) - else: - views = [str(dataset_var.get_view_name(view_index)) - if type(dataset_var.get_view_name(view_index)) != bytes - else dataset_var.get_view_name(view_index).decode("utf-8") - for view_index in range(nb_view)] - views_indices = range(nb_view) - all_views = views - return views, views_indices, all_views - - -def gen_direcorties_names(directory, stats_iter): - r"""Used to generate the different directories of each iteration if needed. - - Parameters - ---------- - directory : string - Path to the results directory. - statsIter : int - The number of statistical iterations. 
- - Returns - ------- - directories : list of strings - Paths to each statistical iterations result directory. - """ - if stats_iter > 1: - directories = [] - for i in range(stats_iter): - directories.append(os.path.join(directory, "iter_" + str(i + 1))) - else: - directories = [directory] - return directories - - -def find_dataset_names(path, type, names): - """This function goal is to browse the dataset directory and extrats all - the needed dataset names.""" - package_path = os.path.dirname( - os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - if os.path.isdir(path): - pass - elif os.path.isdir(os.path.join(package_path, path)): - path = os.path.join(package_path, path) - else: - raise ValueError("The provided pathf does not exist ({}) SuMMIT checks " - "the prefix from where you are running your script ({}) " - "and the multiview_platform package prefix ({}). " - "You may want to try with an absolute path in the " - "config file".format(path, os.getcwd(), package_path)) - available_file_names = [file_name.strip().split(".")[0] - for file_name in - os.listdir(path) - if file_name.endswith(type)] - if names == ["all"]: - return path, available_file_names - elif isinstance(names, str): - return path, [used_name for used_name in available_file_names if names == used_name] - elif len(names) > 1: - selected_names = [used_name for used_name in available_file_names if - used_name in names] - if not selected_names: - raise ValueError( - "None of the provided dataset names are available. Available datasets are {}".format( - available_file_names)) - return path, [used_name for used_name in available_file_names if - used_name in names] - elif names[0] in available_file_names: - return path, names - else: - raise ValueError("The asked dataset ({}) is not available in {}. \n The available ones are {}".format(names[0], path, available_file_names)) - - -def gen_argument_dictionaries(labels_dictionary, directories, - splits, - hyper_param_search, args, k_folds, - stats_iter_random_states, metrics, - argument_dictionaries, - benchmark, views, views_indices,): # pragma: no cover - r"""Used to generate a dictionary for each benchmark. - - One for each label combination (if multiclass), for each statistical iteration, generates an dictionary with - all necessary information to perform the benchmark - - Parameters - ---------- - labels_dictionary : dictionary - Dictionary mapping labels indices to labels names. - directories : list of strings - List of the paths to the result directories for each statistical iteration. - multiclass_labels : list of lists of numpy.ndarray - For each label couple, for each statistical iteration a triplet of numpy.ndarrays is stored with the - indices for the biclass training set, the ones for the biclass testing set and the ones for the - multiclass testing set. - labels_combinations : list of lists of numpy.ndarray - Each original couple of different labels. - indices_multiclass : list of lists of numpy.ndarray - For each combination, contains a biclass labels numpy.ndarray with the 0/1 labels of combination. - hyper_param_search : string - Type of hyper parameter optimization method - args : parsed args objects - All the args passed by the user. - k_folds : list of list of sklearn.model_selection.StratifiedKFold - For each statistical iteration a Kfold stratified (keeping the ratio between classes in each fold). 
- stats_iter_random_states : list of numpy.random.RandomState objects - Multiple random states, one for each sattistical iteration of the same benchmark. - metrics : list of lists - metrics that will be used to evaluate the algorithms performance. - argument_dictionaries : dictionary - Dictionary resuming all the specific arguments for the benchmark, oe dictionary for each classifier. - benchmark : dictionary - Dictionary resuming which mono- and multiview algorithms which will be used in the benchmark. - nb_views : int - THe number of views used by the benchmark. - views : list of strings - List of the names of the used views. - views_indices : list of ints - List of indices (according to the dataset) of the used views. - - Returns - ------- - benchmarkArgumentDictionaries : list of dicts - All the needed arguments for the benchmarks. - - """ - benchmark_argument_dictionaries = [] - for iter_index, iterRandomState in enumerate(stats_iter_random_states): - benchmark_argument_dictionary = { - "labels_dictionary": labels_dictionary, - "directory": directories[iter_index], - "classification_indices": splits[iter_index], - "args": args, - "k_folds": k_folds[iter_index], - "random_state": iterRandomState, - "hyper_param_search": hyper_param_search, - "metrics": metrics, - "argument_dictionaries": argument_dictionaries, - "benchmark": benchmark, - "views": views, - "views_indices": views_indices, - "flag": iter_index} - benchmark_argument_dictionaries.append(benchmark_argument_dictionary) - return benchmark_argument_dictionaries diff --git a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py deleted file mode 100644 index b3d2a24c7acb043eb43360b63e098a49319cd275..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py +++ /dev/null @@ -1,1311 +0,0 @@ -import os - -import h5py -import numpy as np - -from .dataset import RAMDataset, HDF5Dataset -from .organization import secure_file_path - -# Author-Info -__author__ = "Baptiste Bauvin" -__status__ = "Prototype" # Production, Development, Prototype - - -def make_me_noisy(view_data, random_state, percentage=5): - """used to introduce some noise in the generated data""" - view_data = view_data.astype(bool) - nb_noisy_coord = int( - percentage / 100.0 * view_data.shape[0] * view_data.shape[1]) - rows = range(view_data.shape[0]) - cols = range(view_data.shape[1]) - for _ in range(nb_noisy_coord): - row_idx = random_state.choice(rows) - col_idx = random_state.choice(cols) - view_data[row_idx, col_idx] = 0 - noisy_view_data = view_data.astype(np.uint8) - return noisy_view_data - - -def get_plausible_db_hdf5(features, path, file_name, nb_class=3, - label_names=["No".encode(), "Yes".encode(), - "Maybe".encode()], - random_state=None, full=True, add_noise=False, - noise_std=0.15, nb_view=3, nb_examples=100, - nb_features=10): - """Used to generate a plausible dataset to test the algorithms""" - secure_file_path(os.path.join(path, "plausible.hdf5")) - example_ids = ["exmaple_id_" + str(i) for i in range(nb_examples)] - views = [] - view_names = [] - are_sparse = [] - if nb_class == 2: - labels = np.array( - [0 for _ in range(int(nb_examples / 2))] + [1 for _ in range( - nb_examples - int(nb_examples / 2))]) - label_names = ["No".encode(), "Yes".encode()] - for view_index in range(nb_view): - view_data = np.array( - [np.zeros(nb_features) for _ in range(int(nb_examples / 2))] + - [np.ones(nb_features) 
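Editor's note: `make_me_noisy` above flips a small percentage of the entries of a boolean view at randomly drawn coordinates. A toy rerun of that loop (the percentage and shape are arbitrary):

```python
import numpy as np

random_state = np.random.RandomState(42)
view_data = np.ones((20, 10), dtype=bool)
percentage = 5
nb_noisy_coord = int(percentage / 100.0 * view_data.shape[0] * view_data.shape[1])
for _ in range(nb_noisy_coord):
    row_idx = random_state.randint(view_data.shape[0])
    col_idx = random_state.randint(view_data.shape[1])
    view_data[row_idx, col_idx] = 0            # switch one coordinate off
noisy_view_data = view_data.astype(np.uint8)
```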
for _ in - range(nb_examples - int(nb_examples / 2))]) - fake_one_indices = random_state.randint(0, int(nb_examples / 2), - int(nb_examples / 12)) - fake_zero_indices = random_state.randint(int(nb_examples / 2), - nb_examples, - int(nb_examples / 12)) - for index in np.concatenate((fake_one_indices, fake_zero_indices)): - example_ids[index] += "noised" - - view_data[fake_one_indices] = np.ones( - (len(fake_one_indices), nb_features)) - view_data[fake_zero_indices] = np.zeros( - (len(fake_zero_indices), nb_features)) - view_data = make_me_noisy(view_data, random_state) - views.append(view_data) - view_names.append("ViewNumber" + str(view_index)) - are_sparse.append(False) - - dataset = RAMDataset(views=views, labels=labels, - labels_names=label_names, view_names=view_names, - are_sparse=are_sparse, example_ids=example_ids, - name='plausible') - labels_dictionary = {0: "No", 1: "Yes"} - return dataset, labels_dictionary, "plausible" - elif nb_class >= 3: - firstBound = int(nb_examples / 3) - rest = nb_examples - 2 * int(nb_examples / 3) - scndBound = 2 * int(nb_examples / 3) - thrdBound = nb_examples - labels = np.array( - [0 for _ in range(firstBound)] + - [1 for _ in range(firstBound)] + - [2 for _ in range(rest)] - ) - for view_index in range(nb_view): - view_data = np.array( - [np.zeros(nb_features) for _ in range(firstBound)] + - [np.ones(nb_features) for _ in range(firstBound)] + - [np.ones(nb_features) + 1 for _ in range(rest)]) - fake_one_indices = random_state.randint(0, firstBound, - int(nb_examples / 12)) - fakeTwoIndices = random_state.randint(firstBound, scndBound, - int(nb_examples / 12)) - fake_zero_indices = random_state.randint(scndBound, thrdBound, - int(nb_examples / 12)) - - view_data[fake_one_indices] = np.ones( - (len(fake_one_indices), nb_features)) - view_data[fake_zero_indices] = np.zeros( - (len(fake_zero_indices), nb_features)) - view_data[fakeTwoIndices] = np.ones( - (len(fakeTwoIndices), nb_features)) + 1 - view_data = make_me_noisy(view_data, random_state) - views.append(view_data) - view_names.append("ViewNumber" + str(view_index)) - are_sparse.append(False) - dataset = RAMDataset(views=views, labels=labels, - labels_names=label_names, view_names=view_names, - are_sparse=are_sparse, - name="plausible", - example_ids=example_ids) - labels_dictionary = {0: "No", 1: "Yes", 2: "Maybe"} - return dataset, labels_dictionary, "plausible" - - -class DatasetError(Exception): - def __init__(self, *args, **kwargs): - Exception.__init__(self, *args, **kwargs) - - -def get_classic_db_hdf5(views, path_f, name_DB, nb_class, asked_labels_names, - random_state, full=False, add_noise=False, - noise_std=0.15, - path_for_new="../data/"): - """Used to load a hdf5 database""" - if full: - dataset_file = h5py.File(os.path.join(path_f, name_DB + ".hdf5"), "r") - dataset = HDF5Dataset(hdf5_file=dataset_file) - dataset_name = name_DB - labels_dictionary = dict((label_index, label_name) - for label_index, label_name - in enumerate(dataset.get_label_names())) - else: - dataset_file = h5py.File(os.path.join(path_f, name_DB + ".hdf5"), "r") - dataset = HDF5Dataset(hdf5_file=dataset_file) - labels_dictionary = dataset.select_views_and_labels(nb_labels=nb_class, - selected_label_names=asked_labels_names, - view_names=views, - random_state=random_state, - path_for_new=path_for_new) - dataset_name = dataset.get_name() - - if add_noise: - dataset.add_gaussian_noise(random_state, path_for_new, noise_std) - dataset_name = dataset.get_name() - else: - pass - return dataset, labels_dictionary, 
dataset_name - - -def get_classic_db_csv(views, pathF, nameDB, NB_CLASS, askedLabelsNames, - random_state, full=False, add_noise=False, - noise_std=0.15, - delimiter=",", path_for_new="../data/"): - # TODO : Update this one - labels_names = np.genfromtxt(pathF + nameDB + "-labels-names.csv", - dtype='str', delimiter=delimiter) - datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") - labels = np.genfromtxt(pathF + nameDB + "-labels.csv", delimiter=delimiter) - labelsDset = datasetFile.create_dataset("Labels", labels.shape, data=labels) - labelsDset.attrs["names"] = [labelName.encode() for labelName in - labels_names] - viewFileNames = [viewFileName for viewFileName in - os.listdir(pathF + "Views/")] - for viewIndex, viewFileName in enumerate(os.listdir(pathF + "Views/")): - viewFile = pathF + "Views/" + viewFileName - if viewFileName[-6:] != "-s.csv": - viewMatrix = np.genfromtxt(viewFile, delimiter=delimiter) - viewDset = datasetFile.create_dataset("View" + str(viewIndex), - viewMatrix.shape, - data=viewMatrix) - del viewMatrix - viewDset.attrs["name"] = viewFileName[:-4] - viewDset.attrs["sparse"] = False - else: - pass - metaDataGrp = datasetFile.create_group("Metadata") - metaDataGrp.attrs["nbView"] = len(viewFileNames) - metaDataGrp.attrs["nbClass"] = len(labels_names) - metaDataGrp.attrs["datasetLength"] = len(labels) - datasetFile.close() - datasetFile, labelsDictionary, dataset_name = get_classic_db_hdf5(views, - pathF, - nameDB, - NB_CLASS, - askedLabelsNames, - random_state, - full, - path_for_new=path_for_new) - - return datasetFile, labelsDictionary, dataset_name - -# -# def get_classes(labels): -# labels_set = set(list(labels)) -# nb_labels = len(labels_set) -# if nb_labels >= 2: -# return labels_set -# else: -# raise DatasetError("Dataset must have at least two different labels") -# -# -# def all_asked_labels_are_available(asked_labels_names_set, -# available_labels_names): -# for askedLabelName in asked_labels_names_set: -# if askedLabelName in available_labels_names: -# pass -# else: -# return False -# return True -# -# -# def fill_label_names(nb_labels, selected_label_names, random_state, -# available_labels_names): -# if len(selected_label_names) < nb_labels: -# nb_labels_to_add = nb_labels - len(selected_label_names) -# labels_names_to_choose = [available_label_name -# for available_label_name -# in available_labels_names -# if available_label_name -# not in selected_label_names] -# added_labels_names = random_state.choice(labels_names_to_choose, -# nb_labels_to_add, replace=False) -# selected_label_names = list(selected_label_names) + list(added_labels_names) -# asked_labels_names_set = set(selected_label_names) -# -# elif len(selected_label_names) > nb_labels: -# selected_label_names = list( -# random_state.choice(selected_label_names, nb_labels, replace=False)) -# asked_labels_names_set = set(selected_label_names) -# -# else: -# asked_labels_names_set = set(selected_label_names) -# -# return selected_label_names, asked_labels_names_set -# -# -# def get_all_labels(full_labels, available_labels_names): -# new_labels = full_labels -# new_labels_names = available_labels_names -# used_indices = np.arange(len(full_labels)) -# return new_labels, new_labels_names, used_indices -# -# -# def select_asked_labels(asked_labels_names_set, available_labels_names, -# asked_labels_names, full_labels): -# if all_asked_labels_are_available(asked_labels_names_set, available_labels_names): -# used_labels = [available_labels_names.index(asked_label_name) for -# asked_label_name in 
asked_labels_names] -# used_indices = np.array( -# [labelIndex for labelIndex, label in enumerate(full_labels) if -# label in used_labels]) -# new_labels = np.array([used_labels.index(label) for label in full_labels if -# label in used_labels]) -# new_labels_names = [available_labels_names[usedLabel] for usedLabel in -# used_labels] -# return new_labels, new_labels_names, used_indices -# else: -# raise DatasetError("Asked labels are not all available in the dataset") -# -# -# def filter_labels(labels_set, asked_labels_names_set, full_labels, -# available_labels_names, asked_labels_names): -# if len(labels_set) > 2: -# if asked_labels_names == available_labels_names: -# new_labels, new_labels_names, used_indices = \ -# get_all_labels(full_labels, available_labels_names) -# elif len(asked_labels_names_set) <= len(labels_set): -# new_labels, new_labels_names, used_indices = select_asked_labels( -# asked_labels_names_set, available_labels_names, -# asked_labels_names, full_labels) -# else: -# raise DatasetError( -# "Asked more labels than available in the dataset. Available labels are : " + -# ", ".join(available_labels_names)) -# -# else: -# new_labels, new_labels_names, used_indices = get_all_labels(full_labels, -# available_labels_names) -# return new_labels, new_labels_names, used_indices -# -# -# def filter_views(dataset_file, temp_dataset, views, used_indices): -# new_view_index = 0 -# if views == [""]: -# for view_index in range(dataset_file.get("Metadata").attrs["nbView"]): -# copyhdf5_dataset(dataset_file, temp_dataset, "View" + str(view_index), -# "View" + str(view_index), used_indices) -# else: -# for asked_view_name in views: -# for view_index in range(dataset_file.get("Metadata").attrs["nbView"]): -# view_name = dataset_file.get("View" + str(view_index)).attrs["name"] -# if type(view_name) == bytes: -# view_name = view_name.decode("utf-8") -# if view_name == asked_view_name: -# copyhdf5_dataset(dataset_file, temp_dataset, -# "View" + str(view_index), -# "View" + str(new_view_index), used_indices) -# new_view_name = \ -# temp_dataset.get("View" + str(new_view_index)).attrs["name"] -# if type(new_view_name) == bytes: -# temp_dataset.get("View" + str(new_view_index)).attrs[ -# "name"] = new_view_name.decode("utf-8") -# -# new_view_index += 1 -# else: -# pass -# temp_dataset.get("Metadata").attrs["nbView"] = len(views) -# -# -# def copyhdf5_dataset(source_data_file, destination_data_file, source_dataset_name, -# destination_dataset_name, used_indices): -# """Used to copy a view in a new dataset file using only the examples of -# usedIndices, and copying the args""" -# new_d_set = destination_data_file.create_dataset(destination_dataset_name, -# data=source_data_file.get( -# source_dataset_name).value[ -# used_indices, :]) -# if "sparse" in source_data_file.get(source_dataset_name).attrs.keys() and \ -# source_data_file.get(source_dataset_name).attrs["sparse"]: -# # TODO : Support sparse -# pass -# else: -# for key, value in source_data_file.get(source_dataset_name).attrs.items(): -# new_d_set.attrs[key] = value - - -# -# def add_gaussian_noise(dataset_file, random_state, path_f, dataset_name, -# noise_std=0.15): -# """In this function, we add a guaussian noise centered in 0 with specified -# std to each view, according to it's range (the noise will be -# mutliplied by this range) and we crop the noisy signal according to the -# view's attributes limits. 
-# This is done by creating a new dataset, to keep clean data.""" -# noisy_dataset = h5py.File(path_f + dataset_name + "_noised.hdf5", "w") -# dataset_file.copy("Metadata", noisy_dataset) -# dataset_file.copy("Labels", noisy_dataset) -# for view_index in range(dataset_file.get("Metadata").attrs["nbView"]): -# dataset_file.copy("View" + str(view_index), noisy_dataset) -# for view_index in range(noisy_dataset.get("Metadata").attrs["nbView"]): -# view_name = "View" + str(view_index) -# view_dset = noisy_dataset.get(view_name) -# view_limits = dataset_file[ -# "Metadata/View" + str(view_index) + "_limits"].value -# view_ranges = view_limits[:, 1] - view_limits[:, 0] -# normal_dist = random_state.normal(0, noise_std, view_dset.value.shape) -# noise = normal_dist * view_ranges -# noised_data = view_dset.value + noise -# noised_data = np.where(noised_data < view_limits[:, 0], -# view_limits[:, 0], noised_data) -# noised_data = np.where(noised_data > view_limits[:, 1], -# view_limits[:, 1], noised_data) -# noisy_dataset[view_name][...] = noised_data -# original_dataset_filename = dataset_file.filename -# dataset_file.close() -# noisy_dataset.close() -# noisy_dataset = h5py.File(path_f + dataset_name + "_noised.hdf5", "r") -# if "_temp_" in original_dataset_filename: -# os.remove(original_dataset_filename) -# return noisy_dataset, dataset_name + "_noised" - - -# def getLabelSupports(CLASS_LABELS): -# """Used to get the number of example for each label""" -# labels = set(CLASS_LABELS) -# supports = [CLASS_LABELS.tolist().count(label) for label in labels] -# return supports, dict((label, index) for label, index in zip(labels, range(len(labels)))) - - -# def isUseful(labelSupports, index, CLASS_LABELS, labelDict): -# if labelSupports[labelDict[CLASS_LABELS[index]]] != 0: -# labelSupports[labelDict[CLASS_LABELS[index]]] -= 1 -# return True, labelSupports -# else: -# return False, labelSupports - - -# def splitDataset(DATASET, LEARNING_RATE, DATASET_LENGTH, random_state): -# LABELS = DATASET.get("Labels")[...] 
-# NB_CLASS = int(DATASET["Metadata"].attrs["nbClass"]) -# validationIndices = extractRandomTrainingSet(LABELS, 1 - LEARNING_RATE, DATASET_LENGTH, NB_CLASS, random_state) -# validationIndices.sort() -# return validationIndices - - -# def extractRandomTrainingSet(CLASS_LABELS, LEARNING_RATE, DATASET_LENGTH, NB_CLASS, random_state): -# labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS)) -# nbTrainingExamples = [int(support * LEARNING_RATE) for support in labelSupports] -# trainingExamplesIndices = [] -# usedIndices = [] -# while nbTrainingExamples != [0 for i in range(NB_CLASS)]: -# isUseFull = False -# index = int(random_state.randint(0, DATASET_LENGTH - 1)) -# if index not in usedIndices: -# isUseFull, nbTrainingExamples = isUseful(nbTrainingExamples, index, CLASS_LABELS, labelDict) -# if isUseFull: -# trainingExamplesIndices.append(index) -# usedIndices.append(index) -# return trainingExamplesIndices - - -# def getKFoldIndices(nbFolds, CLASS_LABELS, NB_CLASS, learningIndices, random_state): -# labelSupports, labelDict = getLabelSupports(np.array(CLASS_LABELS[learningIndices])) -# nbTrainingExamples = [[int(support / nbFolds) for support in labelSupports] for fold in range(nbFolds)] -# trainingExamplesIndices = [] -# usedIndices = [] -# for foldIndex, fold in enumerate(nbTrainingExamples): -# trainingExamplesIndices.append([]) -# while fold != [0 for i in range(NB_CLASS)]: -# index = random_state.randint(0, len(learningIndices)) -# if learningIndices[index] not in usedIndices: -# isUseFull, fold = isUseful(fold, learningIndices[index], CLASS_LABELS, labelDict) -# if isUseFull: -# trainingExamplesIndices[foldIndex].append(learningIndices[index]) -# usedIndices.append(learningIndices[index]) -# return trainingExamplesIndices -# -# -# def getPositions(labelsUsed, fullLabels): -# usedIndices = [] -# for labelIndex, label in enumerate(fullLabels): -# if label in labelsUsed: -# usedIndices.append(labelIndex) -# return usedIndices - - -# def getCaltechDBcsv(views, pathF, nameDB, NB_CLASS, LABELS_NAMES, random_state): -# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "w") -# labelsNamesFile = open(pathF + nameDB + '-ClassLabels-Description.csv') -# if len(LABELS_NAMES) != NB_CLASS: -# nbLabelsAvailable = 0 -# for l in labelsNamesFile: -# nbLabelsAvailable += 1 -# LABELS_NAMES = [line.strip().split(";")[1] for lineIdx, line in enumerate(labelsNamesFile) if -# lineIdx in random_state.randint(nbLabelsAvailable, size=NB_CLASS)] -# fullLabels = np.genfromtxt(pathF + nameDB + '-ClassLabels.csv', delimiter=';').astype(int) -# labelsDictionary = dict((classIndice, labelName) for (classIndice, labelName) in -# [(int(line.strip().split(";")[0]), line.strip().split(";")[1]) for lineIndex, line in -# labelsNamesFile if line.strip().split(";")[0] in LABELS_NAMES]) -# if len(set(fullLabels)) > NB_CLASS: -# usedIndices = getPositions(labelsDictionary.keys(), fullLabels) -# else: -# usedIndices = range(len(fullLabels)) -# for viewIndex, view in enumerate(views): -# viewFile = pathF + nameDB + "-" + view + '.csv' -# viewMatrix = np.array(np.genfromtxt(viewFile, delimiter=';'))[usedIndices, :] -# viewDset = datasetFile.create_dataset("View" + str(viewIndex), viewMatrix.shape, data=viewMatrix) -# viewDset.attrs["name"] = view -# -# labelsDset = datasetFile.create_dataset("Labels", fullLabels[usedIndices].shape, data=fullLabels[usedIndices]) -# -# metaDataGrp = datasetFile.create_group("Metadata") -# metaDataGrp.attrs["nbView"] = len(views) -# metaDataGrp.attrs["nbClass"] = NB_CLASS -# 
metaDataGrp.attrs["datasetLength"] = len(fullLabels[usedIndices]) -# datasetFile.close() -# datasetFile = h5py.File(pathF + nameDB + ".hdf5", "r") -# return datasetFile, labelsDictionary - -# --------------------------------------------# -# All the functions below are not useful # -# anymore but the binarization methods in # -# it must be kept # -# --------------------------------------------# - - -# def getMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES, random_state): -# datasetFile = h5py.File(path + "MultiOmic.hdf5", "w") -# -# logging.debug("Start:\t Getting Methylation data") -# methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') -# methylDset = datasetFile.create_dataset("View0", methylData.shape) -# methylDset[...] = methylData -# methylDset.attrs["name"] = "Methyl" -# methylDset.attrs["sparse"] = False -# methylDset.attrs["binary"] = False -# logging.debug("Done:\t Getting Methylation data") -# -# logging.debug("Start:\t Getting MiRNA data") -# mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',') -# mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape) -# mirnaDset[...] = mirnaData -# mirnaDset.attrs["name"] = "MiRNA_" -# mirnaDset.attrs["sparse"] = False -# mirnaDset.attrs["binary"] = False -# logging.debug("Done:\t Getting MiRNA data") -# -# logging.debug("Start:\t Getting RNASeq data") -# rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',') -# uselessRows = [] -# for rowIndex, row in enumerate(np.transpose(rnaseqData)): -# if not row.any(): -# uselessRows.append(rowIndex) -# usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows] -# rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows))) -# rnaseqDset[...] = rnaseqData[:, usefulRows] -# rnaseqDset.attrs["name"] = "RNASeq_" -# rnaseqDset.attrs["sparse"] = False -# rnaseqDset.attrs["binary"] = False -# logging.debug("Done:\t Getting RNASeq data") -# -# logging.debug("Start:\t Getting Clinical data") -# clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',') -# clinicalDset = datasetFile.create_dataset("View3", clinical.shape) -# clinicalDset[...] = clinical -# clinicalDset.attrs["name"] = "Clinic" -# clinicalDset.attrs["sparse"] = False -# clinicalDset.attrs["binary"] = False -# logging.debug("Done:\t Getting Clinical data") -# -# labelFile = open(path + 'brca_labels_triple-negatif.csv') -# labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) -# labelsDset = datasetFile.create_dataset("Labels", labels.shape) -# labelsDset[...] 
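All of the removed loaders (getCaltechDBcsv, getMultiOmicDBcsv, ...) write the same HDF5 layout: one View<i> dataset per view, a Labels dataset, and a Metadata group carrying nbView, nbClass and datasetLength. A self-contained toy example of that layout (file name and contents invented for illustration):

```python
import h5py
import numpy as np

rs = np.random.RandomState(0)
views = [rs.rand(10, 4), rs.rand(10, 6)]
labels = rs.randint(0, 2, 10)

with h5py.File("toy_multiview.hdf5", "w") as f:
    for i, view in enumerate(views):
        dset = f.create_dataset("View" + str(i), data=view)
        dset.attrs["name"] = "view_" + str(i)
        dset.attrs["sparse"] = False
    f.create_dataset("Labels", data=labels)
    meta = f.create_group("Metadata")
    meta.attrs["nbView"] = len(views)
    meta.attrs["nbClass"] = len(np.unique(labels))
    meta.attrs["datasetLength"] = len(labels)
```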
= labels -# labelsDset.attrs["name"] = "Labels" -# -# metaDataGrp = datasetFile.create_group("Metadata") -# metaDataGrp.attrs["nbView"] = 4 -# metaDataGrp.attrs["nbClass"] = 2 -# metaDataGrp.attrs["datasetLength"] = len(labels) -# labelDictionary = {0: "No", 1: "Yes"} -# datasetFile.close() -# datasetFile = h5py.File(path + "MultiOmic.hdf5", "r") -# # datasetFile = getPseudoRNASeq(datasetFile) -# return datasetFile, labelDictionary -# -# -# def getVector(nbGenes): -# argmax = [0, 0] -# maxi = 0 -# for i in range(nbGenes): -# for j in range(nbGenes): -# if j == i + 1: -# value = (i + 1) * (nbGenes - j) -# if value > maxi: -# maxi = value -# argmax = [i, j] -# i, j = argmax -# vectorLeft = np.zeros(nbGenes, dtype=bool) -# vectorLeft[:i + 1] = np.ones(i + 1, dtype=bool) -# vectorSup = np.zeros(nbGenes, dtype=bool) -# vectorSup[j:] = np.ones(nbGenes - j, dtype=bool) -# matrixSup = j -# matrixInf = nbGenes - j -# return vectorLeft, matrixSup, matrixInf -# -# -# def findClosestPowerOfTwo(factorizationParam): -# power = 1 -# while factorizationParam - power > 0: -# power *= 2 -# if abs(factorizationParam - power) < abs(factorizationParam - power / 2): -# return power -# else: -# return power / 2 -# -# -# def easyFactorize(nbGenes, factorizationParam, t=0): -# if math.log(factorizationParam + 1, 2) % 1 == 0.0: -# pass -# else: -# factorizationParam = findClosestPowerOfTwo(factorizationParam) - 1 -# -# if nbGenes == 2: -# return 1, np.array([True, False]) -# -# if nbGenes == 3: -# return 1, np.array([True, True, False]) -# -# if factorizationParam == 1: -# t = 1 -# return t, getVector(nbGenes)[0] -# -# vectorLeft, matrixSup, matrixInf = getVector(nbGenes) -# -# t_, vectorLeftSup = easyFactorize(matrixSup, (factorizationParam - 1) / 2, t=t) -# t__, vectorLeftInf = easyFactorize(matrixInf, (factorizationParam - 1) / 2, t=t) -# -# factorLeft = np.zeros((nbGenes, t_ + t__ + 1), dtype=bool) -# -# factorLeft[:matrixSup, :t_] = vectorLeftSup.reshape(factorLeft[:matrixSup, :t_].shape) -# if nbGenes % 2 == 1: -# factorLeft[matrixInf - 1:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf - 1:, t_:t__ + t_].shape) -# else: -# factorLeft[matrixInf:, t_:t__ + t_] = vectorLeftInf.reshape(factorLeft[matrixInf:, t_:t__ + t_].shape) -# factorLeft[:, t__ + t_] = vectorLeft -# -# # factorSup = np.zeros((t_+t__+1, nbGenes), dtype=bool) -# # -# # factorSup[:t_, :matrixSup] = vectorSupLeft.reshape(factorSup[:t_, :matrixSup].shape) -# # if nbGenes%2==1: -# # factorSup[t_:t__+t_, matrixInf-1:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf-1:].shape) -# # else: -# # factorSup[t_:t__+t_, matrixInf:] = vectorSupRight.reshape(factorSup[t_:t__+t_, matrixInf:].shape) -# # factorSup[t__+t_, :] = vectorSup -# return t__ + t_ + 1, factorLeft # , factorSup -# -# -# def getBaseMatrices(nbGenes, factorizationParam, path): -# t, factorLeft = easyFactorize(nbGenes, factorizationParam) -# np.savetxt(path + "factorLeft--n-" + str(nbGenes) + "--k-" + str(factorizationParam) + ".csv", factorLeft, -# delimiter=",") -# return factorLeft -# -# -# def findParams(arrayLen, nbPatients, random_state, maxNbBins=2000, minNbBins=10, maxLenBin=70000, minOverlapping=1, -# minNbBinsOverlapped=0, maxNbSolutions=30): -# results = [] -# if arrayLen * arrayLen * 10 / 100 > minNbBinsOverlapped * nbPatients: -# for lenBin in range(arrayLen - 1): -# lenBin += 1 -# if lenBin < maxLenBin and minNbBins * lenBin < arrayLen: -# for overlapping in sorted(range(lenBin - 1), reverse=True): -# overlapping += 1 -# if overlapping > minOverlapping and 
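Since the banner above flags the binarization helpers as worth keeping, here is a Python 3 rewrite of the removed findClosestPowerOfTwo (the original returns power / 2, which is a float under Python 3); behaviour checked against the values used elsewhere in this file:

```python
def find_closest_power_of_two(k):
    # First power of two not smaller than k, then keep whichever of it and its
    # half is closer to k (ties go to the half, as in the original).
    power = 1
    while k - power > 0:
        power *= 2
    return power if abs(k - power) < abs(k - power // 2) else power // 2

assert find_closest_power_of_two(9) == 8
assert find_closest_power_of_two(17) == 16
assert find_closest_power_of_two(58) == 64
```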
lenBin % (lenBin - overlapping) == 0: -# for nbBins in sorted(range(arrayLen - 1), reverse=True): -# nbBins += 1 -# if nbBins < maxNbBins: -# if arrayLen == (nbBins - 1) * (lenBin - overlapping) + lenBin: -# results.append({"nbBins": nbBins, "overlapping": overlapping, "lenBin": lenBin}) -# if len(results) == maxNbSolutions: -# params = preds[random_state.randrange(len(preds))] -# return params -# -# -# def findBins(nbBins=142, overlapping=493, lenBin=986): -# bins = [] -# for binIndex in range(nbBins): -# bins.append([i + binIndex * (lenBin - overlapping) for i in range(lenBin)]) -# return bins -# -# -# def getBins(array, bins, lenBin, overlapping): -# binnedcoord = [] -# for coordIndex, coord in enumerate(array): -# nbBinsFull = 0 -# for binIndex, bin_ in enumerate(bins): -# if coordIndex in bin_: -# binnedcoord.append(binIndex + (coord * len(bins))) -# -# return np.array(binnedcoord) -# -# -# def makeSortedBinsMatrix(nbBins, lenBins, overlapping, arrayLen, path): -# sortedBinsMatrix = np.zeros((arrayLen, nbBins), dtype=np.uint8) -# step = lenBins - overlapping -# for binIndex in range(nbBins): -# sortedBinsMatrix[step * binIndex:lenBins + (step * binIndex), binIndex] = np.ones(lenBins, dtype=np.uint8) -# np.savetxt(path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", -# sortedBinsMatrix, delimiter=",") -# return sortedBinsMatrix -# -# -# def makeSparseTotalMatrix(sortedRNASeq, random_state): -# nbPatients, nbGenes = sortedRNASeq.shape -# params = findParams(nbGenes, nbPatients, random_state) -# nbBins = params["nbBins"] -# overlapping = params["overlapping"] -# lenBin = params["lenBin"] -# bins = findBins(nbBins, overlapping, lenBin) -# sparseFull = sparse.csc_matrix((nbPatients, nbGenes * nbBins)) -# for patientIndex, patient in enumerate(sortedRNASeq): -# columnIndices = getBins(patient, bins, lenBin, overlapping) -# rowIndices = np.zeros(len(columnIndices), dtype=int) + patientIndex -# data = np.ones(len(columnIndices), dtype=bool) -# sparseFull = sparseFull + sparse.csc_matrix((data, (rowIndices, columnIndices)), -# shape=(nbPatients, nbGenes * nbBins)) -# return sparseFull -# -# -# def getAdjacenceMatrix(RNASeqRanking, sotredRNASeq, k=2): -# k = int(k) / 2 * 2 -# indices = np.zeros((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=int) -# data = np.ones((RNASeqRanking.shape[0] * k * RNASeqRanking.shape[1]), dtype=bool) -# indptr = np.zeros(RNASeqRanking.shape[0] + 1, dtype=int) -# nbGenes = RNASeqRanking.shape[1] -# pointer = 0 -# for patientIndex in range(RNASeqRanking.shape[0]): -# for i in range(nbGenes): -# for j in range(k / 2): -# try: -# indices[pointer] = RNASeqRanking[ -# patientIndex, (sotredRNASeq[patientIndex, i] - (j + 1))] + i * nbGenes -# pointer += 1 -# except: -# pass -# try: -# indices[pointer] = RNASeqRanking[ -# patientIndex, (sotredRNASeq[patientIndex, i] + (j + 1))] + i * nbGenes -# pointer += 1 -# except: -# pass -# # elif i<=k: -# # indices.append(patient[1]+patient[i]*nbGenes) -# # data.append(True) -# # elif i==nbGenes-1: -# # indices.append(patient[i-1]+patient[i]*nbGenes) -# # data.append(True) -# indptr[patientIndex + 1] = pointer -# -# mat = sparse.csr_matrix((data, indices, indptr), -# shape=(RNASeqRanking.shape[0], RNASeqRanking.shape[1] * RNASeqRanking.shape[1]), dtype=bool) -# return mat -# -# -# def getKMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): -# datasetFile = h5py.File(path + "KMultiOmic.hdf5", "w") -# -# # logging.debug("Start:\t Getting Methylation data") -# 
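The removed binning helpers (findBins, getBins, makeSortedBinsMatrix) encode a sorted feature axis into overlapping one-hot bins satisfying array_len == (nb_bins - 1) * (len_bin - overlapping) + len_bin; findParams could not have run as written anyway, since it refers to an undefined preds variable. A small sketch of the bin-matrix construction (names are mine):

```python
import numpy as np

def sorted_bins_matrix(nb_bins, len_bin, overlapping, array_len):
    # One column per bin; each bin covers len_bin consecutive sorted positions,
    # shifted by (len_bin - overlapping) from the previous bin.
    mat = np.zeros((array_len, nb_bins), dtype=np.uint8)
    step = len_bin - overlapping
    for b in range(nb_bins):
        mat[step * b: len_bin + step * b, b] = 1
    return mat

# Three bins of length 4 overlapping by 1 exactly cover (3 - 1) * 3 + 4 == 10 positions.
m = sorted_bins_matrix(nb_bins=3, len_bin=4, overlapping=1, array_len=10)
assert m.sum(axis=0).tolist() == [4, 4, 4]
```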
methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') -# logging.debug("Done:\t Getting Methylation data") -# -# logging.debug("Start:\t Getting Sorted Methyl data") -# Methyl = methylData -# sortedMethylGeneIndices = np.zeros(methylData.shape, dtype=int) -# MethylRanking = np.zeros(methylData.shape, dtype=int) -# for exampleIndex, exampleArray in enumerate(Methyl): -# sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray)) -# sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1)) -# sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int) -# sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray -# for geneIndex in range(Methyl.shape[1]): -# MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex -# logging.debug("Done:\t Getting Sorted Methyl data") -# -# logging.debug("Start:\t Getting Binarized Methyl data") -# k = findClosestPowerOfTwo(9) - 1 -# try: -# factorizedLeftBaseMatrix = np.genfromtxt( -# path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') -# except: -# factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) -# bMethylDset = datasetFile.create_dataset("View0", -# (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), -# dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): -# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] -# bMethylDset[patientIndex] = patientMatrix.flatten() -# bMethylDset.attrs["name"] = "BMethyl" + str(k) -# bMethylDset.attrs["sparse"] = False -# bMethylDset.attrs["binary"] = True -# logging.debug("Done:\t Getting Binarized Methyl data") -# -# logging.debug("Start:\t Getting Binned Methyl data") -# lenBins = 3298 -# nbBins = 9 -# overlapping = 463 -# try: -# sortedBinsMatrix = np.genfromtxt( -# path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", -# delimiter=",") -# except: -# sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path) -# binnedMethyl = datasetFile.create_dataset("View1", ( -# sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): -# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] -# binnedMethyl[patientIndex] = patientMatrix.flatten() -# binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins) -# binnedMethyl.attrs["sparse"] = False -# binnedMethyl.attrs["binary"] = True -# logging.debug("Done:\t Getting Binned Methyl data") -# -# logging.debug("Start:\t Getting Binarized Methyl data") -# k = findClosestPowerOfTwo(17) - 1 -# try: -# factorizedLeftBaseMatrix = np.genfromtxt( -# path + "factorLeft--n-" + str(methylData.shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') -# except: -# factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) -# bMethylDset = datasetFile.create_dataset("View2", -# (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), -# dtype=np.uint8) -# for patientIndex, patientSortedArray 
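The per-example sorting loops above (building sortedMethylGeneIndices and MethylRanking from dictionaries and operator.itemgetter) are equivalent to a double argsort. A vectorized illustration, with variable names of my own choosing:

```python
import numpy as np

X = np.array([[0.3, 0.9, 0.1],
              [0.5, 0.2, 0.8]])

sorted_gene_indices = np.argsort(X, axis=1)        # plays the role of sortedMethylGeneIndices
ranking = np.argsort(sorted_gene_indices, axis=1)  # plays the role of MethylRanking

assert sorted_gene_indices[0].tolist() == [2, 0, 1]
assert ranking[0].tolist() == [1, 2, 0]
```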
in enumerate(sortedMethylGeneIndices): -# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] -# bMethylDset[patientIndex] = patientMatrix.flatten() -# bMethylDset.attrs["name"] = "BMethyl" + str(k) -# bMethylDset.attrs["sparse"] = False -# bMethylDset.attrs["binary"] = True -# logging.debug("Done:\t Getting Binarized Methyl data") -# -# logging.debug("Start:\t Getting Binned Methyl data") -# lenBins = 2038 -# nbBins = 16 -# overlapping = 442 -# try: -# sortedBinsMatrix = np.genfromtxt( -# path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", -# delimiter=",") -# except: -# sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, methylData.shape[1], path) -# binnedMethyl = datasetFile.create_dataset("View3", ( -# sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): -# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] -# binnedMethyl[patientIndex] = patientMatrix.flatten() -# binnedMethyl.attrs["name"] = "bMethyl" + str(nbBins) -# binnedMethyl.attrs["sparse"] = False -# binnedMethyl.attrs["binary"] = True -# logging.debug("Done:\t Getting Binned Methyl data") -# -# labelFile = open(path + 'brca_labels_triple-negatif.csv') -# labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) -# labelsDset = datasetFile.create_dataset("Labels", labels.shape) -# labelsDset[...] = labels -# labelsDset.attrs["name"] = "Labels" -# -# metaDataGrp = datasetFile.create_group("Metadata") -# metaDataGrp.attrs["nbView"] = 4 -# metaDataGrp.attrs["nbClass"] = 2 -# metaDataGrp.attrs["datasetLength"] = len(labels) -# labelDictionary = {0: "No", 1: "Yes"} -# -# datasetFile.close() -# datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r") -# -# return datasetFile, labelDictionary -# -# -# def getKMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): -# datasetFile = h5py.File(path + "KMultiOmic.hdf5", "r") -# labelDictionary = {0: "No", 1: "Yes"} -# return datasetFile, labelDictionary -# -# -# def getModifiedMultiOmicDBcsv(features, path, name, NB_CLASS, LABELS_NAMES): -# datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "w") -# -# logging.debug("Start:\t Getting Methylation data") -# methylData = np.genfromtxt(path + "matching_methyl.csv", delimiter=',') -# methylDset = datasetFile.create_dataset("View0", methylData.shape) -# methylDset[...] = methylData -# methylDset.attrs["name"] = "Methyl_" -# methylDset.attrs["sparse"] = False -# methylDset.attrs["binary"] = False -# logging.debug("Done:\t Getting Methylation data") -# -# logging.debug("Start:\t Getting Sorted Methyl data") -# Methyl = datasetFile["View0"][...] 
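Each of the repeated per-patient loops above gives a gene the k-bit code word associated with its rank and flattens the result into an (n_samples, n_genes * k) uint8 view. A condensed sketch of that re-encoding, assuming each row of sorted_gene_indices is a permutation as produced by argsort:

```python
import numpy as np

def binarize_from_ranks(sorted_gene_indices, base_matrix):
    # base_matrix holds one code word per rank: row r encodes rank r.
    n_samples, n_genes = sorted_gene_indices.shape
    k = base_matrix.shape[1]
    out = np.zeros((n_samples, n_genes * k), dtype=np.uint8)
    for i, order in enumerate(sorted_gene_indices):
        patient = np.zeros((n_genes, k), dtype=np.uint8)
        patient[order] = base_matrix  # gene order[r] receives the code of rank r
        out[i] = patient.ravel()
    return out

rs = np.random.RandomState(0)
ranks = np.argsort(rs.rand(3, 5), axis=1)
codes = (rs.rand(5, 2) > 0.5).astype(np.uint8)
assert binarize_from_ranks(ranks, codes).shape == (3, 10)
```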
-# sortedMethylGeneIndices = np.zeros(datasetFile.get("View0").shape, dtype=int) -# MethylRanking = np.zeros(datasetFile.get("View0").shape, dtype=int) -# for exampleIndex, exampleArray in enumerate(Methyl): -# sortedMethylDictionary = dict((index, value) for index, value in enumerate(exampleArray)) -# sortedMethylIndicesDict = sorted(sortedMethylDictionary.items(), key=operator.itemgetter(1)) -# sortedMethylIndicesArray = np.array([index for (index, value) in sortedMethylIndicesDict], dtype=int) -# sortedMethylGeneIndices[exampleIndex] = sortedMethylIndicesArray -# for geneIndex in range(Methyl.shape[1]): -# MethylRanking[exampleIndex, sortedMethylIndicesArray[geneIndex]] = geneIndex -# mMethylDset = datasetFile.create_dataset("View10", sortedMethylGeneIndices.shape, data=sortedMethylGeneIndices) -# mMethylDset.attrs["name"] = "SMethyl" -# mMethylDset.attrs["sparse"] = False -# mMethylDset.attrs["binary"] = False -# logging.debug("Done:\t Getting Sorted Methyl data") -# -# logging.debug("Start:\t Getting Binarized Methyl data") -# k = findClosestPowerOfTwo(58) - 1 -# try: -# factorizedLeftBaseMatrix = np.genfromtxt( -# path + "factorLeft--n-" + str(datasetFile.get("View0").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') -# except: -# factorizedLeftBaseMatrix = getBaseMatrices(methylData.shape[1], k, path) -# bMethylDset = datasetFile.create_dataset("View11", -# (sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * k), -# dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): -# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], k), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] -# bMethylDset[patientIndex] = patientMatrix.flatten() -# bMethylDset.attrs["name"] = "BMethyl" -# bMethylDset.attrs["sparse"] = False -# bMethylDset.attrs["binary"] = True -# logging.debug("Done:\t Getting Binarized Methyl data") -# -# logging.debug("Start:\t Getting Binned Methyl data") -# lenBins = 2095 -# nbBins = 58 -# overlapping = 1676 -# try: -# sortedBinsMatrix = np.genfromtxt( -# path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", -# delimiter=",") -# except: -# sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View0").shape[1], path) -# binnedMethyl = datasetFile.create_dataset("View12", ( -# sortedMethylGeneIndices.shape[0], sortedMethylGeneIndices.shape[1] * nbBins), dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedMethylGeneIndices): -# patientMatrix = np.zeros((sortedMethylGeneIndices.shape[1], nbBins), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] -# binnedMethyl[patientIndex] = patientMatrix.flatten() -# binnedMethyl.attrs["name"] = "bMethyl" -# binnedMethyl.attrs["sparse"] = False -# binnedMethyl.attrs["binary"] = True -# logging.debug("Done:\t Getting Binned Methyl data") -# -# logging.debug("Start:\t Getting MiRNA data") -# mirnaData = np.genfromtxt(path + "matching_mirna.csv", delimiter=',') -# mirnaDset = datasetFile.create_dataset("View1", mirnaData.shape) -# mirnaDset[...] 
= mirnaData -# mirnaDset.attrs["name"] = "MiRNA__" -# mirnaDset.attrs["sparse"] = False -# mirnaDset.attrs["binary"] = False -# logging.debug("Done:\t Getting MiRNA data") -# -# logging.debug("Start:\t Getting Sorted MiRNA data") -# MiRNA = datasetFile["View1"][...] -# sortedMiRNAGeneIndices = np.zeros(datasetFile.get("View1").shape, dtype=int) -# MiRNARanking = np.zeros(datasetFile.get("View1").shape, dtype=int) -# for exampleIndex, exampleArray in enumerate(MiRNA): -# sortedMiRNADictionary = dict((index, value) for index, value in enumerate(exampleArray)) -# sortedMiRNAIndicesDict = sorted(sortedMiRNADictionary.items(), key=operator.itemgetter(1)) -# sortedMiRNAIndicesArray = np.array([index for (index, value) in sortedMiRNAIndicesDict], dtype=int) -# sortedMiRNAGeneIndices[exampleIndex] = sortedMiRNAIndicesArray -# for geneIndex in range(MiRNA.shape[1]): -# MiRNARanking[exampleIndex, sortedMiRNAIndicesArray[geneIndex]] = geneIndex -# mmirnaDset = datasetFile.create_dataset("View7", sortedMiRNAGeneIndices.shape, data=sortedMiRNAGeneIndices) -# mmirnaDset.attrs["name"] = "SMiRNA_" -# mmirnaDset.attrs["sparse"] = False -# mmirnaDset.attrs["binary"] = False -# logging.debug("Done:\t Getting Sorted MiRNA data") -# -# logging.debug("Start:\t Getting Binarized MiRNA data") -# k = findClosestPowerOfTwo(517) - 1 -# try: -# factorizedLeftBaseMatrix = np.genfromtxt( -# path + "factorLeft--n-" + str(datasetFile.get("View1").shape[1]) + "--k-" + str(k) + ".csv", delimiter=',') -# except: -# factorizedLeftBaseMatrix = getBaseMatrices(mirnaData.shape[1], k, path) -# bmirnaDset = datasetFile.create_dataset("View8", -# (sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * k), -# dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices): -# patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], k), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] -# bmirnaDset[patientIndex] = patientMatrix.flatten() -# bmirnaDset.attrs["name"] = "BMiRNA_" -# bmirnaDset.attrs["sparse"] = False -# bmirnaDset.attrs["binary"] = True -# logging.debug("Done:\t Getting Binarized MiRNA data") -# -# logging.debug("Start:\t Getting Binned MiRNA data") -# lenBins = 14 -# nbBins = 517 -# overlapping = 12 -# try: -# sortedBinsMatrix = np.genfromtxt( -# path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", -# delimiter=",") -# except: -# sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View1").shape[1], path) -# binnedMiRNA = datasetFile.create_dataset("View9", ( -# sortedMiRNAGeneIndices.shape[0], sortedMiRNAGeneIndices.shape[1] * nbBins), dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedMiRNAGeneIndices): -# patientMatrix = np.zeros((sortedMiRNAGeneIndices.shape[1], nbBins), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] -# binnedMiRNA[patientIndex] = patientMatrix.flatten() -# binnedMiRNA.attrs["name"] = "bMiRNA_" -# binnedMiRNA.attrs["sparse"] = False -# binnedMiRNA.attrs["binary"] = True -# logging.debug("Done:\t Getting Binned MiRNA data") -# -# logging.debug("Start:\t Getting RNASeq data") -# rnaseqData = np.genfromtxt(path + "matching_rnaseq.csv", delimiter=',') -# uselessRows = [] -# for rowIndex, row in enumerate(np.transpose(rnaseqData)): -# if not row.any(): -# 
uselessRows.append(rowIndex) -# usefulRows = [usefulRowIndex for usefulRowIndex in range(rnaseqData.shape[1]) if usefulRowIndex not in uselessRows] -# rnaseqDset = datasetFile.create_dataset("View2", (rnaseqData.shape[0], len(usefulRows))) -# rnaseqDset[...] = rnaseqData[:, usefulRows] -# rnaseqDset.attrs["name"] = "RNASeq_" -# rnaseqDset.attrs["sparse"] = False -# rnaseqDset.attrs["binary"] = False -# logging.debug("Done:\t Getting RNASeq data") -# -# logging.debug("Start:\t Getting Sorted RNASeq data") -# RNASeq = datasetFile["View2"][...] -# sortedRNASeqGeneIndices = np.zeros(datasetFile.get("View2").shape, dtype=int) -# RNASeqRanking = np.zeros(datasetFile.get("View2").shape, dtype=int) -# for exampleIndex, exampleArray in enumerate(RNASeq): -# sortedRNASeqDictionary = dict((index, value) for index, value in enumerate(exampleArray)) -# sortedRNASeqIndicesDict = sorted(sortedRNASeqDictionary.items(), key=operator.itemgetter(1)) -# sortedRNASeqIndicesArray = np.array([index for (index, value) in sortedRNASeqIndicesDict], dtype=int) -# sortedRNASeqGeneIndices[exampleIndex] = sortedRNASeqIndicesArray -# for geneIndex in range(RNASeq.shape[1]): -# RNASeqRanking[exampleIndex, sortedRNASeqIndicesArray[geneIndex]] = geneIndex -# mrnaseqDset = datasetFile.create_dataset("View4", sortedRNASeqGeneIndices.shape, data=sortedRNASeqGeneIndices) -# mrnaseqDset.attrs["name"] = "SRNASeq" -# mrnaseqDset.attrs["sparse"] = False -# mrnaseqDset.attrs["binary"] = False -# logging.debug("Done:\t Getting Sorted RNASeq data") -# -# logging.debug("Start:\t Getting Binarized RNASeq data") -# k = findClosestPowerOfTwo(100) - 1 -# try: -# factorizedLeftBaseMatrix = np.genfromtxt( -# path + "factorLeft--n-" + str(datasetFile.get("View2").shape[1]) + "--k-" + str(100) + ".csv", -# delimiter=',') -# except: -# factorizedLeftBaseMatrix = getBaseMatrices(rnaseqData.shape[1], k, path) -# brnaseqDset = datasetFile.create_dataset("View5", -# (sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * k), -# dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices): -# patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], k), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = factorizedLeftBaseMatrix[lineIndex, :] -# brnaseqDset[patientIndex] = patientMatrix.flatten() -# brnaseqDset.attrs["name"] = "BRNASeq" -# brnaseqDset.attrs["sparse"] = False -# brnaseqDset.attrs["binary"] = True -# logging.debug("Done:\t Getting Binarized RNASeq data") -# -# logging.debug("Start:\t Getting Binned RNASeq data") -# lenBins = 986 -# nbBins = 142 -# overlapping = 493 -# try: -# sortedBinsMatrix = np.genfromtxt( -# path + "sortedBinsMatrix--t-" + str(lenBins) + "--n-" + str(nbBins) + "--c-" + str(overlapping) + ".csv", -# delimiter=",") -# except: -# sortedBinsMatrix = makeSortedBinsMatrix(nbBins, lenBins, overlapping, datasetFile.get("View2").shape[1], path) -# binnedRNASeq = datasetFile.create_dataset("View6", ( -# sortedRNASeqGeneIndices.shape[0], sortedRNASeqGeneIndices.shape[1] * nbBins), dtype=np.uint8) -# for patientIndex, patientSortedArray in enumerate(sortedRNASeqGeneIndices): -# patientMatrix = np.zeros((sortedRNASeqGeneIndices.shape[1], nbBins), dtype=np.uint8) -# for lineIndex, geneIndex in enumerate(patientSortedArray): -# patientMatrix[geneIndex] = sortedBinsMatrix[lineIndex, :] -# binnedRNASeq[patientIndex] = patientMatrix.flatten() -# binnedRNASeq.attrs["name"] = "bRNASeq" -# binnedRNASeq.attrs["sparse"] = False -# 
binnedRNASeq.attrs["binary"] = True -# logging.debug("Done:\t Getting Binned RNASeq data") -# -# logging.debug("Start:\t Getting Clinical data") -# clinical = np.genfromtxt(path + "clinicalMatrix.csv", delimiter=',') -# clinicalDset = datasetFile.create_dataset("View3", clinical.shape) -# clinicalDset[...] = clinical -# clinicalDset.attrs["name"] = "Clinic_" -# clinicalDset.attrs["sparse"] = False -# clinicalDset.attrs["binary"] = False -# logging.debug("Done:\t Getting Clinical data") -# -# logging.debug("Start:\t Getting Binarized Clinical data") -# binarized_clinical = np.zeros((347, 1951), dtype=np.uint8) -# nb_already_done = 0 -# for feqtureIndex, feature in enumerate(np.transpose(clinical)): -# featureSet = set(feature) -# featureDict = dict((val, valIndex) for valIndex, val in enumerate(list(featureSet))) -# for valueIndex, value in enumerate(feature): -# binarized_clinical[valueIndex, featureDict[value] + nb_already_done] = 1 -# nb_already_done += len(featureSet) -# bClinicalDset = datasetFile.create_dataset("View13", binarized_clinical.shape, dtype=np.uint8, -# data=binarized_clinical) -# bClinicalDset.attrs["name"] = "bClinic" -# bClinicalDset.attrs["sparse"] = False -# bClinicalDset.attrs["binary"] = True -# logging.debug("Done:\t Getting Binarized Clinical data") -# -# # logging.debug("Start:\t Getting Adjacence RNASeq data") -# # sparseAdjRNASeq = getAdjacenceMatrix(RNASeqRanking, sortedRNASeqGeneIndices, k=findClosestPowerOfTwo(10)-1) -# # sparseAdjRNASeqGrp = datasetFile.create_group("View6") -# # dataDset = sparseAdjRNASeqGrp.create_dataset("data", sparseAdjRNASeq.data.shape, data=sparseAdjRNASeq.data) -# # indicesDset = sparseAdjRNASeqGrp.create_dataset("indices", -# # sparseAdjRNASeq.indices.shape, data=sparseAdjRNASeq.indices) -# # indptrDset = sparseAdjRNASeqGrp.create_dataset("indptr", -# # sparseAdjRNASeq.indptr.shape, data=sparseAdjRNASeq.indptr) -# # sparseAdjRNASeqGrp.attrs["name"]="ARNASeq" -# # sparseAdjRNASeqGrp.attrs["sparse"]=True -# # sparseAdjRNASeqGrp.attrs["shape"]=sparseAdjRNASeq.shape -# # logging.debug("Done:\t Getting Adjacence RNASeq data") -# -# labelFile = open(path + 'brca_labels_triple-negatif.csv') -# labels = np.array([int(line.strip().split(',')[1]) for line in labelFile]) -# labelsDset = datasetFile.create_dataset("Labels", labels.shape) -# labelsDset[...] 
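The binarized clinical block above is a hand-rolled one-hot encoding of categorical columns; scikit-learn's OneHotEncoder builds the same kind of matrix. A generic illustration (not the code summit uses):

```python
import numpy as np
from sklearn.preprocessing import OneHotEncoder

clinical = np.array([[1, 10],
                     [2, 10],
                     [1, 30]])

# One column per distinct value of each original feature, as in the removed loop.
binarized = OneHotEncoder(dtype=np.uint8).fit_transform(clinical).toarray()
assert binarized.shape == (3, 4)
```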
= labels -# labelsDset.attrs["name"] = "Labels" -# -# metaDataGrp = datasetFile.create_group("Metadata") -# metaDataGrp.attrs["nbView"] = 14 -# metaDataGrp.attrs["nbClass"] = 2 -# metaDataGrp.attrs["datasetLength"] = len(labels) -# labelDictionary = {0: "No", 1: "Yes"} -# -# datasetFile.close() -# datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r") -# -# return datasetFile, labelDictionary -# -# -# def getModifiedMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): -# datasetFile = h5py.File(path + "ModifiedMultiOmic.hdf5", "r") -# labelDictionary = {0: "No", 1: "Yes"} -# return datasetFile, labelDictionary -# -# -# def getMultiOmicDBhdf5(features, path, name, NB_CLASS, LABELS_NAMES): -# datasetFile = h5py.File(path + "MultiOmic.hdf5", "r") -# labelDictionary = {0: "No", 1: "Yes"} -# return datasetFile, labelDictionary -# -# -# -# # def getOneViewFromDB(viewName, pathToDB, DBName): -# # view = np.genfromtxt(pathToDB + DBName +"-" + viewName, delimiter=';') -# # return view -# -# -# # def getClassLabels(pathToDB, DBName): -# # labels = np.genfromtxt(pathToDB + DBName + "-" + "ClassLabels.csv", delimiter=';') -# # return labels -# -# -# # def getDataset(pathToDB, viewNames, DBName): -# # dataset = [] -# # for viewName in viewNames: -# # dataset.append(getOneViewFromDB(viewName, pathToDB, DBName)) -# # return np.array(dataset) -# -# -# # def getAwaLabels(nbLabels, pathToAwa): -# # labelsFile = open(pathToAwa + 'Animals_with_Attributes/classes.txt', 'U') -# # linesFile = [''.join(line.strip().split()).translate(None, digits) for line in labelsFile.readlines()] -# # return linesFile -# -# -# # def getAwaDBcsv(views, pathToAwa, nameDB, nbLabels, LABELS_NAMES): -# # awaLabels = getAwaLabels(nbLabels, pathToAwa) -# # nbView = len(views) -# # nbMaxLabels = len(awaLabels) -# # if nbLabels == -1: -# # nbLabels = nbMaxLabels -# # nbNamesGiven = len(LABELS_NAMES) -# # if nbNamesGiven > nbLabels: -# # labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbLabels)} -# # elif nbNamesGiven < nbLabels and nbLabels <= nbMaxLabels: -# # if LABELS_NAMES != ['']: -# # labelDictionary = {i:LABELS_NAMES[i] for i in np.arange(nbNamesGiven)} -# # else: -# # labelDictionary = {} -# # nbNamesGiven = 0 -# # nbLabelsToAdd = nbLabels-nbNamesGiven -# # while nbLabelsToAdd > 0: -# # currentLabel = random.choice(awaLabels) -# # if currentLabel not in labelDictionary.values(): -# # labelDictionary[nbLabels-nbLabelsToAdd]=currentLabel -# # nbLabelsToAdd -= 1 -# # else: -# # pass -# # else: -# # labelDictionary = {i: LABELS_NAMES[i] for i in np.arange(nbNamesGiven)} -# # viewDictionary = {i: views[i] for i in np.arange(nbView)} -# # rawData = [] -# # labels = [] -# # nbExample = 0 -# # for view in np.arange(nbView): -# # viewData = [] -# # for labelIndex in np.arange(nbLabels): -# # pathToExamples = pathToAwa + 'Animals_with_Attributes/Features/' + viewDictionary[view] + '/' + \ -# # labelDictionary[labelIndex] + '/' -# # examples = os.listdir(pathToExamples) -# # if view == 0: -# # nbExample += len(examples) -# # for example in examples: -# # if viewDictionary[view]=='decaf': -# # exampleFile = open(pathToExamples + example) -# # viewData.append([float(line.strip()) for line in exampleFile]) -# # else: -# # exampleFile = open(pathToExamples + example) -# # viewData.append([[float(coordinate) for coordinate in raw.split()] for raw in exampleFile][0]) -# # if view == 0: -# # labels.append(labelIndex) -# # -# # rawData.append(np.array(viewData)) -# # data = rawData -# # DATASET_LENGTH = len(labels) -# # 
return data, labels, labelDictionary, DATASET_LENGTH -# # -# # -# # def getDbfromCSV(path): -# # files = os.listdir(path) -# # DATA = np.zeros((3,40,2)) -# # for file in files: -# # if file[-9:]=='moins.csv' and file[:7]=='sample1': -# # X = open(path+file) -# # for x, i in zip(X, range(20)): -# # DATA[0, i] = np.array([float(coord) for coord in x.strip().split('\t')]) -# # if file[-9:]=='moins.csv' and file[:7]=='sample2': -# # X = open(path+file) -# # for x, i in zip(X, range(20)): -# # DATA[1, i] = np.array([float(coord) for coord in x.strip().split('\t')]) -# # if file[-9:]=='moins.csv' and file[:7]=='sample3': -# # X = open(path+file) -# # for x, i in zip(X, range(20)): -# # DATA[2, i] = np.array([float(coord) for coord in x.strip().split('\t')]) -# # -# # for file in files: -# # if file[-8:]=='plus.csv' and file[:7]=='sample1': -# # X = open(path+file) -# # for x, i in zip(X, range(20)): -# # DATA[0, i+20] = np.array([float(coord) for coord in x.strip().split('\t')]) -# # if file[-8:]=='plus.csv' and file[:7]=='sample2': -# # X = open(path+file) -# # for x, i in zip(X, range(20)): -# # DATA[1, i+20] = np.array([float(coord) for coord in x.strip().split('\t')]) -# # if file[-8:]=='plus.csv' and file[:7]=='sample3': -# # X = open(path+file) -# # for x, i in zip(X, range(20)): -# # DATA[2, i+20] = np.array([float(coord) for coord in x.strip().split('\t')]) -# # LABELS = np.zeros(40) -# # LABELS[:20]=LABELS[:20]+1 -# # return DATA, LABELS -# -# # def makeArrayFromTriangular(pseudoRNASeqMatrix): -# # matrixShape = len(pseudoRNASeqMatrix[0,:]) -# # exampleArray = np.array(((matrixShape-1)*matrixShape)/2) -# # arrayIndex = 0 -# # for i in range(matrixShape-1): -# # for j in range(i+1, matrixShape): -# # exampleArray[arrayIndex]=pseudoRNASeqMatrix[i,j] -# # arrayIndex += 1 -# # return exampleArray -# -# -# # def getPseudoRNASeq(dataset): -# # nbGenes = len(dataset["/View2/matrix"][0, :]) -# # pseudoRNASeq = np.zeros((dataset["/datasetlength"][...], ((nbGenes - 1) * nbGenes) / 2), dtype=bool_) -# # for exampleIndex in xrange(dataset["/datasetlength"][...]): -# # arrayIndex = 0 -# # for i in xrange(nbGenes): -# # for j in xrange(nbGenes): -# # if i > j: -# # pseudoRNASeq[exampleIndex, arrayIndex] = -# # dataset["/View2/matrix"][exampleIndex, j] < dataset["/View2/matrix"][exampleIndex, i] -# # arrayIndex += 1 -# # dataset["/View4/matrix"] = pseudoRNASeq -# # dataset["/View4/name"] = "pseudoRNASeq" -# # return dataset -# -# -# # def allSame(array): -# # value = array[0] -# # areAllSame = True -# # for i in array: -# # if i != value: -# # areAllSame = False -# # return areAllSame - - -# def getFakeDBhdf5(features, pathF, name, NB_CLASS, LABELS_NAME, random_state): -# """Was used to generateafake dataset to run tests""" -# NB_VIEW = 4 -# DATASET_LENGTH = 30 -# NB_CLASS = 2 -# VIEW_DIMENSIONS = random_state.random_integers(5, 20, NB_VIEW) -# -# DATA = dict((indx, -# np.array([ -# random_state.normal(0.0, 2, viewDimension) -# for i in np.arange(DATASET_LENGTH)])) -# for indx, viewDimension in enumerate(VIEW_DIMENSIONS)) -# -# CLASS_LABELS = random_state.random_integers(0, NB_CLASS - 1, DATASET_LENGTH) -# datasetFile = h5py.File(pathF + "Fake.hdf5", "w") -# for index, viewData in enumerate(DATA.values()): -# if index == 0: -# viewData = random_state.randint(0, 1, (DATASET_LENGTH, 300)).astype( -# np.uint8) -# # np.zeros(viewData.shape, dtype=bool)+np.ones((viewData.shape[0], viewData.shape[1]/2), dtype=bool) -# viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape) -# 
viewDset[...] = viewData -# viewDset.attrs["name"] = "View" + str(index) -# viewDset.attrs["sparse"] = False -# elif index == 1: -# viewData = sparse.csr_matrix(viewData) -# viewGrp = datasetFile.create_group("View" + str(index)) -# dataDset = viewGrp.create_dataset("data", viewData.data.shape, data=viewData.data) -# indicesDset = viewGrp.create_dataset("indices", viewData.indices.shape, data=viewData.indices) -# indptrDset = viewGrp.create_dataset("indptr", viewData.indptr.shape, data=viewData.indptr) -# viewGrp.attrs["name"] = "View" + str(index) -# viewGrp.attrs["sparse"] = True -# viewGrp.attrs["shape"] = viewData.shape -# else: -# viewDset = datasetFile.create_dataset("View" + str(index), viewData.shape) -# viewDset[...] = viewData -# viewDset.attrs["name"] = "View" + str(index) -# viewDset.attrs["sparse"] = False -# labelsDset = datasetFile.create_dataset("Labels", CLASS_LABELS.shape) -# labelsDset[...] = CLASS_LABELS -# labelsDset.attrs["name"] = "Labels" -# -# metaDataGrp = datasetFile.create_group("Metadata") -# metaDataGrp.attrs["nbView"] = NB_VIEW -# metaDataGrp.attrs["nbClass"] = NB_CLASS -# metaDataGrp.attrs["datasetLength"] = len(CLASS_LABELS) -# labels_dictionary = {0: "No", 1: "Yes"} -# datasetFile.close() -# datasetFile = h5py.File(pathF + "Fake.hdf5", "r") -# return datasetFile, labels_dictionary diff --git a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py deleted file mode 100644 index a13f6cab00c038f0668ea88bc8bf3e1a88469860..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py +++ /dev/null @@ -1,653 +0,0 @@ -import itertools -import sys -import traceback -import yaml -from abc import abstractmethod - -import matplotlib.pyplot as plt -import numpy as np -from scipy.stats import randint, uniform -from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, \ - ParameterGrid, ParameterSampler -from sklearn.base import clone, BaseEstimator - -from .multiclass import MultiClassWrapper -from .organization import secure_file_path -from .base import get_metric -from .. 
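For the sparse view branch above, the pattern is to store a CSR matrix's data, indices and indptr arrays plus its shape, and rebuild the matrix on load. A minimal round-trip sketch (file and group names are illustrative):

```python
import h5py
import numpy as np
from scipy import sparse

view = sparse.csr_matrix(np.eye(4, dtype=np.uint8))

with h5py.File("toy_sparse_view.hdf5", "w") as f:
    grp = f.create_group("View1")
    grp.create_dataset("data", data=view.data)
    grp.create_dataset("indices", data=view.indices)
    grp.create_dataset("indptr", data=view.indptr)
    grp.attrs["sparse"] = True
    grp.attrs["shape"] = view.shape

with h5py.File("toy_sparse_view.hdf5", "r") as f:
    grp = f["View1"]
    rebuilt = sparse.csr_matrix(
        (grp["data"][...], grp["indices"][...], grp["indptr"][...]),
        shape=tuple(grp.attrs["shape"]))

assert (rebuilt != view).nnz == 0
```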
import metrics - - -class HPSearch: - - def get_scoring(self, metric): - if isinstance(metric, dict): - metric_module, metric_kwargs = get_metric(metric) - return metric_module.get_scorer(**metric_kwargs) - else: - return metric - - def fit_multiview(self, X, y, groups=None, **fit_params): - n_splits = self.cv.get_n_splits(self.available_indices, - y[self.available_indices]) - folds = list( - self.cv.split(self.available_indices, y[self.available_indices])) - self.get_candidate_params(X) - base_estimator = clone(self.estimator) - results = {} - self.cv_results_ = dict(("param_" + param_name, []) for param_name in - self.candidate_params[0].keys()) - self.cv_results_["mean_test_score"] = [] - self.cv_results_["params"] = [] - n_failed = 0 - self.tracebacks_params = [] - for candidate_param_idx, candidate_param in enumerate(self.candidate_params): - test_scores = np.zeros(n_splits) + 1000 - try: - for fold_idx, (train_indices, test_indices) in enumerate(folds): - current_estimator = clone(base_estimator) - current_estimator.set_params(**candidate_param) - current_estimator.fit(X, y, - train_indices=self.available_indices[ - train_indices], - view_indices=self.view_indices) - test_prediction = current_estimator.predict( - X, - self.available_indices[test_indices], - view_indices=self.view_indices) - test_score = self.scoring._score_func( - y[self.available_indices[test_indices]], - test_prediction, - **self.scoring._kwargs) - test_scores[fold_idx] = test_score - self.cv_results_['params'].append( - current_estimator.get_params()) - cross_validation_score = np.mean(test_scores) - self.cv_results_["mean_test_score"].append( - cross_validation_score) - results[candidate_param_idx] = cross_validation_score - if cross_validation_score >= max(results.values()): - self.best_params_ = self.candidate_params[candidate_param_idx] - self.best_score_ = cross_validation_score - except: - if self.track_tracebacks: - n_failed += 1 - self.tracebacks.append(traceback.format_exc()) - self.tracebacks_params.append(candidate_param) - else: - raise - if n_failed == self.n_iter: - raise ValueError( - 'No fits were performed. 
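fit_multiview above operates on summit's multiview datasets (hence the train_indices / view_indices arguments), but its control flow is the usual "sample candidates, cross-validate each, keep the best mean score" loop. A plain scikit-learn sketch of that loop, not the multiview code itself:

```python
import numpy as np
from scipy.stats import randint
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterSampler, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=120, random_state=42)
base = DecisionTreeClassifier(random_state=42)
candidates = list(ParameterSampler({"max_depth": randint(1, 10)},
                                   n_iter=5, random_state=42))
cv = StratifiedKFold(n_splits=3)

best_params, best_score = None, -np.inf
for params in candidates:
    scores = []
    for train_idx, test_idx in cv.split(X, y):
        clf = clone(base).set_params(**params)
        clf.fit(X[train_idx], y[train_idx])
        scores.append(accuracy_score(y[test_idx], clf.predict(X[test_idx])))
    if np.mean(scores) > best_score:
        best_params, best_score = params, np.mean(scores)
```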
All HP combination returned errors \n\n' + '\n'.join( - self.tracebacks)) - self.cv_results_["mean_test_score"] = np.array( - self.cv_results_["mean_test_score"]) - if self.refit: - self.best_estimator_ = clone(base_estimator).set_params( - **self.best_params_) - self.best_estimator_.fit(X, y, **fit_params) - self.n_splits_ = n_splits - return self - - @abstractmethod - def get_candidate_params(self, X): # pragma: no cover - raise NotImplementedError - - def get_best_params(self): - best_params = self.best_params_ - if "random_state" in best_params: - best_params.pop("random_state") - return best_params - - def gen_report(self, output_file_name): - scores_array = self.cv_results_['mean_test_score'] - sorted_indices = np.argsort(-scores_array) - tested_params = [self.cv_results_["params"][score_index] - for score_index in sorted_indices] - scores_array = scores_array[sorted_indices] - output_string = "" - for parameters, score in zip(tested_params, scores_array): - formatted_params = format_params(parameters) - output_string += "\n{}\n\t\t{}".format(yaml.dump(formatted_params), score) - if self.tracebacks: - output_string += "Failed : \n\n\n" - for traceback, params in zip(self.tracebacks, self.tracebacks_params): - output_string+= '{}\n\n{}\n'.format(params, traceback) - secure_file_path(output_file_name + "hps_report.txt") - with open(output_file_name + "hps_report.txt", "w") as output_file: - output_file.write(output_string) - - -class Random(RandomizedSearchCV, HPSearch): - - def __init__(self, estimator, param_distributions=None, n_iter=10, - refit=False, n_jobs=1, scoring=None, cv=None, - random_state=None, learning_indices=None, view_indices=None, - framework="monoview", - equivalent_draws=True, track_tracebacks=True): - if param_distributions is None: - param_distributions = self.get_param_distribs(estimator) - scoring = HPSearch.get_scoring(self, scoring) - RandomizedSearchCV.__init__(self, estimator, n_iter=n_iter, - param_distributions=param_distributions, - refit=refit, n_jobs=n_jobs, scoring=scoring, - cv=cv, random_state=random_state) - self.framework = framework - self.available_indices = learning_indices - self.view_indices = view_indices - self.equivalent_draws = equivalent_draws - self.track_tracebacks = track_tracebacks - self.tracebacks=[] - - def get_param_distribs(self, estimator): - if isinstance(estimator, MultiClassWrapper): - return estimator.estimator.gen_distribs() - else: - return estimator.gen_distribs() - - def fit(self, X, y=None, groups=None, **fit_params): # pragma: no cover - if self.framework == "monoview": - return RandomizedSearchCV.fit(self, X, y=y, groups=groups, - **fit_params) - - elif self.framework == "multiview": - return HPSearch.fit_multiview(self, X, y=y, groups=groups, - **fit_params) - - def get_candidate_params(self, X): - if self.equivalent_draws: - self.n_iter = self.n_iter * X.nb_view - self.candidate_params = list( - ParameterSampler(self.param_distributions, self.n_iter, - random_state=self.random_state)) - - # def fit_multiview(self, X, y=None, groups=None, track_tracebacks=True, - # **fit_params): - # n_splits = self.cv.get_n_splits(self.available_indices, - # y[self.available_indices]) - - - - -class Grid(GridSearchCV, HPSearch): - - def __init__(self, estimator, param_grid={}, refit=False, n_jobs=1, scoring=None, cv=None, - learning_indices=None, view_indices=None, framework="monoview", - random_state=None, track_tracebacks=True): - scoring = HPSearch.get_scoring(self, scoring) - GridSearchCV.__init__(self, estimator, param_grid, 
scoring=scoring, - n_jobs=n_jobs, iid='deprecated', refit=refit, - cv=cv) - self.framework = framework - self.available_indices = learning_indices - self.view_indices = view_indices - self.track_tracebacks = track_tracebacks - self.tracebacks = [] - - def fit(self, X, y=None, groups=None, **fit_params): - if self.framework == "monoview": - return GridSearchCV.fit(self, X, y=y, groups=groups, - **fit_params) - elif self.framework == "multiview": - return HPSearch.fit_multiview(self, X, y=y, groups=groups, - **fit_params) - - def get_candidate_params(self, X): - self.candidate_params = list(ParameterGrid(self.param_grid)) - self.n_iter = len(self.candidate_params) - - -# class ParameterSamplerGrid: -# -# def __init__(self, param_distributions, n_iter): -# from math import floor -# n_points_per_param = int(n_iter **(1/len(param_distributions))) -# selected_params = dict((param_name, []) -# for param_name in param_distributions.keys()) -# for param_name, distribution in param_distributions.items(): -# if isinstance(distribution, list): -# if len(distribution)<n_points_per_param: -# selected_params[param_name] = distribution -# else: -# index_step = floor(len(distribution)/n_points_per_param-2) -# selected_params[param_name] = distribution[0]+[distribution[index*index_step+1] -# for index -# in range(n_points_per_param)] - - - - -# -# def hps_search(): -# pass -# -# def grid_search(X, y, framework, random_state, output_file_name, -# classifier_module, -# classifier_name, folds=4, nb_cores=1, -# metric=["accuracy_score", None], -# n_iter=30, classifier_kwargs={}, learning_indices=None, -# view_indices=None, -# equivalent_draws=True, grid_search_config=None): -# """Used to perfom gridsearch on the classifiers""" -# pass - - - -# class RS(HPSSearch): -# -# def __init__(self, X, y, framework, random_state, output_file_name, -# classifier_module, -# classifier_name, folds=4, nb_cores=1, -# metric=["accuracy_score", None], -# n_iter=30, classifier_kwargs={}, learning_indices=None, -# view_indices=None, -# equivalent_draws=True): -# HPSSearch.__init__() - - - -# def randomized_search(X, y, framework, random_state, output_file_name, -# classifier_module, -# classifier_name, folds=4, nb_cores=1, -# metric=["accuracy_score", None], -# n_iter=30, classifier_kwargs={}, learning_indices=None, -# view_indices=None, -# equivalent_draws=True): -# estimator = getattr(classifier_module, classifier_name)( -# random_state=random_state, -# **classifier_kwargs) -# params_dict = estimator.gen_distribs() -# estimator = get_mc_estim(estimator, random_state, -# multiview=(framework == "multiview"), -# y=y) -# if params_dict: -# metric_module, metric_kwargs = get_metric(metric) -# scorer = metric_module.get_scorer(**metric_kwargs) -# # nb_possible_combinations = compute_possible_combinations(params_dict) -# # n_iter_real = min(n_iter, nb_possible_combinations) -# -# random_search = MultiviewCompatibleRandomizedSearchCV(estimator, -# n_iter=n_iter, -# param_distributions=params_dict, -# refit=True, -# n_jobs=nb_cores, -# scoring=scorer, -# cv=folds, -# random_state=random_state, -# learning_indices=learning_indices, -# view_indices=view_indices, -# framework=framework, -# equivalent_draws=equivalent_draws) -# random_search.fit(X, y) -# return random_search.transform_results() -# else: -# best_estimator = estimator -# best_params = {} -# scores_array = {} -# params = {} -# test_folds_preds = np.zeros(10)#get_test_folds_preds(X, y, folds, best_estimator, -# # framework, learning_indices) -# return best_params, scores_array, 
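The difference between the Grid and Random searchers above comes down to how candidates are enumerated: ParameterGrid expands every combination, while ParameterSampler draws n_iter of them (and equivalent_draws scales that budget by the number of views). For illustration:

```python
from scipy.stats import randint
from sklearn.model_selection import ParameterGrid, ParameterSampler

grid_candidates = list(ParameterGrid({"max_depth": [1, 3, 5],
                                      "criterion": ["gini", "entropy"]}))
random_candidates = list(ParameterSampler({"max_depth": randint(1, 10),
                                           "criterion": ["gini", "entropy"]},
                                          n_iter=4, random_state=0))
assert len(grid_candidates) == 6
assert len(random_candidates) == 4
```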
params - - - - - - - - - -# -# def spear_mint(dataset, classifier_name, views_indices=None, k_folds=None, -# n_iter=1, -# **kwargs): -# """Used to perform spearmint on the classifiers to optimize hyper parameters, -# longer than randomsearch (can't be parallelized)""" -# pass -# -# -# def gen_heat_maps(params, scores_array, output_file_name): -# """Used to generate a heat map for each doublet of hyperparms -# optimized on the previous function""" -# nb_params = len(params) -# if nb_params > 2: -# combinations = itertools.combinations(range(nb_params), 2) -# elif nb_params == 2: -# combinations = [(0, 1)] -# else: -# combinations = [()] -# for combination in combinations: -# if combination: -# param_name1, param_array1 = params[combination[0]] -# param_name2, param_array2 = params[combination[1]] -# else: -# param_name1, param_array1 = params[0] -# param_name2, param_array2 = ("Control", np.array([0])) -# -# param_array1_set = np.sort(np.array(list(set(param_array1)))) -# param_array2_set = np.sort(np.array(list(set(param_array2)))) -# -# scores_matrix = np.zeros( -# (len(param_array2_set), len(param_array1_set))) - 0.1 -# for param1, param2, score in zip(param_array1, param_array2, -# scores_array): -# param1_index, = np.where(param_array1_set == param1) -# param2_index, = np.where(param_array2_set == param2) -# scores_matrix[int(param2_index), int(param1_index)] = score -# -# plt.figure(figsize=(8, 6)) -# plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95) -# plt.imshow(scores_matrix, interpolation='nearest', cmap=plt.cm.hot, -# ) -# plt.xlabel(param_name1) -# plt.ylabel(param_name2) -# plt.colorbar() -# plt.xticks(np.arange(len(param_array1_set)), param_array1_set) -# plt.yticks(np.arange(len(param_array2_set)), param_array2_set, -# rotation=45) -# plt.title('Validation metric') -# plt.savefig( -# output_file_name + "heat_map-" + param_name1 + "-" + param_name2 + ".png", -# transparent=True) -# plt.close() -# - - - -class CustomRandint: - """Used as a distribution returning a integer between low and high-1. - It can be used with a multiplier agrument to be able to perform more complex generation - for example 10 e -(randint)""" - - def __init__(self, low=0, high=0, multiplier=""): - self.randint = randint(low, high) - self.low=low - self.high=high - self.multiplier = multiplier - - def rvs(self, random_state=None): - randinteger = self.randint.rvs(random_state=random_state) - if self.multiplier == "e-": - return 10 ** -randinteger - else: - return randinteger - - def get_nb_possibilities(self): - if self.multiplier == "e-": - return abs(10 ** -self.low - 10 ** -self.high) - else: - return self.high - self.low - - -class CustomUniform: - """Used as a distribution returning a float between loc and loc + scale.. 
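CustomRandint above draws an integer in [low, high - 1] and, with multiplier="e-", maps the draw n to 10 ** -n, which gives log-spaced values for hyper-parameters such as regularization strengths. A self-contained sketch of the same behaviour using scipy.stats.randint directly:

```python
import numpy as np
from scipy.stats import randint

rs = np.random.RandomState(42)

# Plain draw: an integer in [low, high - 1], as CustomRandint(low=1, high=10) returns.
plain = randint(1, 10).rvs(random_state=rs)
assert 1 <= plain <= 9

# With multiplier="e-" the draw n becomes 10 ** -n, a log-spaced value.
exponent = randint(1, 5).rvs(random_state=rs)
log_scale = 10.0 ** -exponent   # one of ~1e-1, 1e-2, 1e-3, 1e-4
```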
- It can be used with a multiplier agrument to be able to perform more complex generation - for example 10 e -(float)""" - - def __init__(self, loc=0, state=1, multiplier=""): - self.uniform = uniform(loc, state) - self.multiplier = multiplier - - def rvs(self, random_state=None): - unif = self.uniform.rvs(random_state=random_state) - if self.multiplier == 'e-': - return 10 ** -unif - else: - return unif - - -def format_params(params, pref=""): - if isinstance(params, dict): - dictionary = {} - for key, value in params.items(): - if isinstance(value, np.random.RandomState): - pass - elif isinstance(value, BaseEstimator): - dictionary[key] = value.__class__.__name__ - for second_key, second_value in format_params(value.get_params()).items(): - dictionary[str(key)+"__"+second_key] = second_value - else: - dictionary[str(key)] = format_params(value) - return dictionary - elif isinstance(params, np.ndarray): - return [format_params(param) for param in params] - elif isinstance(params, np.float64): - return float(params) - elif isinstance(params, np.int64): - return int(params) - elif isinstance(params, list): - return [format_params(param) for param in params] - elif isinstance(params, np.str_): - return str(params) - else: - return params - - -# def randomized_search_(dataset_var, labels, classifier_package, classifier_name, -# metrics_list, learning_indices, k_folds, random_state, -# views_indices=None, n_iter=1, -# nb_cores=1, **classification_kargs): -# """Used to perform a random search on the classifiers to optimize hyper parameters""" -# if views_indices is None: -# views_indices = range(dataset_var.get("Metadata").attrs["nbView"]) -# metric = metrics_list[0] -# metric_module = getattr(metrics, metric[0]) -# if metric[1] is not None: -# metric_kargs = dict((index, metricConfig) for index, metricConfig in -# enumerate(metric[1])) -# else: -# metric_kargs = {} -# classifier_module = getattr(classifier_package, classifier_name + "Module") -# classifier_class = getattr(classifier_module, classifier_name + "Class") -# if classifier_name != "Mumbo": -# params_sets = classifier_module.gen_params_sets(classification_kargs, -# random_state, n_iter=n_iter) -# if metric_module.getConfig()[-14] == "h": -# base_score = -1000.0 -# is_better = "higher" -# else: -# base_score = 1000.0 -# is_better = "lower" -# best_settings = None -# kk_folds = k_folds.split(learning_indices, labels[learning_indices]) -# for params_set in params_sets: -# scores = [] -# for trainIndices, testIndices in kk_folds: -# classifier = classifier_class(random_state, nb_scores=nb_cores, -# **classification_kargs) -# classifier.setParams(params_set) -# classifier.fit_hdf5(dataset_var, labels, -# train_indices=learning_indices[trainIndices], -# views_indices=views_indices) -# test_labels = classifier.predict_hdf5(dataset_var, -# used_indices=learning_indices[testIndices], -# views_indices=views_indices) -# test_score = metric_module.score( -# labels[learning_indices[testIndices]], test_labels) -# scores.append(test_score) -# cross_val_score = np.mean(np.array(scores)) -# -# if is_better == "higher" and cross_val_score > base_score: -# base_score = cross_val_score -# best_settings = params_set -# elif is_better == "lower" and cross_val_score < base_score: -# base_score = cross_val_score -# best_settings = params_set -# classifier = classifier_class(random_state, nb_cores=nb_cores, -# **classification_kargs) -# classifier.setParams(best_settings) -# -# # TODO : This must be corrected -# else: -# best_configs, _ = 
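CustomUniform above wraps scipy.stats.uniform(loc, scale), which samples in [loc, loc + scale] (its "state" argument is scipy's scale), and the same "e-" multiplier turns a draw x into 10 ** -x, i.e. a continuous log-uniform value. For illustration:

```python
import numpy as np
from scipy.stats import uniform

rs = np.random.RandomState(0)

draw = uniform(0, 3).rvs(random_state=rs)             # uniform over [0, 3]
assert 0 <= draw <= 3

log_draw = 10 ** -uniform(0, 3).rvs(random_state=rs)  # log-uniform over [1e-3, 1]
```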
classifier_module.grid_search_hdf5(dataset_var, labels, -# views_indices, -# classification_kargs, -# learning_indices, -# random_state, -# metric=metric, -# nI_iter=n_iter) -# classification_kargs["classifiersConfigs"] = best_configs -# classifier = classifier_class(random_state, nb_cores=nb_cores, -# **classification_kargs) -# -# return classifier - -# -# def compute_possible_combinations(params_dict): -# n_possibs = np.ones(len(params_dict)) * np.inf -# for value_index, value in enumerate(params_dict.values()): -# if type(value) == list: -# n_possibs[value_index] = len(value) -# elif isinstance(value, CustomRandint): -# n_possibs[value_index] = value.get_nb_possibilities() -# return np.prod(n_possibs) - - -# def get_test_folds_preds(X, y, cv, estimator, framework, -# available_indices=None): -# test_folds_prediction = [] -# if framework == "monoview": -# folds = cv.split(np.arange(len(y)), y) -# if framework == "multiview": -# folds = cv.split(available_indices, y[available_indices]) -# fold_lengths = np.zeros(cv.n_splits, dtype=int) -# for fold_idx, (train_indices, test_indices) in enumerate(folds): -# fold_lengths[fold_idx] = len(test_indices) -# if framework == "monoview": -# estimator.fit(X[train_indices], y[train_indices]) -# test_folds_prediction.append(estimator.predict(X[train_indices])) -# if framework == "multiview": -# estimator.fit(X, y, available_indices[train_indices]) -# test_folds_prediction.append( -# estimator.predict(X, available_indices[test_indices])) -# min_fold_length = fold_lengths.min() -# test_folds_prediction = np.array( -# [test_fold_prediction[:min_fold_length] for test_fold_prediction in -# test_folds_prediction]) -# return test_folds_prediction - - -# nohup python ~/dev/git/spearmint/spearmint/main.py . & - -# import json -# import numpy as np -# import math -# -# from os import system -# from os.path import join -# -# -# def run_kover(dataset, split, model_type, p, max_rules, output_dir): -# outdir = join(output_dir, "%s_%f" % (model_type, p)) -# kover_command = "kover learn " \ -# "--dataset '%s' " \ -# "--split %s " \ -# "--model-type %s " \ -# "--p %f " \ -# "--max-rules %d " \ -# "--max-equiv-rules 10000 " \ -# "--hp-choice cv " \ -# "--random-seed 0 " \ -# "--output-dir '%s' " \ -# "--n-cpu 1 " \ -# "-v" % (dataset, -# split, -# model_type, -# p, -# max_rules, -# outdir) -# -# system(kover_command) -# -# return json.load(open(join(outdir, "results.json")))["cv"]["best_hp"]["score"] -# -# -# def main(job_id, params): -# print params -# -# max_rules = params["MAX_RULES"][0] -# -# species = params["SPECIES"][0] -# antibiotic = params["ANTIBIOTIC"][0] -# split = params["SPLIT"][0] -# -# model_type = params["model_type"][0] -# -# # LS31 -# if species == "saureus": -# dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/data/earle_2016/saureus/kover_datasets/%s.kover" % antibiotic -# else: -# dataset_path = "/home/droale01/droale01-ls31/projects/genome_scm/genome_scm_paper/data/%s/%s.kover" % (species, antibiotic) -# -# output_path = "/home/droale01/droale01-ls31/projects/genome_scm/manifold_scm/spearmint/vanilla_scm/%s/%s" % (species, antibiotic) -# -# # MacBook -# #dataset_path = "/Volumes/Einstein 1/kover_phylo/datasets/%s/%s.kover" % (species, antibiotic) -# #output_path = "/Volumes/Einstein 1/manifold_scm/version2/%s_spearmint" % antibiotic -# -# return run_kover(dataset=dataset_path, -# split=split, -# model_type=model_type, -# p=params["p"][0], -# max_rules=max_rules, -# output_dir=output_path) -# killall mongod && sleep 1 && rm -r 
database/* && rm mongo.log* -# mongod --fork --logpath mongo.log --dbpath database -# -# { -# "language" : "PYTHON", -# "experiment-name" : "vanilla_scm_cdiff_azithromycin", -# "polling-time" : 1, -# "resources" : { -# "my-machine" : { -# "scheduler" : "local", -# "max-concurrent" : 5, -# "max-finished-jobs" : 100 -# } -# }, -# "tasks": { -# "resistance" : { -# "type" : "OBJECTIVE", -# "likelihood" : "NOISELESS", -# "main-file" : "spearmint_wrapper", -# "resources" : ["my-machine"] -# } -# }, -# "variables": { -# -# "MAX_RULES" : { -# "type" : "ENUM", -# "size" : 1, -# "options": [10] -# }, -# -# -# "SPECIES" : { -# "type" : "ENUM", -# "size" : 1, -# "options": ["cdiff"] -# }, -# "ANTIBIOTIC" : { -# "type" : "ENUM", -# "size" : 1, -# "options": ["azithromycin"] -# }, -# "SPLIT" : { -# "type" : "ENUM", -# "size" : 1, -# "options": ["split_seed_2"] -# }, -# -# -# "model_type" : { -# "type" : "ENUM", -# "size" : 1, -# "options": ["conjunction", "disjunction"] -# }, -# "p" : { -# "type" : "FLOAT", -# "size" : 1, -# "min" : 0.01, -# "max" : 100 -# } -# } -# } diff --git a/multiview_platform/mono_multi_view_classifiers/utils/make_file_config.py b/multiview_platform/mono_multi_view_classifiers/utils/make_file_config.py deleted file mode 100644 index 5810e37bdddd002a96ff73d97d37d8f85245fbe9..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/make_file_config.py +++ /dev/null @@ -1,39 +0,0 @@ -import importlib -import inspect - -class ConfigurationMaker(): - """ - Find the name of the classifier from the dict classier to report - - - - """ - _path_classifier_mono = 'multiview_platform/mono_multi_view_classifier/monoview_classifiers' - _path_classifier_multi = 'multiview_platform/mono_multi_view_classifier/multiview_classifier' - - def __init__(self, classifier_dict=None): - if classifier_dict is None: - classifier_dict = {"0": ['mono', 'Adaboost', - 'multiview_platform.mono_multi_view_classifiers.monoview_classifiers.adaboost']} - names = [] - for key, val in classifier_dict.items(): - mymodule = importlib.import_module(val[2]) - names.append(self._get_module_name(mymodule)) - monInstance = getattr(mymodule, val[1]) - - def _get_module_name(self, mymodule): - for name in dir(mymodule): - att = getattr(mymodule, name) - try: - getattr(att, "__module__") - if att.__module__.startswith(mymodule.__name__): - if inspect.isclass(att): - if att == val[1]: - return name - except Exception: - return None - return None - - -if __name__ == '__main__': - ConfigurationMaker() diff --git a/multiview_platform/mono_multi_view_classifiers/utils/multiclass.py b/multiview_platform/mono_multi_view_classifiers/utils/multiclass.py deleted file mode 100644 index 0b7210a76bf0bf98596b0c5309ca69b746fb5040..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/multiclass.py +++ /dev/null @@ -1,323 +0,0 @@ -import array - -import numpy as np -import scipy.sparse as sp -from sklearn.base import clone, is_classifier, is_regressor -from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier -from sklearn.multiclass import _ovr_decision_function -from sklearn.preprocessing import LabelBinarizer - -from .dataset import get_examples_views_indices - - -def get_mc_estim(estimator, random_state, y=None, multiview=False, - multiclass=False): - r"""Used to get a multiclass-compatible estimator if the one in param does not natively support multiclass. 
- If perdict_proba is available in the asked estimator, a One Versus Rest wrapper is returned, - else, a One Versus One wrapper is returned. - - To be able to deal with multiview algorithm, multiview wrappers are implemented separately. - - Parameters - ---------- - estimator : sklearn-like estimator - Asked estimator - y : numpy.array - The labels of the problem - random_state : numpy.random.RandomState object - The random state, used to generate a fake multiclass problem - multiview : bool - If True, mutliview-compatible wrappers are returned. - - Returns - ------- - estimator : sklearn-like estimator - Either the aksed estimator, or a multiclass-compatible wrapper over the asked estimator - """ - if (y is not None and np.unique(y).shape[0] > 2) or multiclass: - if not clone(estimator).accepts_multi_class(random_state): - if hasattr(estimator, "predict_proba"): - if multiview: - estimator = MultiviewOVRWrapper(estimator) - else: - estimator = OVRWrapper(estimator) - else: - if multiview: - estimator = MultiviewOVOWrapper(estimator) - else: - estimator = OVOWrapper(estimator) - return estimator - - -class MultiClassWrapper: - - # TODO : Has an effect on the init of the sub-classes. - # @abstractmethod - # def __init__(self, estimator, **params): - # self.estimator = estimator - - def set_params(self, **params): - r""" - This function is useful in order for the OV_Wrappers to be transparent - in terms of parameters. - If we remove it the parameters have to be specified as estimator__param. - Witch is not relevant for the platform - - """ - self.estimator.set_params(**params) - return self - - def get_config(self): - return "multiclass_adaptation : "+self.__class__.__name__+ ", " +self.estimator.get_config() - - def format_params(self, params, deep=True): - if hasattr(self, 'estimators_'): - estim_params = self.estimators_[0].get_params(deep=deep) - for key, value in params.items(): - if key.startswith("estimator__"): - estim_param_key = '__'.join(key.split('__')[1:]) - params[key] = estim_params[estim_param_key] - params.pop("estimator") - return params - - - - def get_interpretation(self, directory, base_file_name, y_test=None): - # TODO : Multiclass interpretation - return "Multiclass wrapper is not interpretable yet" - - -class MonoviewWrapper(MultiClassWrapper): - pass - - -class OVRWrapper(MonoviewWrapper, OneVsRestClassifier): - - def get_params(self, deep=True): - return self.format_params( - OneVsRestClassifier.get_params(self, deep=deep), deep=deep) - - -class OVOWrapper(MonoviewWrapper, OneVsOneClassifier): - def decision_function(self, X): - # check_is_fitted(self) - - indices = self.pairwise_indices_ - if indices is None: - Xs = [X] * len(self.estimators_) - else: - Xs = [X[:, idx] for idx in indices] - - predictions = np.vstack([est.predict(Xi) - for est, Xi in zip(self.estimators_, Xs)]).T - confidences = np.ones(predictions.shape) - Y = _ovr_decision_function(predictions, - confidences, len(self.classes_)) - if self.n_classes_ == 2: - return Y[:, 1] - return Y - - def get_params(self, deep=True): - return self.format_params( - OneVsOneClassifier.get_params(self, deep=deep), deep=deep) - - -# The following code is a mutliview adaptation of sklearns multiclass package - -def _multiview_fit_binary(estimator, X, y, train_indices, - view_indices, classes=None, ): - # TODO : Verifications des sklearn - estimator = clone(estimator) - estimator.fit(X, y, train_indices=train_indices, - view_indices=view_indices) - return estimator - - -def _multiview_predict_binary(estimator, X, 
example_indices, view_indices): - if is_regressor(estimator): - return estimator.predict(X, example_indices=example_indices, - view_indices=view_indices) - try: - score = np.ravel(estimator.decision_function(X)) - except (AttributeError, NotImplementedError): - # probabilities of the positive class - score = estimator.predict_proba(X, example_indices=example_indices, - view_indices=view_indices)[:, 1] - return score - - -class MultiviewWrapper(MultiClassWrapper): - - def __init__(self, estimator=None, **args): - super(MultiviewWrapper, self).__init__(estimator=estimator, **args) - self.short_name = estimator.short_name - - -class MultiviewOVRWrapper(MultiviewWrapper, OneVsRestClassifier): - - def fit(self, X, y, train_indices=None, view_indices=None): - self.label_binarizer_ = LabelBinarizer(sparse_output=True) - Y = self.label_binarizer_.fit_transform(y) - Y = Y.tocsc() - self.classes_ = self.label_binarizer_.classes_ - columns = (col.toarray().ravel() for col in Y.T) - # In cases where individual estimators are very fast to train setting - # n_jobs > 1 in can results in slower performance due to the overhead - # of spawning threads. See joblib issue #112. - self.estimators_ = [_multiview_fit_binary( - self.estimator, X, column, classes=[ - "not %s" % self.label_binarizer_.classes_[i], - self.label_binarizer_.classes_[i]], train_indices=train_indices, - view_indices=view_indices) - for i, column in - enumerate(columns)] - return self - - def predict(self, X, example_indices=None, view_indices=None): - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - n_samples = len(example_indices) - if self.label_binarizer_.y_type_ == "multiclass": - maxima = np.empty(n_samples, dtype=float) - maxima.fill(-np.inf) - argmaxima = np.zeros(n_samples, dtype=int) - for i, e in enumerate(self.estimators_): - pred = _multiview_predict_binary(e, X, example_indices, - view_indices) - np.maximum(maxima, pred, out=maxima) - argmaxima[maxima == pred] = i - return self.classes_[argmaxima] - else: # pragma: no cover - if (hasattr(self.estimators_[0], "decision_function") and - is_classifier(self.estimators_[0])): - thresh = 0 - else: - thresh = .5 - indices = array.array('i') - indptr = array.array('i', [0]) - for e in self.estimators_: - indices.extend( - np.where(_multiview_predict_binary(e, X, - example_indices, - view_indices) > thresh)[ - 0]) - indptr.append(len(indices)) - - data = np.ones(len(indices), dtype=int) - indicator = sp.csc_matrix((data, indices, indptr), - shape=(n_samples, len(self.estimators_))) - return self.label_binarizer_.inverse_transform(indicator) - - def get_params(self, deep=True): - return self.format_params( - OneVsRestClassifier.get_params(self, deep=deep), deep=deep) - - -def _multiview_fit_ovo_binary(estimator, X, y, i, j, train_indices, - view_indices): - cond = np.logical_or(y == i, y == j) - # y = y[cond] - y_binary = np.empty(y.shape, np.int) - y_binary[y == i] = 0 - y_binary[y == j] = 1 - indcond = np.arange(X.get_nb_examples())[cond] - train_indices = np.intersect1d(train_indices, indcond) - return _multiview_fit_binary(estimator, - X, - y_binary, train_indices, view_indices, - classes=[i, j]), train_indices - - -class MultiviewOVOWrapper(MultiviewWrapper, OneVsOneClassifier): - - def fit(self, X, y, train_indices=None, view_indices=None): - """Fit underlying estimators. - - Parameters - ---------- - X : (sparse) array-like of shape (n_samples, n_features) - Data. - - y : array-like of shape (n_samples,) - Multi-class targets. 
- - Returns - ------- - self - """ - # X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - # check_classification_targets(y) - train_indices, view_indices = get_examples_views_indices(X, - train_indices, - view_indices) - self.classes_ = np.unique(y) - if len(self.classes_) == 1: - raise ValueError("OneVsOneClassifier can not be fit when only one" - " class is present.") - n_classes = self.classes_.shape[0] - estimators_indices = list(zip(*([_multiview_fit_ovo_binary( - self.estimator, X, y, self.classes_[i], self.classes_[j], - train_indices, - view_indices - ) - for i in range(n_classes) for j in range(i + 1, n_classes) - ]))) - - self.estimators_ = estimators_indices[0] - self.pairwise_indices_ = ( - estimators_indices[1] if self._pairwise else None) - - return self - - def predict(self, X, example_indices=None, view_indices=None): - """Estimate the best class label for each sample in X. - - This is implemented as ``argmax(decision_function(X), axis=1)`` which - will return the label of the class with most votes by estimators - predicting the outcome of a decision for each possible class pair. - - Parameters - ---------- - X : (sparse) array-like of shape (n_samples, n_features) - Data. - - Returns - ------- - y : numpy array of shape [n_samples] - Predicted multi-class targets. - """ - example_indices, view_indices = get_examples_views_indices(X, - example_indices, - view_indices) - Y = self.multiview_decision_function(X, example_indices=example_indices, - view_indices=view_indices) - if self.n_classes_ == 2: - return self.classes_[(Y > 0).astype(np.int)] - return self.classes_[Y.argmax(axis=1)] - - def multiview_decision_function(self, X, example_indices, view_indices): # pragma: no cover - # check_is_fitted(self) - - indices = self.pairwise_indices_ - if indices is None: - Xs = [X] * len(self.estimators_) - else: - # TODO Gram matrix compatibility - Xs = [X[:, idx] for idx in indices] - predictions = np.vstack( - [est.predict(Xi, example_indices=example_indices, - view_indices=view_indices) - for est, Xi in zip(self.estimators_, Xs)]).T - confidences = np.ones(predictions.shape) - # confidences = np.vstack([_predict_binary(est, Xi) - # for est, Xi in zip(self.estimators_, Xs)]).T - Y = _ovr_decision_function(predictions, - confidences, len(self.classes_)) - if self.n_classes_ == 2: - return Y[:, 1] - return Y - - def get_params(self, deep=True): - return self.format_params( - OneVsOneClassifier.get_params(self, deep=deep), deep=deep) diff --git a/multiview_platform/mono_multi_view_classifiers/utils/multiview_result_analysis.py b/multiview_platform/mono_multi_view_classifiers/utils/multiview_result_analysis.py deleted file mode 100644 index a980b3befc0bf8cf955db16ad5a9de0b92e578af..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/multiview_result_analysis.py +++ /dev/null @@ -1,54 +0,0 @@ -# from .. 
import metrics -# -# # Author-Info -# __author__ = "Baptiste Bauvin" -# __status__ = "Prototype" # Production, Development, Prototype -# -# -# def print_metric_score(metric_scores, metrics): -# metric_score_string = "\n\n" -# for metric in metrics: -# metric_module = getattr(metrics, metric[0]) -# if metric[1] is not None: -# metric_kwargs = dict( -# (index, metricConfig) for index, metricConfig in -# enumerate(metric[1])) -# else: -# metric_kwargs = {} -# metric_score_string += "\tFor " + metric_module.get_config( -# **metric_kwargs) + " : " -# metric_score_string += "\n\t\t- Score on train : " + str( -# metric_scores[metric[0]][0]) -# metric_score_string += "\n\t\t- Score on test : " + str( -# metric_scores[metric[0]][1]) -# metric_score_string += "\n\n" -# return metric_score_string -# -# -# def get_total_metric_scores(metric, train_labels, test_labels, -# validation_indices, -# learning_indices, labels): -# metric_module = getattr(metrics, metric[0]) -# if metric[1] is not None: -# metric_kwargs = dict((index, metricConfig) for index, metricConfig in -# enumerate(metric[1])) -# else: -# metric_kwargs = {} -# train_score = metric_module.score(labels[learning_indices], train_labels, -# **metric_kwargs) -# test_score = metric_module.score(labels[validation_indices], test_labels, -# **metric_kwargs) -# return [train_score, test_score] -# -# -# def get_metrics_scores(metrics_var, train_labels, test_labels, -# validation_indices, learning_indices, labels): -# metrics_scores = {} -# for metric in metrics_var: -# metrics_scores[metric[0]] = get_total_metric_scores(metric, -# train_labels, -# test_labels, -# validation_indices, -# learning_indices, -# labels) -# return metrics_scores diff --git a/multiview_platform/mono_multi_view_classifiers/utils/organization.py b/multiview_platform/mono_multi_view_classifiers/utils/organization.py deleted file mode 100644 index 1fdc0ecf608350c98bf66ff9fdc4e1be238e5b45..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/organization.py +++ /dev/null @@ -1,11 +0,0 @@ -import os -import errno - - -def secure_file_path(file_name): # pragma: no cover - if not os.path.exists(os.path.dirname(file_name)): - try: - os.makedirs(os.path.dirname(file_name)) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise diff --git a/multiview_platform/mono_multi_view_classifiers/utils/transformations.py b/multiview_platform/mono_multi_view_classifiers/utils/transformations.py deleted file mode 100644 index 17e7b90d3b0d186495893220514524b4e0a648a2..0000000000000000000000000000000000000000 --- a/multiview_platform/mono_multi_view_classifiers/utils/transformations.py +++ /dev/null @@ -1,44 +0,0 @@ -import numpy as np - - -def sign_labels(labels): - """ - Returns a label array with (-1,1) as labels. - If labels was already made of (-1,1), returns labels. - If labels is made of (0,1), returns labels with all - zeros transformed in -1. 
- - Parameters - ---------- - labels - - The original label numpy array - - Returns - ------- - A np.array with labels made of (-1,1) - """ - if 0 in labels: - return np.array([label if label != 0 else -1 for label in labels]) - else: - return labels - - -def unsign_labels(labels): - """ - The inverse function - - Parameters - ---------- - labels - - Returns - ------- - - """ - if len(labels.shape) == 2: - labels = labels.reshape((labels.shape[0],)) - if -1 in labels: - return np.array([label if label != -1 else 0 for label in labels]) - else: - return labels diff --git a/multiview_platform/tests/__init__.py b/multiview_platform/tests/__init__.py deleted file mode 100644 index 194018ae5ef03ba4d863b4e1497acae3b317589a..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from . import test_exec_classif -from .utils import rm_tmp, gen_test_dataset, tmp_path \ No newline at end of file diff --git a/multiview_platform/tests/test_config_hps.yml b/multiview_platform/tests/test_config_hps.yml deleted file mode 100644 index bce80ba6a8ee25002b78e0bbcc333481537f9fa1..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_config_hps.yml +++ /dev/null @@ -1,80 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: False -# The name of each dataset in the directory on which the benchmark should be run -name: "digits_doc" -# A label for the resul directory -label: "example_0" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "../examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "tmp_tests/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# The ratio of test examples/number of train examples -split: 0.25 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 2 -# The number of classes to select in the dataset -nb_class: -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree"] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_early_fusion",] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 1 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization 
process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: "Random" -# The number of iteration in the hyper-parameter optimization process -hps_args: - n_iter: 2 - equivalent_draws: False - -### Configuring the hyper-parameters for the classifiers - -decision_tree: - max_depth: 3 - -weighted_linear_early_fusion: - monoview_classifier_name: "decision_tree" - monoview_classifier_config: - decision_tree: - max_depth: 6 - -weighted_linear_late_fusion: - classifiers_names: "decision_tree" - classifier_configs: - decision_tree: - max_depth: 3 diff --git a/multiview_platform/tests/test_config_iter.yml b/multiview_platform/tests/test_config_iter.yml deleted file mode 100644 index f44b34fe9afaa80a4fa68bfc60554dd4394d88db..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_config_iter.yml +++ /dev/null @@ -1,78 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: False -# The name of each dataset in the directory on which the benchmark should be run -name: "digits_doc" -# A label for the resul directory -label: "example_0" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "../examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "tmp_tests/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# The ratio of test examples/number of train examples -split: 0.25 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 2 -# The number of classes to select in the dataset -nb_class: -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree"] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_early_fusion",] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 2 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: "None" -# The number of iteration in the hyper-parameter optimization process -hps_args: {} - -### Configuring the hyper-parameters for the classifiers - -decision_tree: - max_depth: 3 - -weighted_linear_early_fusion: - monoview_classifier_name: "decision_tree" - 
monoview_classifier_config: - decision_tree: - max_depth: 6 - -weighted_linear_late_fusion: - classifiers_names: "decision_tree" - classifier_configs: - decision_tree: - max_depth: 3 diff --git a/multiview_platform/tests/test_config_simple.yml b/multiview_platform/tests/test_config_simple.yml deleted file mode 100644 index 02b85b5848885d6f501cfa9265c3263b35b3b70a..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_config_simple.yml +++ /dev/null @@ -1,78 +0,0 @@ -# The base configuration of the benchmark - -# Enable logging -log: False -# The name of each dataset in the directory on which the benchmark should be run -name: "digits_doc" -# A label for the resul directory -label: "example_0" -# The type of dataset, currently supported ".hdf5", and ".csv" -file_type: ".hdf5" -# The views to use in the banchmark, an empty value will result in using all the views -views: -# The path to the directory where the datasets are stored, an absolute path is advised -pathf: "../examples/data/" -# The niceness of the processes, useful to lower their priority -nice: 0 -# The random state of the benchmark, useful for reproducibility -random_state: 42 -# The number of parallel computing threads -nb_cores: 1 -# Used to run the benchmark on the full dataset -full: True -# Used to be able to run more than one benchmark per minute -debug: False -# The directory in which the results will be stored, an absolute path is advised -res_dir: "tmp_tests/" -# If an error occurs in a classifier, if track_tracebacks is set to True, the -# benchmark saves the traceback and continues, if it is set to False, it will -# stop the benchmark and raise the error -track_tracebacks: True - -# All the classification-realted configuration options - -# The ratio of test examples/number of train examples -split: 0.25 -# The nubmer of folds in the cross validation process when hyper-paramter optimization is performed -nb_folds: 2 -# The number of classes to select in the dataset -nb_class: -# The name of the classes to select in the dataset -classes: -# The type of algorithms to run during the benchmark (monoview and/or multiview) -type: ["monoview","multiview"] -# The name of the monoview algorithms to run, ["all"] to run all the available classifiers -algos_monoview: ["decision_tree"] -# The names of the multiview algorithms to run, ["all"] to run all the available classifiers -algos_multiview: ["weighted_linear_early_fusion", "weighted_linear_late_fusion",] -# The number of times the benchamrk is repeated with different train/test -# split, to have more statistically significant results -stats_iter: 1 -# The metrics that will be use din the result analysis -metrics: - accuracy_score: {} - f1_score: - average: "micro" -# The metric that will be used in the hyper-parameter optimization process -metric_princ: "accuracy_score" -# The type of hyper-parameter optimization method -hps_type: "None" -# The number of iteration in the hyper-parameter optimization process -hps_args: {} - -### Configuring the hyper-parameters for the classifiers - -decision_tree: - max_depth: 3 - -weighted_linear_early_fusion: - monoview_classifier_name: "decision_tree" - monoview_classifier_config: - decision_tree: - max_depth: 6 - -weighted_linear_late_fusion: - classifiers_names: "decision_tree" - classifier_configs: - decision_tree: - max_depth: 3 diff --git a/multiview_platform/tests/test_database.hdf5 b/multiview_platform/tests/test_database.hdf5 deleted file mode 100644 index 
63206a1219daf9e09d9a91d5a2440ef19a88af82..0000000000000000000000000000000000000000 Binary files a/multiview_platform/tests/test_database.hdf5 and /dev/null differ diff --git a/multiview_platform/tests/test_exec_classif.py b/multiview_platform/tests/test_exec_classif.py deleted file mode 100644 index 71cdc9e8948cbd1a129091c08e8c6fdedde0f486..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_exec_classif.py +++ /dev/null @@ -1,415 +0,0 @@ -import os -import unittest - -import h5py -import numpy as np - -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset - -from multiview_platform.mono_multi_view_classifiers import exec_classif - - -class Test_execute(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - - def test_exec_simple(self): - exec_classif.exec_classif(["--config_path", os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_config_simple.yml")]) - - def test_exec_iter(self): - exec_classif.exec_classif(["--config_path", os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_config_iter.yml")]) - - def test_exec_hps(self): - exec_classif.exec_classif(["--config_path", os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_config_hps.yml")]) - - @classmethod - def tearDown(self): - rm_tmp() - -class Test_gen_single_monoview_arg_dictionary(unittest.TestCase): - - def test_no_config(self): - conf = exec_classif.gen_single_monoview_arg_dictionary("classifier_name1", - {}, "nb_class", - "view_index", - "view_name", - "hps_kwargs") - self.assertEqual(conf, {"classifier_name1": {}, - "view_name": "view_name", - "view_index": "view_index", - "classifier_name": "classifier_name1", - "nb_class": "nb_class", - "hps_kwargs":"hps_kwargs" } ) - -class Test_initBenchmark(unittest.TestCase): - - def test_benchmark_wanted(self): - benchmark_output = exec_classif.init_benchmark(cl_type=["monoview", "multiview"], monoview_algos=["decision_tree"], multiview_algos=["weighted_linear_late_fusion"]) - self.assertEqual(benchmark_output , {'monoview': ['decision_tree'], 'multiview': ['weighted_linear_late_fusion']}) - benchmark_output = exec_classif.init_benchmark( - cl_type=["monoview", "multiview"], monoview_algos=["all"], - multiview_algos=["all"]) - self.assertEqual(benchmark_output, {'monoview': ['adaboost', - 'decision_tree', - 'gradient_boosting', - 'knn', - 'lasso', - 'random_forest', - 'sgd', - 'svm_linear', - 'svm_poly', - 'svm_rbf'], - 'multiview': ['bayesian_inference_fusion', - 'difficulty_fusion', - 'disagree_fusion', - 'double_fault_fusion', - 'entropy_fusion', - 'majority_voting_fusion', - 'svm_jumbo_fusion', - 'weighted_linear_early_fusion', - 'weighted_linear_late_fusion']}) - - -class Test_Functs(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_initKWARGSFunc_no_monoview(self): - benchmark = {"monoview": {}, "multiview": {}} - args = exec_classif.init_kwargs_func({}, benchmark) - self.assertEqual(args, {"monoview": {}, "multiview": {}}) - - def test_init_kwargs(self): - kwargs = exec_classif.init_kwargs({"decision_tree":""},["decision_tree"]) - self.assertEqual(kwargs, {"decision_tree":""}) - kwargs = exec_classif.init_kwargs({"weighted_linear_late_fusion": ""}, - ["weighted_linear_late_fusion"], framework="multiview") - self.assertEqual(kwargs, {"weighted_linear_late_fusion": ""}) - kwargs = exec_classif.init_kwargs({}, ["decision_tree"],) - self.assertEqual(kwargs, {"decision_tree":{}}) - 
self.assertRaises(AttributeError, exec_classif.init_kwargs, {}, ["test"]) - - def test_arange_metrics(self): - metrics = exec_classif.arange_metrics({"accuracy_score":{}}, "accuracy_score") - self.assertEqual(metrics, {"accuracy_score*":{}}) - self.assertRaises(ValueError, exec_classif.arange_metrics, {"test1":{}}, "test") - - def test_banchmark_init(self): - from sklearn.model_selection import StratifiedKFold - folds = StratifiedKFold(n_splits=2) - res, lab_names = exec_classif.benchmark_init(directory=tmp_path, - classification_indices=[np.array([0,1,2,3]), np.array([4])], - labels=test_dataset.get_labels(), - labels_dictionary={"yes":0, "no":1}, - k_folds=folds, - dataset_var=test_dataset) - self.assertEqual(res, []) - self.assertEqual(lab_names, [0, 1]) - - - - -class Test_InitArgumentDictionaries(unittest.TestCase): - @classmethod - def setUpClass(cls): - rm_tmp() - cls.benchmark = {"monoview": ["fake_monoview_classifier"], "multiview": {}} - cls.views_dictionnary = {'test_view_0': 0, 'test_view': 1} - cls.nb_class = 2 - cls.monoview_classifier_name = "fake_monoview_classifier" - cls.monoview_classifier_arg_name = "fake_arg" - cls.monoview_classifier_arg_value = "fake_value_1" - cls.multiview_classifier_name = "fake_multiview_classifier" - cls.multiview_classifier_arg_name = "fake_arg_mv" - cls.multiview_classifier_arg_value = "fake_value_2" - cls.init_kwargs = { - 'monoview':{ - cls.monoview_classifier_name: - {cls.monoview_classifier_arg_name:cls.monoview_classifier_arg_value} - }, - "multiview":{ - cls.multiview_classifier_name:{ - cls.multiview_classifier_arg_name:cls.multiview_classifier_arg_value} - } - } - - def test_init_argument_dictionaries_monoview(self): - arguments = exec_classif.init_argument_dictionaries(self.benchmark, - self.views_dictionnary, - self.nb_class, - self.init_kwargs, - "None", {}) - expected_output = [{ - self.monoview_classifier_name: { - self.monoview_classifier_arg_name:self.monoview_classifier_arg_value}, - "view_name": "test_view_0", - 'hps_kwargs': {}, - "classifier_name": self.monoview_classifier_name, - "nb_class": self.nb_class, - "view_index": 0}, - {self.monoview_classifier_name: { - self.monoview_classifier_arg_name: self.monoview_classifier_arg_value}, - "view_name": "test_view", - 'hps_kwargs': {}, - "classifier_name": self.monoview_classifier_name, - "nb_class": self.nb_class, - "view_index": 1}, - ] - self.assertEqual(arguments["monoview"], expected_output) - - def test_init_argument_dictionaries_multiview(self): - self.benchmark["multiview"] = ["fake_multiview_classifier"] - self.benchmark["monoview"] = {} - arguments = exec_classif.init_argument_dictionaries(self.benchmark, - self.views_dictionnary, - self.nb_class, - self.init_kwargs, - "None", {}) - expected_output = [{ - "classifier_name": self.multiview_classifier_name, - "view_indices": [0,1], - "view_names": ["test_view_0", "test_view"], - "nb_class": self.nb_class, - 'hps_kwargs': {}, - "labels_names":None, - self.multiview_classifier_name: { - self.multiview_classifier_arg_name: - self.multiview_classifier_arg_value}, - },] - self.assertEqual(arguments["multiview"][0], expected_output[0]) - - - def test_init_argument_dictionaries_multiview_complex(self): - self.multiview_classifier_arg_value = {"fake_value_2":"plif", "plaf":"plouf"} - self.init_kwargs = { - 'monoview': { - self.monoview_classifier_name: - { - self.monoview_classifier_arg_name: self.monoview_classifier_arg_value} - }, - "multiview": { - self.multiview_classifier_name: { - self.multiview_classifier_arg_name: 
self.multiview_classifier_arg_value} - } - } - self.benchmark["multiview"] = ["fake_multiview_classifier"] - self.benchmark["monoview"] = {} - arguments = exec_classif.init_argument_dictionaries(self.benchmark, - self.views_dictionnary, - self.nb_class, - self.init_kwargs, - "None", {}) - expected_output = [{ - "classifier_name": self.multiview_classifier_name, - "view_indices": [0,1], - 'hps_kwargs': {}, - "view_names": ["test_view_0", "test_view"], - "nb_class": self.nb_class, - "labels_names":None, - self.multiview_classifier_name: { - self.multiview_classifier_arg_name: - self.multiview_classifier_arg_value}, - }] - self.assertEqual(arguments["multiview"][0], expected_output[0]) - - -def fakeBenchmarkExec(core_index=-1, a=7, args=1): - return [core_index, a] - - -def fakeBenchmarkExec_mutlicore(nb_cores=-1, a=6, args=1): - return [nb_cores, a] - - -def fakeBenchmarkExec_monocore(dataset_var=1, a=4, args=1, track_tracebacks=False): - return [a] - - -def fakegetResults(results, stats_iter, - benchmark_arguments_dictionaries, metrics, directory, - example_ids, labels): - return 3 - - -def fakeDelete(a, b, c): - return 9 - -def fake_analyze(a, b, c, d, example_ids=None, labels=None): - pass - -class Test_execBenchmark(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - - os.mkdir(tmp_path) - cls.Dataset = test_dataset - cls.argument_dictionaries = [{"a": 4, "args": {}}] - cls.args = { - "Base":{"name": "chicken_is_heaven", "type": "type", "pathf": "pathF"}, - "Classification":{"hps_iter": 1}} - - def test_simple(cls): - res = exec_classif.exec_benchmark(nb_cores=1, - stats_iter=2, - benchmark_arguments_dictionaries=cls.argument_dictionaries, - directory="", - metrics=[[[1, 2], [3, 4, 5]]], - dataset_var=cls.Dataset, - track_tracebacks=6, - # exec_one_benchmark=fakeBenchmarkExec, - # exec_one_benchmark_multicore=fakeBenchmarkExec_mutlicore, - exec_one_benchmark_mono_core=fakeBenchmarkExec_monocore, - analyze=fakegetResults, - delete=fakeDelete, - analyze_iterations=fake_analyze) - cls.assertEqual(res, 3) - - def test_multiclass_no_iter(cls): - cls.argument_dictionaries = [{"a": 10, "args": cls.args}, - {"a": 4, "args": cls.args}] - res = exec_classif.exec_benchmark(nb_cores=1, - stats_iter=1, - benchmark_arguments_dictionaries=cls.argument_dictionaries, - directory="", - metrics=[[[1, 2], [3, 4, 5]]], - dataset_var=cls.Dataset, - track_tracebacks=6, - # exec_one_benchmark=fakeBenchmarkExec, - # exec_one_benchmark_multicore=fakeBenchmarkExec_mutlicore, - exec_one_benchmark_mono_core=fakeBenchmarkExec_monocore, - analyze=fakegetResults, - delete=fakeDelete, - analyze_iterations=fake_analyze) - cls.assertEqual(res, 3) - - def test_multiclass_and_iter(cls): - cls.argument_dictionaries = [{"a": 10, "args": cls.args}, - {"a": 4, "args": cls.args}, - {"a": 55, "args": cls.args}, - {"a": 24, "args": cls.args}] - res = exec_classif.exec_benchmark(nb_cores=1, - stats_iter=2, - benchmark_arguments_dictionaries=cls.argument_dictionaries, - directory="", - metrics=[[[1, 2], [3, 4, 5]]], - dataset_var=cls.Dataset, - track_tracebacks=6, - # exec_one_benchmark=fakeBenchmarkExec, - # exec_one_benchmark_multicore=fakeBenchmarkExec_mutlicore, - exec_one_benchmark_mono_core=fakeBenchmarkExec_monocore, - analyze=fakegetResults, - delete=fakeDelete, - analyze_iterations=fake_analyze) - cls.assertEqual(res, 3) - - def test_no_iter_biclass_multicore(cls): - res = exec_classif.exec_benchmark(nb_cores=1, - stats_iter=1, - benchmark_arguments_dictionaries=cls.argument_dictionaries, - 
directory="", - metrics=[[[1, 2], [3, 4, 5]]], - dataset_var=cls.Dataset, - track_tracebacks=6, - # exec_one_benchmark=fakeBenchmarkExec, - # exec_one_benchmark_multicore=fakeBenchmarkExec_mutlicore, - exec_one_benchmark_mono_core=fakeBenchmarkExec_monocore, - analyze=fakegetResults, - delete=fakeDelete, - analyze_iterations=fake_analyze) - cls.assertEqual(res, 3) - - @classmethod - def tearDownClass(cls): - rm_tmp() - -def fakeExecMono(directory, name, labels_names, classification_indices, k_folds, - coreIndex, type, pathF, random_state, labels, - hyper_param_search="try", metrics="try", n_iter=1, **arguments): - return ["Mono", arguments] - - -def fakeExecMulti(directory, coreIndex, name, classification_indices, k_folds, - type, pathF, labels_dictionary, - random_state, labels, hyper_param_search="", metrics=None, - n_iter=1, **arguments): - return ["Multi", arguments] - - -def fakeInitMulti(args, benchmark, views, views_indices, argument_dictionaries, - random_state, directory, resultsMonoview, - classification_indices): - return {"monoview": [{"try": 0}, {"try2": 100}], - "multiview": [{"try3": 5}, {"try4": 10}]} - - -class FakeKfold(): - def __init__(self): - self.n_splits = 2 - pass - - def split(self, X, Y): - return [([X[0], X[1]], [X[2], X[3]]), (([X[2], X[3]], [X[0], X[1]]))] - - -class Test_set_element(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.dictionary = {"a": - {"b":{ - "c":{ - "d":{ - "e":1, - "f":[1] - } - } - }}} - cls.elements = {"a.b.c.d.e":1, "a.b.c.d.f":[1]} - - @classmethod - def tearDownClass(cls): - pass - - def test_simple(self): - simplified_dict = {} - for path, value in self.elements.items(): - simplified_dict = exec_classif.set_element(simplified_dict, path, value) - self.assertEqual(simplified_dict, self.dictionary) - - -class Test_get_path_dict(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.dictionary = {"a": - {"b":{ - "c":{ - "d":{ - "e":1, - "f":[1] - } - } - }}} - - @classmethod - def tearDownClass(cls): - pass - - def test_simple(self): - path_dict = exec_classif.get_path_dict(self.dictionary) - self.assertEqual(path_dict, {"a.b.c.d.e":1, "a.b.c.d.f":[1]}) - - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/multiview_platform/tests/test_metrics/__init__.py b/multiview_platform/tests/test_metrics/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_metrics/test_metrics.py b/multiview_platform/tests/test_metrics/test_metrics.py deleted file mode 100644 index 301a42d463af85bd67082fdccddeae9e3e9c83c2..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_metrics/test_metrics.py +++ /dev/null @@ -1,29 +0,0 @@ -import unittest -import multiview_platform.mono_multi_view_classifiers.metrics as metrics -import pkgutil -import os -from sklearn.metrics._scorer import _BaseScorer - -# Tester que chaque metrique a bien les bonnes fonctions qui renvoient bien les bons types d'outputs avec les bons types d'inputs -# Faire de meme pour les differents classifeurs monovues et les differents classifeurs multivues - - -class Test_metric(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.test="a" - - - def test_simple(self): - pkgpath = os.path.dirname(metrics.__file__) - for _, metric, _ in pkgutil.iter_modules([pkgpath]): - module = getattr(metrics, metric) - self.assertTrue(hasattr(module, "score")) - 
self.assertTrue(isinstance(module.score([1,0],[1,0]), float)) - self.assertTrue(hasattr(module, "get_scorer")) - self.assertTrue(isinstance(module.get_scorer(), _BaseScorer)) - self.assertTrue(hasattr(module, "get_config")) - self.assertTrue(isinstance(module.get_config(), str)) - - diff --git a/multiview_platform/tests/test_mono_view/__init__.py b/multiview_platform/tests/test_mono_view/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_mono_view/test_exec_classif_mono_view.py b/multiview_platform/tests/test_mono_view/test_exec_classif_mono_view.py deleted file mode 100644 index 784bac2a394c614d1693a343a9e039ca20ef4e06..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_mono_view/test_exec_classif_mono_view.py +++ /dev/null @@ -1,245 +0,0 @@ -import os -import unittest - -import h5py -import numpy as np -from sklearn.model_selection import StratifiedKFold - -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset - -from multiview_platform.mono_multi_view_classifiers.monoview import exec_classif_mono_view -from multiview_platform.mono_multi_view_classifiers.monoview_classifiers import decision_tree - - -class Test_initConstants(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - os.mkdir(tmp_path) - cls.view_name="test_dataset" - cls.datasetFile = h5py.File( - tmp_path+"test.hdf5", "w") - cls.random_state = np.random.RandomState(42) - cls.args = {"classifier_name": "test_clf"} - cls.X_value = cls.random_state.randint(0, 500, (10, 20)) - cls.X = cls.datasetFile.create_dataset("View0", data=cls.X_value) - cls.X.attrs["name"] = "test_dataset" - cls.X.attrs["sparse"] = False - cls.classification_indices = [np.array([0, 2, 4, 6, 8]), - np.array([1, 3, 5, 7, 9]), - np.array([1, 3, 5, 7, 9])] - cls.labels_names = ["test_true", "test_false"] - cls.name = "test" - cls.directory = os.path.join(tmp_path, "test_dir/") - - def test_simple(cls): - kwargs, \ - t_start, \ - feat, \ - CL_type, \ - X, \ - learningRate, \ - labelsString, \ - output_file_name,\ - directory,\ - base_file_name = exec_classif_mono_view.init_constants(cls.args, - cls.X, - cls.classification_indices, - cls.labels_names, - cls.name, - cls.directory, - cls.view_name) - cls.assertEqual(kwargs, cls.args) - cls.assertEqual(feat, "test_dataset") - cls.assertEqual(CL_type, "test_clf") - np.testing.assert_array_equal(X, cls.X_value) - cls.assertEqual(learningRate, 0.5) - cls.assertEqual(labelsString, "test_true-test_false") - # cls.assertEqual(output_file_name, "Code/tests/temp_tests/test_dir/test_clf/test_dataset/results-test_clf-test_true-test_false-learnRate0.5-test-test_dataset-") - - @classmethod - def tearDownClass(cls): - os.remove(tmp_path+"test.hdf5") - os.rmdir( - tmp_path+"test_dir/test_clf/test_dataset") - os.rmdir(tmp_path+"test_dir/test_clf") - os.rmdir(tmp_path+"test_dir") - os.rmdir(tmp_path) - - -class Test_initTrainTest(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - cls.random_state = np.random.RandomState(42) - cls.X = cls.random_state.randint(0, 500, (10, 5)) - cls.Y = cls.random_state.randint(0, 2, 10) - cls.classification_indices = [np.array([0, 2, 4, 6, 8]), - np.array([1, 3, 5, 7, 9]), - ] - - def test_simple(cls): - X_train, y_train, X_test, y_test = exec_classif_mono_view.init_train_test( - cls.X, cls.Y, cls.classification_indices) - - np.testing.assert_array_equal(X_train, np.array( - [np.array([102, 435, 
348, 270, 106]), - np.array([466, 214, 330, 458, 87]), - np.array([149, 308, 257, 343, 491]), - np.array([276, 160, 459, 313, 21]), - np.array([58, 169, 475, 187, 463])])) - np.testing.assert_array_equal(X_test, np.array( - [np.array([71, 188, 20, 102, 121]), - np.array([372, 99, 359, 151, 130]), - np.array([413, 293, 385, 191, 443]), - np.array([252, 235, 344, 48, 474]), - np.array([270, 189, 445, 174, 445])])) - np.testing.assert_array_equal(y_train, np.array([0, 0, 1, 0, 0])) - np.testing.assert_array_equal(y_test, np.array([1, 1, 0, 0, 0])) - - -class Test_getHPs(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - os.mkdir(tmp_path) - cls.classifierModule = decision_tree - cls.hyper_param_search = "Random" - cls.classifier_name = "decision_tree" - cls.random_state = np.random.RandomState(42) - cls.X = cls.random_state.randint(0,10,size=(10,5)) - cls.y = cls.random_state.randint(0,2,size=10) - cls.output_file_name = tmp_path - cls.cv = StratifiedKFold(n_splits=2, random_state=cls.random_state, shuffle=True) - cls.nb_cores = 1 - cls.metrics = {"accuracy_score*": {}} - cls.kwargs = {"decision_tree" : {"max_depth": 1, - "criterion": "gini", - "splitter": "best"}} - cls.classifier_class_name = "DecisionTree" - cls.hps_kwargs = {"n_iter": 2} - - @classmethod - def tearDownClass(cls): - for file_name in os.listdir(tmp_path): - os.remove( - os.path.join(tmp_path, file_name)) - os.rmdir(tmp_path) - - def test_simple(self): - kwargs = exec_classif_mono_view.get_hyper_params(self.classifierModule, - self.hyper_param_search, - self.classifier_name, - self.classifier_class_name, - self.X, - self.y, - self.random_state, - self.output_file_name, - self.cv, - self.nb_cores, - self.metrics, - self.kwargs, - **self.hps_kwargs) - def test_simple_config(self): - kwargs = exec_classif_mono_view.get_hyper_params(self.classifierModule, - "None", - self.classifier_name, - self.classifier_class_name, - self.X, - self.y, - self.random_state, - self.output_file_name, - self.cv, - self.nb_cores, - self.metrics, - self.kwargs, - **self.hps_kwargs) - - -class Test_exec_monoview(unittest.TestCase): - - def test_simple(self): - os.mkdir(tmp_path) - out = exec_classif_mono_view.exec_monoview(tmp_path, - test_dataset.get_v(0), - test_dataset.get_labels(), - "test dataset", - ["yes", "no"], - [np.array([0,1,2,4]), np.array([4])], - StratifiedKFold(n_splits=2), - 1, - "", - "", - np.random.RandomState(42), - "Random", - n_iter=2, - **{"classifier_name":"decision_tree", - "view_index":0, - "decision_tree":{}}) - rm_tmp() - -# class Test_getKWARGS(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.classifierModule = None -# cls.hyper_param_search = "None" -# cls.nIter = 2 -# cls.CL_type = "string" -# cls.X_train = np.zeros((10,20)) -# cls.y_train = np.zeros((10)) -# cls.random_state = np.random.RandomState(42) -# cls.outputFileName = "test_file" -# cls.KFolds = None -# cls.nbCores = 1 -# cls.metrics = {"accuracy_score":""} -# cls.kwargs = {} -# -# def test_simple(cls): -# clKWARGS = ExecClassifMonoView.getHPs(cls.classifierModule, -# cls.hyper_param_search, -# cls.nIter, -# cls.CL_type, -# cls.X_train, -# cls.y_train, -# cls.random_state, -# cls.outputFileName, -# cls.KFolds, -# cls.nbCores, -# cls.metrics, -# cls.kwargs) -# pass -# -# class Test_saveResults(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.stringAnalysis = "string analysis" -# cls.outputFileName = "test_file" -# cls.full_labels_pred = np.zeros(10) -# cls.y_train_pred = np.ones(5) -# cls.y_train = 
np.zeros(5) -# cls.imagesAnalysis = {} -# -# def test_simple(cls): -# ExecClassifMonoView.saveResults(cls.stringAnalysis, -# cls.outputFileName, -# cls.full_labels_pred, -# cls.y_train_pred, -# cls.y_train, -# cls.imagesAnalysis) -# # Test if the files are created with the right content -# -# def test_with_image_analysis(cls): -# cls.imagesAnalysis = {"test_image":"image.png"} # Image to gen -# ExecClassifMonoView.saveResults(cls.stringAnalysis, -# cls.outputFileName, -# cls.full_labels_pred, -# cls.y_train_pred, -# cls.y_train, -# cls.imagesAnalysis) -# # Test if the files are created with the right content -# diff --git a/multiview_platform/tests/test_mono_view/test_monoview_utils.py b/multiview_platform/tests/test_mono_view/test_monoview_utils.py deleted file mode 100644 index b0f414ba102a1e55e8882d26052c1af695518695..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_mono_view/test_monoview_utils.py +++ /dev/null @@ -1,51 +0,0 @@ -import unittest - -import numpy as np -from sklearn.model_selection import StratifiedKFold -from sklearn.tree import DecisionTreeClassifier - -from multiview_platform.mono_multi_view_classifiers.monoview import monoview_utils -from multiview_platform.mono_multi_view_classifiers.utils.hyper_parameter_search import CustomRandint - -class TestFunctions(unittest.TestCase): - - def test_gen_test_folds_preds(self): - self.random_state = np.random.RandomState(42) - self.X_train = self.random_state.random_sample((31, 10)) - self.y_train = np.ones(31, dtype=int) - self.KFolds = StratifiedKFold(n_splits=3, ) - - self.estimator = DecisionTreeClassifier(max_depth=1) - - self.y_train[15:] = -1 - testFoldsPreds = monoview_utils.gen_test_folds_preds(self.X_train, - self.y_train, - self.KFolds, - self.estimator) - self.assertEqual(testFoldsPreds.shape, (3, 10)) - np.testing.assert_array_equal(testFoldsPreds[0], np.array( - [ 1, 1, -1, -1, 1, 1, -1, 1, -1, 1])) - - def test_change_label_to_minus(self): - lab = monoview_utils.change_label_to_minus(np.array([0,1,0])) - np.testing.assert_array_equal(lab, np.array([-1,1,-1])) - - def test_change_label_to_zero(self): - lab = monoview_utils.change_label_to_zero(np.array([-1,1,-1])) - np.testing.assert_array_equal(lab, np.array([0,1,0])) - - def test_compute_possible_combinations(self): - n_possib = monoview_utils.compute_possible_combinations({"a":[1, 2], "b":{"c":[2,3]}, "d":CustomRandint(0,10)}) - np.testing.assert_array_equal(n_possib, np.array([2, np.inf, 10])) - -class FakeClf(monoview_utils.BaseMonoviewClassifier): - - def __init__(self): - pass - - -class TestBaseMonoviewClassifier(unittest.TestCase): - - def test_simple(self): - name = FakeClf().get_name_for_fusion() - self.assertEqual(name, 'Fake') diff --git a/multiview_platform/tests/test_monoview_classifiers/__init__.py b/multiview_platform/tests/test_monoview_classifiers/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_monoview_classifiers/test_adaboost.py b/multiview_platform/tests/test_monoview_classifiers/test_adaboost.py deleted file mode 100644 index 94f5f835d05d4292a57998d9a3175d0afe6fec89..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_monoview_classifiers/test_adaboost.py +++ /dev/null @@ -1,80 +0,0 @@ -# import unittest -# import numpy as np -# from sklearn.tree import DecisionTreeClassifier -# -# from ...mono_multi_view_classifiers.monoview_classifiers import Adaboost -# -# -# class 
Test_canProbas(unittest.TestCase): -# -# def test_simple(cls): -# cls.assertTrue(Adaboost.canProbas()) -# -# -# class Test_paramsToSet(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.n_iter = 4 -# cls.random_state = np.random.RandomState(42) -# -# def test_simple(cls): -# res = Adaboost.paramsToSet(cls.n_iter, cls.random_state) -# cls.assertEqual(len(res), cls.n_iter) -# cls.assertEqual(type(res[0][0]), int) -# cls.assertEqual(type(res[0][1]), type(DecisionTreeClassifier())) -# cls.assertEqual([7,4,13,11], [resIter[0] for resIter in res]) -# -# -# class Test_getKWARGS(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.kwargs_list = [("CL_Adaboost_n_est", 10), -# ("CL_Adaboost_b_est", DecisionTreeClassifier())] -# -# def test_simple(cls): -# res = Adaboost.getKWARGS(cls.kwargs_list) -# cls.assertIn("0", res) -# cls.assertIn("1", res) -# cls.assertEqual(type(res), dict) -# cls.assertEqual(res["0"], 10) -# # Can't test decision tree -# -# def test_wrong(cls): -# cls.kwargs_list[0] = ("chicken_is_heaven",42) -# with cls.assertRaises(ValueError) as catcher: -# Adaboost.getKWARGS(cls.kwargs_list) -# exception = catcher.exception -# # cls.assertEqual(exception, "Wrong arguments served to Adaboost") -# -# -# class Test_randomizedSearch(unittest.TestCase): -# -# def test_simple(cls): -# pass # Test with simple params -# -# -# class Test_fit(unittest.TestCase): -# -# def setUp(self): -# self.random_state = np.random.RandomState(42) -# self.dataset = self.random_state.randint(0, 100, (10, 5)) -# self.labels = self.random_state.randint(0, 2, 10) -# self.kwargs = {"0": 5} -# self.classifier = Adaboost.fit(self.dataset, self.labels, 42, NB_CORES=1, **self.kwargs) -# -# def test_fit_kwargs_string(self): -# self.kwargs = {"0": "5"} -# classifier = Adaboost.fit(self.dataset, self.labels, 42, NB_CORES=1, **self.kwargs) -# self.assertEqual(classifier.n_estimators, 5) -# -# def test_fit_kwargs_int(self): -# self.kwargs = {"0": 5} -# classifier = Adaboost.fit(self.dataset, self.labels, 42, NB_CORES=1, **self.kwargs) -# self.assertEqual(classifier.n_estimators, 5) -# -# def test_fit_labels(self): -# predicted_labels = self.classifier.predict(self.dataset) -# np.testing.assert_array_equal(predicted_labels, self.labels) -# diff --git a/multiview_platform/tests/test_monoview_classifiers/test_compatibility.py b/multiview_platform/tests/test_monoview_classifiers/test_compatibility.py deleted file mode 100644 index 91c566df14d478c9e8040955d94e70b96bf75df0..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_monoview_classifiers/test_compatibility.py +++ /dev/null @@ -1,157 +0,0 @@ -# import os -# import unittest -# - -# Actuellement problématique a cause de la pep8isation du code. 
A voir plus tard - - -# import numpy as np -# -# from ...mono_multi_view_classifiers import monoview_classifiers -# -# -# class Test_methods(unittest.TestCase): -# -# def test_simple(self): -# for fileName in os.listdir( -# "multiview_platform/mono_multi_view_classifiers/monoview_classifiers"): -# if fileName[-3:] == ".py" and fileName != "__init__.py": -# monoview_classifier_module = getattr(monoview_classifiers, -# fileName[:-3]) -# self.assertIn("formatCmdArgs", dir(monoview_classifier_module), -# fileName[ -# :-3] + " must have getKWARGS method implemented") -# self.assertIn("paramsToSet", dir(monoview_classifier_module), -# fileName[ -# :-3] + " must have randomizedSearch method implemented") -# #test to be changed find name of class not same name of module -# # self.assertIn(fileName[:-3], dir(monoview_classifier_module), -# # fileName[ -# # :-3] + " must have it's own class implemented") -# -# monoview_classifier_class = getattr(monoview_classifier_module, -# fileName[:-3]) -# self.assertTrue( -# hasattr(monoview_classifier_class, "getInterpret"), -# fileName[:-3] + " class must have getInterpret implemented") -# self.assertTrue( -# hasattr(monoview_classifier_class, "canProbas", ), -# fileName[:-3] + " class must have canProbas implemented") -# monoview_classifier_instance = monoview_classifier_class() -# self.assertTrue( -# hasattr(monoview_classifier_instance, "param_names", ), -# fileName[:-3] + " class must have param_names attribute") -# self.assertTrue( -# hasattr(monoview_classifier_instance, "classed_params", ), -# fileName[:-3] + " class must have classed_params attribute") -# self.assertTrue( -# hasattr(monoview_classifier_instance, "distribs", ), -# fileName[:-3] + " class must have distribs attribute") -# self.assertTrue( -# hasattr(monoview_classifier_instance, "weird_strings", ), -# fileName[:-3] + " class must have weird_strings attribute") -# # check_estimator(monoview_classifier_instance) -# -# -# class Test_canProbas(unittest.TestCase): -# -# def test_outputs(self): -# for fileName in os.listdir( -# "multiview_platform/mono_multi_view_classifiers/monoview_classifiers"): -# if fileName[-3:] == ".py" and fileName != "__init__.py": -# monoview_classifier_module = getattr(monoview_classifiers, -# fileName[:-3]) -# monoview_classifier_class = getattr(monoview_classifier_module, -# fileName[:-3])() -# res = monoview_classifier_class.canProbas() -# self.assertEqual(type(res), bool, -# "canProbas must return a boolean") -# -# def test_inputs(self): -# for fileName in os.listdir( -# "multiview_platform/mono_multi_view_classifiers/monoview_classifiers"): -# if fileName[-3:] == ".py" and fileName != "__init__.py": -# monoview_classifier_module = getattr(monoview_classifiers, -# fileName[:-3]) -# monoview_classifier_class = getattr(monoview_classifier_module, -# fileName[:-3])() -# with self.assertRaises(TypeError, -# msg="canProbas must have 0 args") as catcher: -# monoview_classifier_class.canProbas(35) -# -# -# class Test_fit(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.random_state = np.random.RandomState(42) -# cls.dataset = cls.random_state.random_sample((10, 20)) -# cls.labels = cls.random_state.randint(0, 2, 10) -# -# # def test_inputs(cls): -# # # DATASET, CLASS_LABELS, random_state, NB_CORES=1, **kwargs -# # for fileName in os.listdir("Code/mono_multi_view_classifiers/monoview_classifiers"): -# # if fileName[-3:] == ".py" and fileName != "__init__.py": -# # monoview_classifier_module = getattr(monoview_classifiers, fileName[:-3]) -# # 
cls.args = dict((str(index), value) for index, value in -# # enumerate(monoview_classifier_module.paramsToSet(1, cls.random_state)[0])) -# # res = monoview_classifier_module.fit(cls.dataset, cls.labels, cls.random_state, **cls.args) -# # with cls.assertRaises(TypeError, msg="fit must have 3 positional args, one kwarg") as catcher: -# # monoview_classifier_module.fit() -# # monoview_classifier_module.fit(cls.dataset) -# # monoview_classifier_module.fit(cls.dataset,cls.labels) -# # monoview_classifier_module.fit(cls.dataset,cls.labels, cls.random_state, 1, 10) -# -# # def test_outputs(cls): -# # for fileName in os.listdir("Code/mono_multi_view_classifiers/monoview_classifiers"): -# # if fileName[-3:] == ".py" and fileName != "__init__.py": -# # monoview_classifier_module = getattr(monoview_classifiers, fileName[:-3]) -# # cls.args = dict((str(index), value) for index, value in -# # enumerate(monoview_classifier_module.paramsToSet(1, cls.random_state)[0])) -# # res = monoview_classifier_module.fit(cls.dataset, cls.labels, cls.random_state, **cls.args) -# # cls.assertIn("predict", dir(res), "fit must return an object able to predict") -# -# -# class Test_paramsToSet(unittest.TestCase): -# -# def test_inputs(self): -# for fileName in os.listdir( -# "multiview_platform/mono_multi_view_classifiers/monoview_classifiers"): -# if fileName[-3:] == ".py" and fileName != "__init__.py": -# monoview_classifier_module = getattr(monoview_classifiers, -# fileName[:-3]) -# with self.assertRaises(TypeError, -# msg="paramsToSet must have 2 positional args") as catcher: -# monoview_classifier_module.paramsToSet(2, -# np.random.RandomState( -# 42), 10) -# monoview_classifier_module.paramsToSet(2) -# monoview_classifier_module.paramsToSet() -# res = monoview_classifier_module.paramsToSet(2, -# np.random.RandomState( -# 42)) -# -# def test_outputs(self): -# for fileName in os.listdir( -# "multiview_platform/mono_multi_view_classifiers/monoview_classifiers"): -# if fileName[-3:] == ".py" and fileName != "__init__.py": -# monoview_classifier_module = getattr(monoview_classifiers, -# fileName[:-3]) -# res = monoview_classifier_module.paramsToSet(2, -# np.random.RandomState( -# 42)) -# self.assertEqual(type(res), list) -# self.assertEqual(len(res), 2) -# self.assertEqual(type(res[0]), dict) -# -# # class Test_getKWARGS(unittest.TestCase): -# # -# # # TODO : Find a way to enter the right args -# # -# # def test_inputs(self): -# # for fileName in os.listdir("Code/mono_multi_view_classifiers/monoview_classifiers"): -# # if fileName[-3:] == ".py" and fileName != "__init__.py": -# # monoview_classifier_module = getattr(monoview_classifiers, fileName[:-3]) -# # with self.assertRaises(TypeError, msg="getKWARGS must have 1 positional args") as catcher: -# # monoview_classifier_module.getKWARGS() -# # monoview_classifier_module.getKWARGS([1],2) diff --git a/multiview_platform/tests/test_multi_view/__init__.py b/multiview_platform/tests/test_multi_view/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_multi_view/test_exec_multiview.py b/multiview_platform/tests/test_multi_view/test_exec_multiview.py deleted file mode 100644 index e0e6d872a215820518b098715fe297dd9f7c1fce..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multi_view/test_exec_multiview.py +++ /dev/null @@ -1,90 +0,0 @@ -import os -import unittest - -import h5py -import numpy as np -from sklearn.model_selection import 
StratifiedKFold - -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset - -from multiview_platform.mono_multi_view_classifiers.multiview import exec_multiview - - -class Test_init_constants(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - os.mkdir(tmp_path) - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_simple(self): - classifier_name, t_start, views_indices, \ - classifier_config, views, learning_rate, labels, output_file_name, \ - directory, base_file_name, metrics = exec_multiview.init_constants( - kwargs={"view_names":["ViewN0", "ViewN2", "ViewN1", ], - "view_indices": [0,2,1], - "classifier_name":"test_clf", - "test_clf":{}}, - classification_indices=[np.array([0,1,4,2]), np.array([3])], - metrics={"accuracy_score*":{}}, - name="test_dataset", - nb_cores=1, - k_folds=StratifiedKFold(n_splits=2), - dataset_var=test_dataset, - directory=tmp_path - ) - self.assertEqual(classifier_name, "test_clf") - self.assertEqual(views_indices, [0,2,1]) - self.assertEqual(classifier_config, {}) - self.assertEqual(views, ["ViewN0", "ViewN2", "ViewN1", ]) - self.assertEqual(learning_rate, 4/5) - - def test_exec_multiview_no_hps(self): - res = exec_multiview.exec_multiview( - directory=tmp_path, - dataset_var=test_dataset, - name="test_dataset", - classification_indices=[np.array([0,1,4,2]), np.array([3])], - k_folds=StratifiedKFold(n_splits=2), - nb_cores=1, - database_type="", path="", - labels_dictionary={0:"yes", 1:"no"}, - random_state=np.random.RandomState(42), - labels=test_dataset.get_labels(), - hps_method="None", - hps_kwargs={}, - metrics=None, - n_iter=30, - **{"view_names":["ViewN0", "ViewN2", "ViewN1", ], - "view_indices": [0,2,1], - "classifier_name":"weighted_linear_early_fusion", - "weighted_linear_early_fusion":{}} - ) - - def test_exec_multiview(self): - res = exec_multiview.exec_multiview( - directory=tmp_path, - dataset_var=test_dataset, - name="test_dataset", - classification_indices=[np.array([0,1,4,2]), np.array([3])], - k_folds=StratifiedKFold(n_splits=2), - nb_cores=1, - database_type="", path="", - labels_dictionary={0:"yes", 1:"no"}, - random_state=np.random.RandomState(42), - labels=test_dataset.get_labels(), - hps_method="Grid", - hps_kwargs={"param_grid": - {"monoview_classifier_config":[{"max_depth":3}, {"max_depth":1}]}, - }, - metrics=None, - n_iter=30, - **{"view_names":["ViewN0", "ViewN2", "ViewN1", ], - "view_indices": [0,2,1], - "classifier_name":"weighted_linear_early_fusion", - "weighted_linear_early_fusion":{}} - ) \ No newline at end of file diff --git a/multiview_platform/tests/test_multi_view/test_multiview_utils.py b/multiview_platform/tests/test_multi_view/test_multiview_utils.py deleted file mode 100644 index 6cb880637e3d415844199fb103c6122184c3a143..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multi_view/test_multiview_utils.py +++ /dev/null @@ -1,89 +0,0 @@ -import os -import unittest - -import h5py -import numpy as np -from sklearn.model_selection import StratifiedKFold - -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset - -from multiview_platform.mono_multi_view_classifiers.multiview import multiview_utils - - -class FakeMVClassif(multiview_utils.BaseMultiviewClassifier): - - def __init__(self, mc=True): - self.mc=mc - pass - - def fit(self, X, y): - if not self.mc: - raise ValueError - else: - pass - - - -class TestBaseMultiviewClassifier(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - - 
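# Illustrative sketch only (not SUMMIT's BaseMultiviewClassifier code): the
# FakeMVClassif defined above and the test_accepts_multiclass test below
# suggest that multiclass support is probed by fitting on a small synthetic
# problem with more than two classes and treating a raised ValueError as
# "binary only". The function name, its signature and the _AlwaysFits /
# _BinaryOnly stand-ins are assumptions made for this sketch.
import numpy as np

def accepts_multi_class(classifier, random_state, n_samples=10, n_classes=3):
    # Build a tiny synthetic dataset that is guaranteed to contain n_classes labels.
    fake_X = random_state.random_sample((n_samples, 4))
    fake_y = np.arange(n_samples) % n_classes
    try:
        classifier.fit(fake_X, fake_y)
        return True
    except ValueError:
        return False

class _AlwaysFits:
    # Mirrors FakeMVClassif(mc=True): fitting never complains.
    def fit(self, X, y):
        return self

class _BinaryOnly:
    # Mirrors FakeMVClassif(mc=False): refuses more than two classes.
    def fit(self, X, y):
        if np.unique(y).shape[0] > 2:
            raise ValueError("binary only")
        return self

assert accepts_multi_class(_AlwaysFits(), np.random.RandomState(42))
assert not accepts_multi_class(_BinaryOnly(), np.random.RandomState(42))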
@classmethod - def tearDownClass(cls): - rm_tmp() - - def test_accepts_multiclass(self): - rs = np.random.RandomState(42) - accepts = FakeMVClassif().accepts_multi_class(rs) - self.assertEqual(accepts, True) - accepts = FakeMVClassif(mc=False).accepts_multi_class(rs) - self.assertEqual(accepts, False) - self.assertRaises(ValueError, FakeMVClassif(mc=False).accepts_multi_class, rs,**{"n_samples":2, "n_classes":3}) - -class TestConfigGenerator(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.rs = np.random.RandomState(42) - - def test_simple(self): - cfg_gen = multiview_utils.ConfigGenerator(["decision_tree", "decision_tree"]) - sample = cfg_gen.rvs(self.rs) - self.assertEqual(sample, {'decision_tree': {'criterion': 'entropy', - 'max_depth': 103, - 'splitter': 'best'}}) - -class TestFunctions(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - cls.rs = np.random.RandomState(42) - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_get_available_monoview_classifiers(self): - avail = multiview_utils.get_available_monoview_classifiers() - self.assertEqual(avail, ['adaboost', - 'decision_tree', - 'gradient_boosting', - 'knn', - 'lasso', - 'random_forest', - 'sgd', - 'svm_linear', - 'svm_poly', - 'svm_rbf']) - avail = multiview_utils.get_available_monoview_classifiers(need_probas=True) - self.assertEqual(avail, ['adaboost', - 'decision_tree', - 'gradient_boosting', - 'knn', - 'random_forest', - 'svm_linear', - 'svm_poly', - 'svm_rbf']) diff --git a/multiview_platform/tests/test_multiview_classifiers/Test_PseudoCQMeasure/__init__.py b/multiview_platform/tests/test_multiview_classifiers/Test_PseudoCQMeasure/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_multiview_classifiers/Test_PseudoCQMeasure/test_PseudoCQFusionModule.py b/multiview_platform/tests/test_multiview_classifiers/Test_PseudoCQMeasure/test_PseudoCQFusionModule.py deleted file mode 100644 index 65e22eb8f7dff86aec92af8d1c7adc9e21838d49..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multiview_classifiers/Test_PseudoCQMeasure/test_PseudoCQFusionModule.py +++ /dev/null @@ -1,22 +0,0 @@ -# import unittest -# -# import numpy as np -# -# from ....mono_multi_view_classifiers.multiview_classifiers.entropy_fusion_old import EntropyFusionModule -# -# class Test_entropy(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.classifiersDecisions = np.array([ -# [np.random.randint(0,2,(2,5)), [[0,0,1,0,1], [0,1,0,1,0]], np.random.randint(0,2,(2,5)), np.random.randint(0,2,(2,5)), np.random.randint(0,2,(2,5))], -# [np.random.randint(0,2, (2, 5)), np.random.randint(0,2, (2, 5)), np.random.randint(0,2, (2, 5)), [[0, 0, 1, 1, 0], [0, 1, 0, 1, 0]], np.random.randint(0,2, (2, 5))], -# [np.random.randint(0,2, (2, 5)), np.random.randint(0,2, (2, 5)), np.random.randint(0,2, (2, 5)), np.random.randint(0,2, (2, 5)), [[0, 1, 1, 1, 1], [0, 1, 0, 1, 0]]], -# ]) -# cls.combination = [1,3,4] -# cls.foldsGroudTruth = np.array([[1,1,0,0,1], [0,1,0,1,0]]) -# cls.foldsLen = "" -# -# def test_simple(cls): -# entropy_score = EntropyFusionModule.entropy(cls.classifiersDecisions, cls.combination, cls.foldsGroudTruth,cls.foldsLen) -# cls.assertEqual(entropy_score, 0.15, 'Wrong values for entropy measure') diff --git a/multiview_platform/tests/test_multiview_classifiers/__init__.py b/multiview_platform/tests/test_multiview_classifiers/__init__.py 
deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_multiview_classifiers/test_additions/__init__.py b/multiview_platform/tests/test_multiview_classifiers/test_additions/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_multiview_classifiers/test_additions/test_diversity_utils.py b/multiview_platform/tests/test_multiview_classifiers/test_additions/test_diversity_utils.py deleted file mode 100644 index debdc51b799833cf87064c8dfe788f49c4dda879..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multiview_classifiers/test_additions/test_diversity_utils.py +++ /dev/null @@ -1,74 +0,0 @@ -import unittest -import numpy as np - -import multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions.diversity_utils as du - - -class FakeDataset(): - - def __init__(self, views, labels): - self.nb_views = views.shape[0] - self.dataset_length = views.shape[2] - self.views = views - self.labels = labels - - def get_v(self, view_index, example_indices): - return self.views[view_index, example_indices] - - def get_nb_class(self, example_indices): - return np.unique(self.labels[example_indices]) - - -class FakeDivCoupleClf(du.CoupleDiversityFusionClassifier): - - def __init__(self, rs, classifier_names=None, - classifiers_config=None, monoview_estimators=None): - super(FakeDivCoupleClf, self).__init__(random_state=rs, - classifier_names=classifier_names, - classifier_configs=classifiers_config, - monoview_estimators=monoview_estimators) - self.rs = rs - - def diversity_measure(self, a, b, c): - return self.rs.randint(0,100) - - -class FakeDivGlobalClf(du.GlobalDiversityFusionClassifier): - - def __init__(self, rs, classifier_names=None, - classifiers_config=None, monoview_estimators=None): - super(FakeDivGlobalClf, self).__init__(random_state=rs, - classifier_names=classifier_names, - classifier_configs=classifiers_config, - monoview_estimators=monoview_estimators) - self.rs = rs - - def diversity_measure(self, a, b, c): - return self.rs.randint(0,100) - -class Test_DiversityFusion(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.classifier_names = ["adaboost", "decision_tree"] - cls.classifiers_config = {"adaboost":{"n_estimators":5,}} - cls.random_state = np.random.RandomState(42) - cls.y = cls.random_state.randint(0,2,6) - cls.X = FakeDataset(cls.random_state.randint(0,100,(2,5,6)), cls.y) - cls.train_indices = [0,1,2,4] - cls.views_indices = [0,1] - - def test_simple_couple(self): - clf = FakeDivCoupleClf(self.random_state, classifier_names=self.classifier_names, - classifiers_config=self.classifiers_config) - clf.fit(self.X, self.y, self.train_indices, self.views_indices) - - def test_simple_global(self): - clf = FakeDivGlobalClf(self.random_state, - classifier_names=self.classifier_names, - classifiers_config=self.classifiers_config) - clf.fit(self.X, self.y, self.train_indices, self.views_indices) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/multiview_platform/tests/test_multiview_classifiers/test_additions/test_jumbo_fusion_utils.py b/multiview_platform/tests/test_multiview_classifiers/test_additions/test_jumbo_fusion_utils.py deleted file mode 100644 index 9e242ed89bd067148b0d4caa5da39f4057d04c26..0000000000000000000000000000000000000000 --- 
a/multiview_platform/tests/test_multiview_classifiers/test_additions/test_jumbo_fusion_utils.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest -import numpy as np - -import multiview_platform.mono_multi_view_classifiers.multiview_classifiers.additions.jumbo_fusion_utils as ju - - -class FakeDataset(): - - def __init__(self, views, labels): - self.nb_views = views.shape[0] - self.dataset_length = views.shape[2] - self.views = views - self.labels = labels - - def get_v(self, view_index, example_indices): - return self.views[view_index, example_indices] - - def get_nb_class(self, example_indices): - return np.unique(self.labels[example_indices]) - - -#TODO \ No newline at end of file diff --git a/multiview_platform/tests/test_multiview_classifiers/test_difficulty_fusion.py b/multiview_platform/tests/test_multiview_classifiers/test_difficulty_fusion.py deleted file mode 100644 index ff298b8c07ed5a8225f27885ad911b935bbd17ba..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multiview_classifiers/test_difficulty_fusion.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -import numpy as np - -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers import difficulty_fusion - - -class Test_difficulty_fusion(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state=np.random.RandomState(42) - cls.classifiers_decisions = cls.random_state.randint(0, 2, size=(5, 3, 5)) - cls.combination = [1, 3, 4] - cls.y = np.array([1, 1, 0, 0, 1]) - cls.difficulty_fusion_clf = difficulty_fusion.DifficultyFusion() - - def test_simple(cls): - difficulty_measure = cls.difficulty_fusion_clf.diversity_measure( - cls.classifiers_decisions, - cls.combination, - cls.y) - cls.assertAlmostEqual(difficulty_measure, 0.1875) diff --git a/multiview_platform/tests/test_multiview_classifiers/test_disagree_fusion.py b/multiview_platform/tests/test_multiview_classifiers/test_disagree_fusion.py deleted file mode 100644 index bb08e016e5ea75ac78f373ead873e8721eea31e3..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multiview_classifiers/test_disagree_fusion.py +++ /dev/null @@ -1,23 +0,0 @@ -# # import unittest -# -import numpy as np -import unittest -# -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers import disagree_fusion - - -class Test_disagree(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.monoview_decision_1 = np.array([0, 0, 1, 1]) - cls.monoview_decision_2 = np.array([0, 1, 0, 1]) - cls.ground_truth = None - cls.clf = disagree_fusion.DisagreeFusion() - - def test_simple(cls): - disagreement = cls.clf.diversity_measure(cls.monoview_decision_1, - cls.monoview_decision_2, - cls.ground_truth) - np.testing.assert_array_equal(disagreement, - np.array([False, True, True, False])) diff --git a/multiview_platform/tests/test_multiview_classifiers/test_diversity_utils.py b/multiview_platform/tests/test_multiview_classifiers/test_diversity_utils.py deleted file mode 100644 index 46c9e59652d29787f2a10a3faca09f796c300f72..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multiview_classifiers/test_diversity_utils.py +++ /dev/null @@ -1,42 +0,0 @@ -# -# import numpy as np -# import unittest -# -# from multiview_platform.mono_multi_view_classifiers.multiview.additions import \ -# diversity_utils -# -# -# def fake_measure(a, b, c, d, e): -# return 42 -# -# -# class Test_global_div_measure(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.random_state = 
np.random.RandomState(42) -# cls.allClassifiersNames = [["SCM", "SVM", "DT"], ["SCM", "SVM", "DT"]] -# cls.views_indices = np.array([0, 1]) -# cls.classifiersDecisions = np.zeros( -# (cls.views_indices.shape[0], len(cls.allClassifiersNames), 3, 6), -# dtype=int) -# for classifer_index, classifier in enumerate(cls.allClassifiersNames): -# for view_index, view in enumerate(cls.views_indices): -# cls.classifiersDecisions[ -# view_index, classifer_index] = np.array([ -# cls.random_state.randint(0, 2, 6), -# cls.random_state.randint(0, 2, 6), -# cls.random_state.randint(0, 2, 6)]) -# cls.folds_ground_truth = np.array( -# [np.array([1, 1, 1, 0, 0, 0]) for _ in range(3)]) -# cls.classification_indices = np.array([]) -# cls.measurement = fake_measure -# -# def test_simple(cls): -# clf_names, diversity_measure = diversity_utils.global_div_measure( -# cls.allClassifiersNames, -# cls.classifiersDecisions, -# cls.measurement, -# cls.folds_ground_truth) -# cls.assertEqual(len(clf_names), 2) -# cls.assertEqual(diversity_measure, 42) diff --git a/multiview_platform/tests/test_multiview_classifiers/test_double_fault_fusion.py b/multiview_platform/tests/test_multiview_classifiers/test_double_fault_fusion.py deleted file mode 100644 index 7e6fd3d70bd5ad84ec62aee0b3e00c8ec336cc34..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multiview_classifiers/test_double_fault_fusion.py +++ /dev/null @@ -1,22 +0,0 @@ - -import numpy as np -import unittest - -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers import double_fault_fusion - - -class Test_disagree(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.monoview_decision_1 = np.array([0, 0, 0, 0, 1, 1, 1, 1]) - cls.monoview_decision_2 = np.array([0, 0, 1, 1, 0, 0, 1, 1]) - cls.ground_truth = np.array([0, 1, 0, 1, 0, 1, 0, 1]) - cls.clf = double_fault_fusion.DoubleFaultFusion() - - def test_simple(cls): - double_fault = cls.clf.diversity_measure(cls.monoview_decision_1, - cls.monoview_decision_2, - cls.ground_truth) - np.testing.assert_array_equal(double_fault, - np.array([False, True, False, False, False, False, True, False])) diff --git a/multiview_platform/tests/test_multiview_classifiers/test_entropy_fusion.py b/multiview_platform/tests/test_multiview_classifiers/test_entropy_fusion.py deleted file mode 100644 index dc88bfcc3070b57707f4f8931a55ea1c337f468d..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_multiview_classifiers/test_entropy_fusion.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest - -import numpy as np - -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers import entropy_fusion - - -class Test_difficulty_fusion(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state=np.random.RandomState(42) - cls.classifiers_decisions = cls.random_state.randint(0, 2, size=(5, 3, 5)) - cls.combination = [1, 3, 4] - cls.y = np.array([1, 1, 0, 0, 1]) - cls.clf = entropy_fusion.EntropyFusion() - - def test_simple(cls): - entropy = cls.clf.diversity_measure( - cls.classifiers_decisions, - cls.combination, - cls.y) - cls.assertAlmostEqual(entropy, 0.2) diff --git a/multiview_platform/tests/test_multiview_classifiers/test_weighted_linear_early_fusion.py b/multiview_platform/tests/test_multiview_classifiers/test_weighted_linear_early_fusion.py deleted file mode 100644 index c86607525dc958adf14907a8d3da190682a6adb8..0000000000000000000000000000000000000000 --- 
a/multiview_platform/tests/test_multiview_classifiers/test_weighted_linear_early_fusion.py +++ /dev/null @@ -1,67 +0,0 @@ -import unittest - -import numpy as np -import os - -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset - -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers import \ - weighted_linear_early_fusion - -class Test_WeightedLinearEarlyFusion(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - cls.random_state = np.random.RandomState(42) - cls.view_weights = [0.5, 0.5] - cls.monoview_classifier_name = "decision_tree" - cls.monoview_classifier_config = {"max_depth":1, "criterion": "gini", "splitter": "best"} - cls.classifier = weighted_linear_early_fusion.WeightedLinearEarlyFusion( - random_state=cls.random_state, view_weights=cls.view_weights, - monoview_classifier_name=cls.monoview_classifier_name, - monoview_classifier_config=cls.monoview_classifier_config) - cls.dataset = test_dataset - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_simple(self): - np.testing.assert_array_equal(self.view_weights, self.classifier.view_weights) - - def test_fit(self): - self.assertRaises(AttributeError, getattr, - self.classifier.monoview_classifier, "classes_") - self.classifier.fit(self.dataset, test_dataset.get_labels(), None, None) - np.testing.assert_array_equal(self.classifier.monoview_classifier.classes_, - np.array([0,1])) - - def test_predict(self): - self.classifier.fit(self.dataset, test_dataset.get_labels(), None, None) - predicted_labels = self.classifier.predict(self.dataset, None, None) - np.testing.assert_array_equal(predicted_labels, test_dataset.get_labels()) - - def test_transform_data_to_monoview_simple(self): - example_indices, X = self.classifier.transform_data_to_monoview(self.dataset, - None, None) - self.assertEqual(X.shape, (5,12)) - np.testing.assert_array_equal(X, np.concatenate((self.dataset.get_v(0), self.dataset.get_v(1)), axis=1)) - np.testing.assert_array_equal(example_indices, np.arange(5)) - - def test_transform_data_to_monoview_view_select(self): - example_indices, X = self.classifier.transform_data_to_monoview( - self.dataset, - None, np.array([0])) - self.assertEqual(X.shape, (5, 6)) - np.testing.assert_array_equal(X, self.dataset.get_v(0)) - np.testing.assert_array_equal(example_indices, np.arange(5)) - - def test_transform_data_to_monoview_example_view_select(self): - example_indices, X = self.classifier.transform_data_to_monoview( - self.dataset, - np.array([1,2,3]), np.array([0])) - self.assertEqual(X.shape, (3, 6)) - np.testing.assert_array_equal(X, self.dataset.get_v(0)[np.array([1,2,3]), :]) - np.testing.assert_array_equal(example_indices, np.array([1,2,3])) - diff --git a/multiview_platform/tests/test_result_analysis/__init__.py b/multiview_platform/tests/test_result_analysis/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_result_analysis/test_duration_analysis.py b/multiview_platform/tests/test_result_analysis/test_duration_analysis.py deleted file mode 100644 index efe6c68d792fe1d638821ea70f649a342f41a664..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_result_analysis/test_duration_analysis.py +++ /dev/null @@ -1,41 +0,0 @@ -import unittest -import numpy as np -import pandas as pd - -from multiview_platform.mono_multi_view_classifiers.result_analysis import duration_analysis - -class FakeClassifierResult: - - 
def __init__(self, i=0): - self.i=i - if i == 0: - self.hps_duration = 10 - self.fit_duration = 12 - self.pred_duration = 15 - else: - self.hps_duration = 1 - self.fit_duration = 2 - self.pred_duration = 5 - - - def get_classifier_name(self): - if self.i == 0: - return 'test1' - else: - return 'test2' - - - -class Test_get_duration(unittest.TestCase): - - def test_simple(self): - results = [FakeClassifierResult(), FakeClassifierResult(i=1)] - durs = duration_analysis.get_duration(results) - pd.testing.assert_frame_equal(durs, - pd.DataFrame(index=['test1', 'test2'], - columns=['hps', 'fit', 'pred'], - data=np.array([np.array([10,12,15]), - np.array([1,2,5])]), - dtype=object)) - - diff --git a/multiview_platform/tests/test_result_analysis/test_error_analysis.py b/multiview_platform/tests/test_result_analysis/test_error_analysis.py deleted file mode 100644 index 07ec87c2e4b8d96dc93db8d92e63d346ddcfcf2b..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_result_analysis/test_error_analysis.py +++ /dev/null @@ -1,76 +0,0 @@ -import unittest -import numpy as np - -from multiview_platform.mono_multi_view_classifiers.monoview.monoview_utils import MonoviewResult -from multiview_platform.mono_multi_view_classifiers.multiview.multiview_utils import MultiviewResult - -from multiview_platform.mono_multi_view_classifiers.result_analysis.error_analysis import get_example_errors, gen_error_data, gen_error_data_glob - - -class Test_get_example_errors(unittest.TestCase): - - def test_simple(self): - ground_truth = np.array([0,1,0,1,0,1,0,1, -100]) - results = [MultiviewResult("mv", "", {"accuracy_score": [0.7, 0.75], - "f1_score": [0.71, 0.76]}, - np.array([0,0,0,0,1,1,1,1,1]), - 0,0,0, {}), - MonoviewResult(0, - "dt", - "1", - {"accuracy_score": [0.8, 0.85], - "f1_score": [0.81, 0.86]} - , np.array([0,0,1,1,0,0,1,1,0]), "", "", - "", "",0,0, {}) - ] - example_errors = get_example_errors(ground_truth, - results) - self.assertIsInstance(example_errors, dict) - np.testing.assert_array_equal(example_errors["mv"], - np.array([1,0,1,0,0,1,0,1,-100])) - np.testing.assert_array_equal(example_errors["dt-1"], - np.array([1, 0, 0, 1, 1, 0, 0, 1,-100])) - -class Test_gen_error_data(unittest.TestCase): - - def test_simple(self): - random_state = np.random.RandomState(42) - ada_data = random_state.randint(0,2,size=7) - mv_data = random_state.randint(0, 2, size=7) - example_errors = {"ada-1": ada_data, - "mv": mv_data} - nb_classifiers, nb_examples, classifiers_names, \ - data_2d, error_on_examples = gen_error_data(example_errors) - self.assertEqual(nb_classifiers, 2) - self.assertEqual(nb_examples, 7) - self.assertEqual(classifiers_names, ["ada-1", "mv"]) - np.testing.assert_array_equal(data_2d, np.array([ada_data, mv_data]).transpose()) - np.testing.assert_array_equal(error_on_examples, (ada_data+mv_data)/nb_classifiers) - - - -class Test_gen_error_data_glob(unittest.TestCase): - - def test_simple(self): - random_state = np.random.RandomState(42) - - ada_error_data_1 = random_state.randint(0,2,7) - ada_error_data_2 = random_state.randint(0, 2, 7) - ada_sum = ada_error_data_1+ada_error_data_2 - mv_error_data_1 = random_state.randint(0, 2, 7) - mv_error_data_2 = random_state.randint(0, 2, 7) - mv_sum = mv_error_data_1+mv_error_data_2 - - combi_results = {"ada-1":ada_sum, "mv": mv_sum} - - stats_iter = 2 - - nb_examples, nb_classifiers, \ - data, error_on_examples, \ - classifier_names = gen_error_data_glob(combi_results, - stats_iter) - self.assertEqual(nb_examples, 7) - 
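# Hedged sketch of the behaviour exercised by Test_get_example_errors above
# (an illustration, not the platform's implementation): each classifier's
# error vector holds 1 where the prediction matches the ground truth, 0 where
# it does not, and leaves the -100 marker from the ground truth untouched.
# The helper name example_errors is invented for this sketch.
import numpy as np

def example_errors(ground_truth, predictions):
    # 1 = correct prediction, 0 = error.
    errors = (predictions == ground_truth).astype(int)
    # Keep the -100 sentinel wherever the ground truth carries it.
    errors[ground_truth == -100] = -100
    return errors

truth = np.array([0, 1, 0, 1, 0, 1, 0, 1, -100])
mv_pred = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1])
assert np.array_equal(example_errors(truth, mv_pred),
                      np.array([1, 0, 1, 0, 0, 1, 0, 1, -100]))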
self.assertEqual(nb_classifiers, 2) - np.testing.assert_array_equal(data, np.array([ada_sum, mv_sum]).transpose()) - np.testing.assert_array_equal(error_on_examples, np.sum(np.array([ada_sum, mv_sum]), axis=0)/(nb_classifiers*stats_iter)) - self.assertEqual(classifier_names, ["ada-1", "mv"]) \ No newline at end of file diff --git a/multiview_platform/tests/test_result_analysis/test_execution.py b/multiview_platform/tests/test_result_analysis/test_execution.py deleted file mode 100644 index f42f818c48a2e774c23a51b75542a5b9b1cd76f9..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_result_analysis/test_execution.py +++ /dev/null @@ -1,139 +0,0 @@ -import unittest -import numpy as np -import pandas as pd -import os - -from multiview_platform.mono_multi_view_classifiers.monoview.monoview_utils import MonoviewResult -from multiview_platform.mono_multi_view_classifiers.multiview.multiview_utils import MultiviewResult - -from multiview_platform.mono_multi_view_classifiers.result_analysis.execution import format_previous_results, get_arguments, analyze_iterations -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset - - -class FakeClassifierResult: - - def __init__(self, i=1): - self.classifier_name='test'+str(i) - self.full_labels_pred = np.array([0,1,1,2,1]) - self.hps_duration=i - self.fit_duration=i - self.pred_duration=i - - def get_classifier_name(self): - return self.classifier_name - -class Test_format_previous_results(unittest.TestCase): - - def test_simple(self): - iter_results = {"metrics_scores":[], "example_errors":[], "feature_importances":[], "labels":[], "durations":[], "class_metrics_scores":[]} - random_state = np.random.RandomState(42) - - # Gen metrics data - metrics_1_data = random_state.uniform(size=(2,2)) - metrics_2_data = random_state.uniform(size=(2,2)) - metric_1_df = pd.DataFrame(data=metrics_1_data, index=["train", "test"], - columns=["ada-1", "mv"]) - metric_2_df = pd.DataFrame(data=metrics_2_data, index=["train", "test"], - columns=["ada-1", "mv"]) - iter_results["metrics_scores"].append({"acc": metric_1_df}) - iter_results["metrics_scores"].append({"acc": metric_2_df}) - - # Gen error data - ada_error_data_1 = random_state.randint(0,2,7) - ada_error_data_2 = random_state.randint(0, 2, 7) - ada_sum = ada_error_data_1+ada_error_data_2 - mv_error_data_1 = random_state.randint(0, 2, 7) - mv_error_data_2 = random_state.randint(0, 2, 7) - mv_sum = mv_error_data_1+mv_error_data_2 - iter_results["example_errors"].append({}) - iter_results["example_errors"].append({}) - iter_results["example_errors"][0]["ada-1"] = ada_error_data_1 - iter_results["example_errors"][0]["mv"] = mv_error_data_1 - iter_results["example_errors"][1]["ada-1"] = ada_error_data_2 - iter_results["example_errors"][1]["mv"] = mv_error_data_2 - - iter_results["durations"].append(pd.DataFrame(index=["ada-1", "mv"], - columns=["plif", "plaf"], - data=np.zeros((2,2)))) - iter_results["durations"].append(pd.DataFrame(index=["ada-1", "mv"], - columns=["plif", - "plaf"], - data=np.ones((2, 2)))) - - # Running the function - metric_analysis, class_met, error_analysis, \ - feature_importances, feature_stds, \ - labels, durations_mean, duration_std = format_previous_results(iter_results) - mean_df = pd.DataFrame(data=np.mean(np.array([metrics_1_data, - metrics_2_data]), - axis=0), - index=["train", "test"], - columns=["ada-1", "mvm"]) - std_df = pd.DataFrame(data=np.std(np.array([metrics_1_data, - metrics_2_data]), - axis=0), - index=["train", "test"], - 
columns=["ada-1", "mvm"]) - - # Testing - np.testing.assert_array_equal(metric_analysis["acc"]["mean"].loc["train"], - mean_df.loc["train"]) - np.testing.assert_array_equal(metric_analysis["acc"]["mean"].loc["test"], - mean_df.loc["test"]) - np.testing.assert_array_equal(metric_analysis["acc"]["std"].loc["train"], - std_df.loc["train"]) - np.testing.assert_array_equal(metric_analysis["acc"]["std"].loc["test"], - std_df.loc["test"]) - np.testing.assert_array_equal(ada_sum, error_analysis["ada-1"]) - np.testing.assert_array_equal(mv_sum, error_analysis["mv"]) - self.assertEqual(durations_mean.at["ada-1", 'plif'], 0.5) - -class Test_get_arguments(unittest.TestCase): - - def setUp(self): - self.benchamrk_argument_dictionaries = [{"flag":"good_flag", "valid":True}, - {"flag":"bad_flag", "valid":False}] - - def test_benchmark_wanted(self): - argument_dict = get_arguments(self.benchamrk_argument_dictionaries, "good_flag") - self.assertTrue(argument_dict["valid"]) - - -class Test_analyze_iterations(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - cls.results = [[0, [FakeClassifierResult(), FakeClassifierResult(i=2)], []], [1, [FakeClassifierResult(), FakeClassifierResult(i=2)], []]] - cls.benchmark_argument_dictionaries = [{"labels_dictionary":{0:"zero",1:"one",2:"two"}, "flag":0, "directory":tmp_path, "args":{"name":"test_dataset"}},{"labels_dictionary":{0:"zero",1:"one",2:"two"}, "flag":1, "directory":tmp_path, "args":{"name":"test_dataset"}} ] - cls.stats_iter = 2 - cls.metrics = {} - cls.example_ids = ['ex1', 'ex5','ex4','ex3','ex2',] - cls.labels = np.array([0,1,2,1,1]) - - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_simple(self): - analysis = analyze_iterations(self.results, - self.benchmark_argument_dictionaries, - self.stats_iter, - self.metrics, - self.example_ids, - self.labels) - res, iter_res, tracebacks, labels_names = analysis - self.assertEqual(labels_names, ['zero', 'one', 'two']) - - self.assertEqual(iter_res['class_metrics_scores'], [{}, {}]) - - pd.testing.assert_frame_equal(iter_res['durations'][0], pd.DataFrame(index=['test1','test2'], - columns=['hps', 'fit', 'pred'], - data=np.array([1,1,1,2,2,2]).reshape((2,3)), dtype=object)) - np.testing.assert_array_equal(iter_res['example_errors'][0]['test1'], np.array([1, 1, 0, 0, 1])) - self.assertEqual(iter_res["feature_importances"], [{},{}]) - np.testing.assert_array_equal(iter_res['labels'], np.array([0, 1, 2, 1, 1])) - self.assertEqual(iter_res['metrics_scores'], [{},{}]) - - diff --git a/multiview_platform/tests/test_result_analysis/test_feature_importances.py b/multiview_platform/tests/test_result_analysis/test_feature_importances.py deleted file mode 100644 index 2a69e88c2bcbb036611d4a3006a69ac144b22004..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_result_analysis/test_feature_importances.py +++ /dev/null @@ -1,36 +0,0 @@ -import unittest -import numpy as np -import pandas as pd - -from multiview_platform.mono_multi_view_classifiers.result_analysis import feature_importances -from multiview_platform.mono_multi_view_classifiers.monoview.monoview_utils import MonoviewResult - -class FakeClassifier: - def __init__(self, i=0): - self.feature_importances_ = [i, i+1] - -class FakeClassifierResult(MonoviewResult): - - def __init__(self, i=0): - self.i=i - self.hps_duration = i*10 - self.fit_duration = (i+2)*10 - self.pred_duration = (i+5)*10 - self.clf = FakeClassifier(i) - self.view_name = 'testview'+str(i) - self.classifier_name = "test"+str(i) - 
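# Hedged sketch (not the actual get_feature_importances code) of the grouping
# that the feature-importance test below checks: monoview results are grouped
# by view, and each view gets a DataFrame with one column per classifier
# holding that classifier's feature_importances_. The fake_results tuples are
# an assumption made for this illustration.
import numpy as np
import pandas as pd

fake_results = [("testview0", "test0", [0, 1]),
                ("testview1", "test1", [1, 2])]
feature_importances = {}
for view_name, clf_name, importances in fake_results:
    # One DataFrame per view, one column per classifier on that view.
    view_frame = feature_importances.setdefault(view_name, pd.DataFrame())
    view_frame[clf_name] = np.array(importances)
assert list(feature_importances["testview1"]["test1"]) == [1, 2]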
- def get_classifier_name(self): - return self.classifier_name - - - -class Test_get_duration(unittest.TestCase): - - def test_simple(self): - results = [FakeClassifierResult(), FakeClassifierResult(i=1)] - feat_importance = feature_importances.get_feature_importances(results) - pd.testing.assert_frame_equal(feat_importance["testview1"], - pd.DataFrame(index=None,columns=['test1'], - data=np.array([1,2]).reshape((2,1)), - )) \ No newline at end of file diff --git a/multiview_platform/tests/test_result_analysis/test_metric_analysis.py b/multiview_platform/tests/test_result_analysis/test_metric_analysis.py deleted file mode 100644 index a34f06a462784b1358b0af57c48df95da62fbd82..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_result_analysis/test_metric_analysis.py +++ /dev/null @@ -1,180 +0,0 @@ -import unittest -import numpy as np -import pandas as pd -import os - -from multiview_platform.mono_multi_view_classifiers.monoview.monoview_utils import MonoviewResult -from multiview_platform.mono_multi_view_classifiers.multiview.multiview_utils import MultiviewResult - -from multiview_platform.mono_multi_view_classifiers.result_analysis.metric_analysis import get_metrics_scores, init_plot, get_fig_size, sort_by_test_score - -class Test_get_metrics_scores(unittest.TestCase): - - - def test_simple(self): - metrics = {"accuracy_score*":{},"f1_score":{}} - results = [MonoviewResult(0, - "ada", - "0", - {"accuracy_score*":[0.9, 0.95], - "f1_score":[0.91, 0.96]} - , "", "", "", "", "",0,0,{})] - metrics_scores, class_met = get_metrics_scores(metrics, - results, []) - self.assertIsInstance(metrics_scores, dict) - self.assertIsInstance(metrics_scores["accuracy_score*"], pd.DataFrame) - np.testing.assert_array_equal(np.array(metrics_scores["accuracy_score*"].loc["train"]), np.array([0.9])) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score*"].loc["test"]), - np.array([0.95])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["train"]), - np.array([0.91])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["test"]), - np.array([0.96])) - np.testing.assert_array_equal(np.array(metrics_scores["f1_score"].columns), - np.array(["ada-0"])) - - def test_multiple_monoview_classifiers(self): - metrics = {"accuracy_score*":{},"f1_score":{}} - results = [MonoviewResult(view_index=0, - classifier_name="ada", - view_name="0", - metrics_scores={"accuracy_score*": [0.9, 0.95], - "f1_score": [0.91, 0.96]}, - full_labels_pred="", - classifier_config="", - classifier="", - n_features="", - hps_duration=0, - fit_duration=0, - pred_duration=0, - class_metric_scores={}), - MonoviewResult(view_index=0, - classifier_name="dt", - view_name="1", - metrics_scores={"accuracy_score*": [0.8, 0.85], - "f1_score": [0.81, 0.86]}, - full_labels_pred="", - classifier_config="", - classifier="", - n_features="", - hps_duration=0, - fit_duration=0, - pred_duration=0, - class_metric_scores={}) - ] - metrics_scores, class_met = get_metrics_scores(metrics, - results, []) - self.assertIsInstance(metrics_scores, dict) - self.assertIsInstance(metrics_scores["accuracy_score*"], pd.DataFrame) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score*"].loc["train"]), - np.array([0.9, 0.8])) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score*"].loc["test"]), - np.array([0.95, 0.85])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["train"]), - np.array([0.91, 0.81])) - 
np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["test"]), - np.array([0.96, 0.86])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].columns), - np.array(["ada-0", "dt-1"])) - - def test_mutiview_result(self): - metrics = {"accuracy_score*":{},"f1_score":{}} - results = [MultiviewResult("mv", "", {"accuracy_score*": [0.7, 0.75], - "f1_score": [0.71, 0.76]}, "",0,0,0, {}), - MonoviewResult(view_index=0, - classifier_name="dt", - view_name="1", - metrics_scores={"accuracy_score*": [0.8, 0.85], - "f1_score": [0.81, 0.86]}, - full_labels_pred="", - classifier_config="", - classifier="", - n_features="", - hps_duration=0, - fit_duration=0, - pred_duration=0, - class_metric_scores={}) - ] - metrics_scores, class_met = get_metrics_scores(metrics, - results, []) - self.assertIsInstance(metrics_scores, dict) - self.assertIsInstance(metrics_scores["accuracy_score*"], pd.DataFrame) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score*"].loc["train"]), - np.array([0.7, 0.8])) - np.testing.assert_array_equal( - np.array(metrics_scores["accuracy_score*"].loc["test"]), - np.array([0.75, 0.85])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["train"]), - np.array([0.71, 0.81])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].loc["test"]), - np.array([0.76, 0.86])) - np.testing.assert_array_equal( - np.array(metrics_scores["f1_score"].columns), - np.array(["mv", "dt-1"])) - - -class Test_init_plot(unittest.TestCase): - - def test_simple(self): - results = [] - metric_name = "acc" - data = np.random.RandomState(42).uniform(0,1,(2,2)) - metric_dataframe = pd.DataFrame(index=["train", "test"], - columns=["dt-1", "mv"], data=data) - directory = "dir" - database_name = 'db' - labels_names = ['lb1', "lb2"] - class_met = metric_dataframe = pd.DataFrame(index=["train", "test"], - columns=["dt-1", "mv"], data=data) - train, test, classifier_names, \ - file_name, nb_results, results, class_test = init_plot(results, - metric_name, - metric_dataframe, - directory, - database_name, - class_met) - self.assertEqual(file_name, os.path.join("dir", "db-acc")) - np.testing.assert_array_equal(train, data[0,:]) - np.testing.assert_array_equal(test, data[1, :]) - np.testing.assert_array_equal(classifier_names, np.array(["dt-1", "mv"])) - self.assertEqual(nb_results, 2) - self.assertEqual(results, [["dt-1", "acc", data[1,0], 0.0, data[1,0]], - ["mv", "acc", data[1,1], 0.0, data[1,1]]]) - - -class Test_small_func(unittest.TestCase): - - def test_fig_size(self): - kw, width = get_fig_size(5) - self.assertEqual(kw, {"figsize":(15,5)}) - self.assertEqual(width, 0.35) - kw, width = get_fig_size(100) - self.assertEqual(kw, {"figsize": (100, 100/3)}) - self.assertEqual(width, 0.35) - - def test_sort_by_test_scores(self): - train_scores = np.array([1,2,3,4]) - test_scores = np.array([4, 3, 2, 1]) - train_STDs = np.array([1, 2, 3, 4]) - test_STDs = np.array([1, 2, 3, 4]) - names = np.array(['1', '2', '3', '4']) - sorted_names, sorted_train_scores, \ - sorted_test_scores, sorted_train_STDs, \ - sorted_test_STDs = sort_by_test_score(train_scores, test_scores, - names, train_STDs, test_STDs) - np.testing.assert_array_equal(sorted_names, np.array(['4', '3', '2', '1'])) - np.testing.assert_array_equal(sorted_test_scores, [1, 2, 3, 4]) - np.testing.assert_array_equal(sorted_test_STDs, [4, 3, 2, 1]) - np.testing.assert_array_equal(sorted_train_scores, [4, 3, 2, 1]) - np.testing.assert_array_equal(sorted_train_STDs, [4, 3, 2, 
1]) \ No newline at end of file diff --git a/multiview_platform/tests/test_result_analysis/test_noise_analysis.py b/multiview_platform/tests/test_result_analysis/test_noise_analysis.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_result_analysis/test_tracebacks_analysis.py b/multiview_platform/tests/test_result_analysis/test_tracebacks_analysis.py deleted file mode 100644 index 61296f85e01400c823ecf7bc384a8f2751d20f4a..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_result_analysis/test_tracebacks_analysis.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -import numpy as np -import pandas as pd -import os - -from multiview_platform.mono_multi_view_classifiers.result_analysis import tracebacks_analysis -from multiview_platform.tests.utils import tmp_path, rm_tmp - -class FakeClassifierResult: - - def __init__(self, i=0): - self.i=i - if i == 0: - self.hps_duration = 10 - self.fit_duration = 12 - self.pred_duration = 15 - else: - self.hps_duration = 1 - self.fit_duration = 2 - self.pred_duration = 5 - - - def get_classifier_name(self): - if self.i == 0: - return 'test1' - else: - return 'test2' - - - -class Test_funcs(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - cls.res_file = open(os.path.join(tmp_path,"tmp.txt"), "w") - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_save_dict_to_text(self): - keys = tracebacks_analysis.save_dict_to_text({"a":"i", "b":"j"}, self.res_file) - self.res_file.close() - self.assertEqual(list(keys),["a", "b"]) - with open(os.path.join(tmp_path,"tmp.txt"), 'r') as res_file: - self.assertEqual(res_file.read(), 'Failed algorithms : \n\ta,\n\tb.\n\n\na\n\ni\n\n\nb\n\nj\n\n\n') diff --git a/multiview_platform/tests/test_utils/__init__.py b/multiview_platform/tests/test_utils/__init__.py deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py b/multiview_platform/tests/test_utils/test_GetMultiviewDB.py deleted file mode 100644 index c1068e7be8527b3302b868a8463bcdab0a479c51..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_utils/test_GetMultiviewDB.py +++ /dev/null @@ -1,166 +0,0 @@ -import os -import unittest - -import h5py -import numpy as np - -from multiview_platform.mono_multi_view_classifiers.utils import get_multiview_db -from multiview_platform.tests.utils import rm_tmp, tmp_path - - -class Test_get_classic_db_hdf5(unittest.TestCase): - - def setUp(self): - rm_tmp() - os.mkdir(tmp_path) - self.rs = np.random.RandomState(42) - self.nb_view = 3 - self.file_name = "test.hdf5" - self.nb_examples = 5 - self.nb_class = 3 - self.views = [self.rs.randint(0, 10, size=(self.nb_examples, 7)) - for _ in range(self.nb_view)] - self.labels = self.rs.randint(0, self.nb_class, self.nb_examples) - self.dataset_file = h5py.File(os.path.join(tmp_path, self.file_name), 'w') - self.view_names = ["ViewN" + str(index) for index in - range(len(self.views))] - self.are_sparse = [False for _ in self.views] - for view_index, (view_name, view, is_sparse) in enumerate( - zip(self.view_names, self.views, self.are_sparse)): - view_dataset = self.dataset_file.create_dataset( - "View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = is_sparse - labels_dataset = 
self.dataset_file.create_dataset("Labels", - shape=self.labels.shape, - data=self.labels) - self.labels_names = [str(index) for index in np.unique(self.labels)] - labels_dataset.attrs["names"] = [label_name.encode() - for label_name in self.labels_names] - meta_data_grp = self.dataset_file.create_group("Metadata") - meta_data_grp.attrs["nbView"] = len(self.views) - meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels)) - meta_data_grp.attrs["datasetLength"] = len(self.labels) - - def test_simple(self): - dataset , labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( - ["ViewN2"], tmp_path, self.file_name.split(".")[0], - self.nb_class, ["0", "2"], - self.rs, path_for_new=tmp_path) - self.assertEqual(dataset.nb_view, 1) - self.assertEqual(labels_dictionary, - {0: "0", 1: "2", 2:"1"}) - self.assertEqual(dataset.get_nb_examples(), 5) - self.assertEqual(len(np.unique(dataset.get_labels())), 3) - - - def test_all_views_asked(self): - dataset, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( - None, tmp_path, self.file_name.split(".")[0], - self.nb_class, ["0", "2"], - self.rs, path_for_new=tmp_path) - self.assertEqual(dataset.nb_view, 3) - self.assertEqual(dataset.get_view_dict(), {'ViewN0': 0, 'ViewN1': 1, 'ViewN2': 2}) - - def test_asked_the_whole_dataset(self): - dataset, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_hdf5( - ["ViewN2"], tmp_path, self.file_name.split(".")[0], - self.nb_class, ["0", "2"], - self.rs, path_for_new=tmp_path, full=True) - self.assertEqual(dataset.dataset, self.dataset_file) - - def tearDown(self): - rm_tmp() - - -class Test_get_classic_db_csv(unittest.TestCase): - - def setUp(self): - rm_tmp() - os.mkdir(tmp_path) - self.pathF = tmp_path - self.NB_CLASS = 2 - self.nameDB = "test_dataset" - self.askedLabelsNames = ["test_label_1", "test_label_3"] - self.random_state = np.random.RandomState(42) - self.views = ["test_view_1", "test_view_3"] - np.savetxt(self.pathF + self.nameDB + "-labels-names.csv", - np.array(["test_label_0", "test_label_1", - "test_label_2", "test_label_3"]), fmt="%s", - delimiter=",") - np.savetxt(self.pathF + self.nameDB + "-labels.csv", - self.random_state.randint(0, 4, 10), delimiter=",") - os.mkdir(self.pathF + "Views") - self.datas = [] - for i in range(4): - data = self.random_state.randint(0, 100, (10, 20)) - np.savetxt(self.pathF + "Views/test_view_" + str(i) + ".csv", - data, delimiter=",") - self.datas.append(data) - - - def test_simple(self): - dataset, labels_dictionary, dataset_name = get_multiview_db.get_classic_db_csv( - self.views, self.pathF, self.nameDB, - self.NB_CLASS, self.askedLabelsNames, - self.random_state, delimiter=",", path_for_new=tmp_path) - self.assertEqual(dataset.nb_view, 2) - self.assertEqual(dataset.get_view_dict(), {'test_view_1': 0, 'test_view_3': 1}) - self.assertEqual(labels_dictionary, - {0: "test_label_1", 1: "test_label_3"}) - self.assertEqual(dataset.get_nb_examples(), 3) - self.assertEqual(dataset.get_nb_class(), 2) - - - @classmethod - def tearDown(self): - for i in range(4): - os.remove( - tmp_path+"Views/test_view_" + str( - i) + ".csv") - os.rmdir(tmp_path+"Views") - os.remove( - tmp_path+"test_dataset-labels-names.csv") - os.remove(tmp_path+"test_dataset-labels.csv") - os.remove(tmp_path+"test_dataset.hdf5") - os.remove( - tmp_path+"test_dataset_temp_filter.hdf5") - os.rmdir(tmp_path) - -class Test_get_plausible_db_hdf5(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - cls.path = tmp_path - cls.nb_class=3 - 
cls.rs = np.random.RandomState(42) - cls.nb_view=3 - cls.nb_examples = 5 - cls.nb_features = 4 - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_simple(self): - dataset, labels_dict, name = get_multiview_db.get_plausible_db_hdf5( - "", self.path, "", nb_class=self.nb_class, random_state=self.rs, - nb_view=3, nb_examples=self.nb_examples, - nb_features=self.nb_features) - self.assertEqual(dataset.init_example_indices(), range(5)) - self.assertEqual(dataset.get_nb_class(), self.nb_class) - - def test_two_class(self): - dataset, labels_dict, name = get_multiview_db.get_plausible_db_hdf5( - "", self.path, "", nb_class=2, random_state=self.rs, - nb_view=3, nb_examples=self.nb_examples, - nb_features=self.nb_features) - self.assertEqual(dataset.init_example_indices(), range(5)) - self.assertEqual(dataset.get_nb_class(), 2) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/multiview_platform/tests/test_utils/test_base.py b/multiview_platform/tests/test_utils/test_base.py deleted file mode 100644 index 027da26f44111e18dae9eb1fe77c704b312fbfd7..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_utils/test_base.py +++ /dev/null @@ -1,261 +0,0 @@ -import os -import unittest -import yaml -import numpy as np -from sklearn.tree import DecisionTreeClassifier -from sklearn.model_selection import StratifiedKFold -from sklearn.metrics import accuracy_score, f1_score - -from multiview_platform.tests.utils import rm_tmp, tmp_path -from multiview_platform.mono_multi_view_classifiers.utils import base - - -class FakeClassifier(base.BaseClassifier): - def __init__(self, no_params=False, accepts_mc=True): - if no_params: - self.param_names = [] - self.classed_params = [] - else: - self.param_names = ["test1", "test2"] - self.classed_params = ["test2"] - self.weird_strings = [] - self.accepts_mc = accepts_mc - - def get_params(self, deep=True): - return {"test1": 10, - "test2": "test"} - - def fit(self, X, y): - if np.unique(y).shape[0]>2 and not self.accepts_mc: - raise ValueError('Does not accept MC') - else: - return self - - -class FakeDetector: - def __init__(self): - self.best_params_ = {"test1": 10, - "test2": "test"} - self.cv_results_ = {"param_test1": [10], - "param_test2": ["test"]} - -class FakeResultAnalyzer(base.ResultAnalyser): - - def get_view_specific_info(self): - return "test" - - def get_base_string(self): - return 'test2' - -class Test_ResultAnalyzer(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.rs = np.random.RandomState(42) - cls.classifier = FakeClassifier() - cls.n_examples = 50 - cls.n_classes = 3 - cls.train_length = 24 - cls.train_indices = cls.rs.choice(np.arange(cls.n_examples), - size=cls.train_length, - replace=False) - cls.test_indices = np.array([i for i in range(cls.n_examples) - if i not in cls.train_indices]) - cls.test_length = cls.test_indices.shape[0] - cls.classification_indices = [cls.train_indices, cls.test_indices] - cls.n_splits = 5 - cls.k_folds = StratifiedKFold(n_splits=cls.n_splits, ) - cls.hps_method = "randomized_search" - cls.metrics_list = {"accuracy_score": {}, "f1_score*":{}} - cls.n_iter = 6 - cls.class_label_names = ["class{}".format(ind+1) - for ind in range(cls.n_classes)] - cls.pred = cls.rs.randint(0, cls.n_classes, - size=cls.n_examples) - cls.directory = "fake_directory" - cls.base_file_name = "fake_file" - cls.labels = cls.rs.randint(0, cls.n_classes, - size=cls.n_examples) - cls.database_name = "test_database" - cls.nb_cores = 0.5 - cls.duration = 
-4 - cls.train_accuracy = accuracy_score(cls.labels[cls.train_indices], - cls.pred[cls.train_indices]) - cls.test_accuracy = accuracy_score(cls.labels[cls.test_indices], - cls.pred[cls.test_indices]) - cls.train_f1 = f1_score(cls.labels[cls.train_indices], - cls.pred[cls.train_indices], average='micro') - cls.test_f1 = f1_score(cls.labels[cls.test_indices], - cls.pred[cls.test_indices], average='micro') - - def test_simple(self): - RA = base.ResultAnalyser(self.classifier, self.classification_indices, - self.k_folds, self.hps_method, self.metrics_list, - self.n_iter, self.class_label_names, - self.pred, self.directory, - self.base_file_name, self.labels, - self.database_name, self.nb_cores, - self.duration) - - def test_get_metric_scores(self): - RA = base.ResultAnalyser(self.classifier, self.classification_indices, - self.k_folds, self.hps_method, - self.metrics_list, - self.n_iter, self.class_label_names, - self.pred, - self.directory, self.base_file_name, - self.labels, self.database_name, - self.nb_cores, self.duration) - cl_train, cl_test,train_score, test_score = RA.get_metric_score("accuracy_score", {}) - np.testing.assert_array_equal(train_score, self.train_accuracy) - np.testing.assert_array_equal(test_score, self.test_accuracy) - - def test_get_all_metrics_scores(self): - RA = base.ResultAnalyser(self.classifier, self.classification_indices, - self.k_folds, self.hps_method, - self.metrics_list, - self.n_iter, self.class_label_names, - self.pred, - self.directory, self.base_file_name, - self.labels, self.database_name, - self.nb_cores, self.duration) - RA.get_all_metrics_scores() - self.assertEqual(RA.metric_scores["accuracy_score"][0], - self.train_accuracy) - self.assertEqual(RA.metric_scores["accuracy_score"][1], - self.test_accuracy) - self.assertEqual(RA.metric_scores["f1_score*"][0], - self.train_f1) - self.assertEqual(RA.metric_scores["f1_score*"][1], - self.test_f1) - - def test_print_metrics_scores(self): - RA = base.ResultAnalyser(self.classifier, self.classification_indices, - self.k_folds, self.hps_method, - self.metrics_list, - self.n_iter, self.class_label_names, - self.pred, - self.directory, self.base_file_name, - self.labels, self.database_name, - self.nb_cores, self.duration) - RA.get_all_metrics_scores() - string = RA.print_metric_score() - print(repr(string)) - self.assertEqual(string, '\n\n\tFor Accuracy score using {}, (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\n\tFor F1 score using average: micro, {} (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\nTest set confusion matrix : \n\n╒════════╤══════════╤══════════╤══════════╕\n│ │ class1 │ class2 │ class3 │\n╞════════╪══════════╪══════════╪══════════╡\n│ class1 │ 3 │ 1 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class2 │ 3 │ 2 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class3 │ 3 │ 8 │ 2 │\n╘════════╧══════════╧══════════╧══════════╛\n\n') - - def test_get_db_config_string(self): - RA = FakeResultAnalyzer(self.classifier, self.classification_indices, - self.k_folds, self.hps_method, - self.metrics_list, - self.n_iter, self.class_label_names, - self.pred, - self.directory, self.base_file_name, - self.labels, self.database_name, - self.nb_cores, self.duration) - self.assertEqual(RA.get_db_config_string(), 'Database configuration : \n\t- Database name : test_database\ntest\t- Learning Rate : 0.48\n\t- Labels used : class1, class2, class3\n\t- Number of cross validation folds : 5\n\n') - - def 
test_get_classifier_config_string(self): - RA = base.ResultAnalyser(self.classifier, self.classification_indices, - self.k_folds, self.hps_method, - self.metrics_list, - self.n_iter, self.class_label_names, - self.pred, - self.directory, self.base_file_name, - self.labels, self.database_name, - self.nb_cores, self.duration) - self.assertEqual(RA.get_classifier_config_string(), 'Classifier configuration : \n\t- FakeClassifier with test1 : 10, test2 : test\n\t- Executed on 0.5 core(s) \n\t- Got configuration using randomized search with 6 iterations \n') - - def test_analyze(self): - RA = FakeResultAnalyzer(self.classifier, self.classification_indices, - self.k_folds, self.hps_method, - self.metrics_list, - self.n_iter, self.class_label_names, - self.pred, - self.directory, self.base_file_name, - self.labels, self.database_name, - self.nb_cores, self.duration) - str_analysis, img_analysis, metric_scores, class_metric_scores, conf_mat = RA.analyze() - print(repr(str_analysis)) - self.assertEqual(str_analysis, 'test2Database configuration : \n\t- Database name : test_database\ntest\t- Learning Rate : 0.48\n\t- Labels used : class1, class2, class3\n\t- Number of cross validation folds : 5\n\nClassifier configuration : \n\t- FakeClassifier with test1 : 10, test2 : test\n\t- Executed on 0.5 core(s) \n\t- Got configuration using randomized search with 6 iterations \n\n\n\tFor Accuracy score using {}, (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\n\tFor F1 score using average: micro, {} (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\nTest set confusion matrix : \n\n╒════════╤══════════╤══════════╤══════════╕\n│ │ class1 │ class2 │ class3 │\n╞════════╪══════════╪══════════╪══════════╡\n│ class1 │ 3 │ 1 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class2 │ 3 │ 2 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class3 │ 3 │ 8 │ 2 │\n╘════════╧══════════╧══════════╧══════════╛\n\n\n\n Classification took -1 day, 23:59:56\n\n Classifier Interpretation : \n') - - - -class Test_BaseClassifier(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.base_estimator = "DecisionTreeClassifier" - cls.base_estimator_config = {"max_depth":10, - "splitter": "best"} - cls.est = base.BaseClassifier() - cls.rs = np.random.RandomState(42) - - def test_simple(self): - base_estim = self.est.get_base_estimator(self.base_estimator, - self.base_estimator_config) - self.assertTrue(isinstance(base_estim, DecisionTreeClassifier)) - self.assertEqual(base_estim.max_depth, 10) - self.assertEqual(base_estim.splitter, "best") - - def test_gen_best_params(self): - fake_class = FakeClassifier() - best_params = fake_class.gen_best_params(FakeDetector()) - self.assertEqual(best_params, {"test1":10, "test2":"test"}) - - def test_gen_params_from_detector(self): - fake_class = FakeClassifier() - params = fake_class.gen_params_from_detector(FakeDetector()) - self.assertEqual(params, [("test1",np.array([10])), - ("test2",np.array(["str"], dtype='<U3'))]) - params = FakeClassifier(no_params=True).gen_params_from_detector(FakeDetector()) - self.assertEqual(params, [()]) - - def test_params_to_string(self): - fake_class = FakeClassifier() - string = fake_class.params_to_string() - self.assertEqual(string, "test1 : 10, test2 : test") - - def test_get_iterpret(self): - fake_class = FakeClassifier() - self.assertEqual("", fake_class.get_interpretation("", "", "",)) - - def test_accepts_mutliclass(self): - accepts = 
FakeClassifier().accepts_multi_class(self.rs) - self.assertEqual(accepts, True) - accepts = FakeClassifier(accepts_mc=False).accepts_multi_class(self.rs) - self.assertEqual(accepts, False) - self.assertRaises(ValueError, FakeClassifier().accepts_multi_class, self.rs, **{"n_samples":2}) - - - def test_class(self): - base_estimator = DecisionTreeClassifier(max_depth=15, splitter="random") - base_estim = self.est.get_base_estimator(base_estimator, - self.base_estimator_config) - self.assertTrue(isinstance(base_estim, DecisionTreeClassifier)) - self.assertEqual(base_estim.max_depth, 10) - self.assertEqual(base_estim.splitter, "best") - - def test_wrong_args(self): - base_estimator_config = {"n_estimators": 10, - "splitter": "best"} - with self.assertRaises(TypeError): - base_estim = self.est.get_base_estimator(self.base_estimator, - base_estimator_config) - - def test_get_config(self): - conf = FakeClassifier(no_params=True).get_config() - self.assertEqual(conf, 'FakeClassifier with no config.') - -class Test_Functions(unittest.TestCase): - - def test_get_name(self): - classed_list = ["test", 42] - np.testing.assert_array_equal(base.get_names(classed_list), - np.array(["str", "int"], dtype="<U3")) - - - def test_get_metric(self): - from multiview_platform.mono_multi_view_classifiers.metrics import accuracy_score - metrics_dict = {"accuracy_score*":{}} - self.assertEqual(base.get_metric(metrics_dict), (accuracy_score, {})) - diff --git a/multiview_platform/tests/test_utils/test_configuration.py b/multiview_platform/tests/test_utils/test_configuration.py deleted file mode 100644 index dc1fed6ccc7288d0988f52f921ed23b8179dfb68..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_utils/test_configuration.py +++ /dev/null @@ -1,63 +0,0 @@ -import os -import unittest -import yaml -import numpy as np - -from multiview_platform.tests.utils import rm_tmp, tmp_path -from multiview_platform.mono_multi_view_classifiers.utils import configuration - - -class Test_get_the_args(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - cls.path_to_config_file = tmp_path+"config_temp.yml" - path_file = os.path.dirname(os.path.abspath(__file__)) - make_tmp_dir = os.path.join(path_file, "../tmp_tests") - os.mkdir(make_tmp_dir) - data = {"log": 10, "name":[12.5, 1e-06], "type":True} - with open(cls.path_to_config_file, "w") as config_file: - yaml.dump(data, config_file) - - @classmethod - def tearDownClass(cls): - os.remove(tmp_path+"config_temp.yml") - os.rmdir(tmp_path) - - def test_file_loading(self): - config_dict = configuration.get_the_args(self.path_to_config_file) - self.assertEqual(type(config_dict), dict) - - def test_dict_format(self): - config_dict = configuration.get_the_args(self.path_to_config_file) - self.assertIn("log", config_dict) - self.assertIn("name", config_dict) - - def test_arguments(self): - config_dict = configuration.get_the_args(self.path_to_config_file) - self.assertEqual(config_dict["log"], 10) - self.assertEqual(config_dict["name"], [12.5, 1e-06]) - self.assertEqual(config_dict["type"], True) - -class Test_save_config(unittest.TestCase): - @classmethod - def setUpClass(cls): - rm_tmp() - path_file = os.path.dirname(os.path.abspath(__file__)) - make_tmp_dir = os.path.join(path_file, "../tmp_tests") - os.mkdir(make_tmp_dir) - - def test_simple(self): - configuration.save_config(tmp_path, {"test":10}) - with open(os.path.join(tmp_path,"config_file.yml" ), 'r') as stream: - yaml_config = yaml.safe_load(stream) - 
self.assertEqual(yaml_config,{"test":10} ) - - @classmethod - def tearDownClass(cls): - os.remove(os.path.join(tmp_path, "config_file.yml")) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/multiview_platform/tests/test_utils/test_dataset.py b/multiview_platform/tests/test_utils/test_dataset.py deleted file mode 100644 index 76644bcbddc227877a398e99c3f47f22d82cd22e..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_utils/test_dataset.py +++ /dev/null @@ -1,423 +0,0 @@ -import unittest -import h5py -import numpy as np -import os - -from multiview_platform.tests.utils import rm_tmp, tmp_path -from multiview_platform.mono_multi_view_classifiers.utils import dataset - - -class Test_Dataset(unittest.TestCase): - - @classmethod - def setUpClass(cls): - rm_tmp() - os.mkdir(tmp_path) - cls.rs = np.random.RandomState(42) - cls.nb_view = 3 - cls.file_name = "test.hdf5" - cls.nb_examples = 5 - cls.nb_attr = 7 - cls.nb_class = 3 - cls.views = [cls.rs.randint(0, 10, size=(cls.nb_examples, cls.nb_attr)) - for _ in range(cls.nb_view)] - cls.labels = cls.rs.randint(0, cls.nb_class, cls.nb_examples) - cls.dataset_file = h5py.File(os.path.join(tmp_path, cls.file_name), "w") - cls.view_names = ["ViewN" + str(index) for index in range(len(cls.views))] - cls.are_sparse = [False for _ in cls.views] - for view_index, (view_name, view, is_sparse) in enumerate( - zip(cls.view_names, cls.views, cls.are_sparse)): - view_dataset = cls.dataset_file.create_dataset("View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = is_sparse - labels_dataset = cls.dataset_file.create_dataset("Labels", - shape=cls.labels.shape, - data=cls.labels) - cls.labels_names = [str(index) for index in np.unique(cls.labels)] - labels_dataset.attrs["names"] = [label_name.encode() - for label_name in cls.labels_names] - meta_data_grp = cls.dataset_file.create_group("Metadata") - meta_data_grp.attrs["nbView"] = len(cls.views) - meta_data_grp.attrs["nbClass"] = len(np.unique(cls.labels)) - meta_data_grp.attrs["datasetLength"] = len(cls.labels) - - @classmethod - def tearDownClass(cls): - cls.dataset_file.close() - - def test_get_shape(self): - dataset_object = dataset.HDF5Dataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - file_name="from_scratch" + self.file_name, - view_names=self.view_names, - path=tmp_path, - labels_names=self.labels_names) - shape = dataset_object.get_shape(0) - self.assertEqual(shape, (5,7)) - - def test_to_numpy_array(self): - dataset_object = dataset.HDF5Dataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - file_name="from_scratch" + self.file_name, - view_names=self.view_names, - path=tmp_path, - labels_names=self.labels_names) - array, limits = dataset_object.to_numpy_array(view_indices=[0,1,2]) - - self.assertEqual(array.shape, (5, 21)) - - def test_filter(self): - """Had to create a new dataset to aviod playing with the class one""" - file_name = "test_filter.hdf5" - dataset_file_filter = h5py.File(os.path.join(tmp_path, file_name), "w") - for view_index, (view_name, view, is_sparse) in enumerate( - zip(self.view_names, self.views, self.are_sparse)): - view_dataset = dataset_file_filter.create_dataset( - "View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = is_sparse - labels_dataset = dataset_file_filter.create_dataset("Labels", - shape=self.labels.shape, 
- data=self.labels) - labels_dataset.attrs["names"] = [label_name.encode() - for label_name in self.labels_names] - meta_data_grp = dataset_file_filter.create_group("Metadata") - meta_data_grp.attrs["nbView"] = len(self.views) - meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels)) - meta_data_grp.attrs["datasetLength"] = len(self.labels) - dataset_object = dataset.HDF5Dataset(hdf5_file=dataset_file_filter) - dataset_object.filter(np.array([0, 1, 0]), ["0", "1"], [1, 2, 3], - ["ViewN0"], tmp_path) - self.assertEqual(dataset_object.nb_view, 1) - np.testing.assert_array_equal(dataset_object.get_labels(), [0, 1, 0]) - dataset_object.dataset.close() - os.remove(os.path.join(tmp_path, "test_filter_temp_filter.hdf5")) - os.remove(os.path.join(tmp_path, "test_filter.hdf5")) - - def test_for_hdf5_file(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - - def test_from_scratch(self): - dataset_object = dataset.HDF5Dataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - file_name="from_scratch"+self.file_name, - view_names=self.view_names, - path=tmp_path, - labels_names=self.labels_names) - nb_class = dataset_object.get_nb_class() - self.assertEqual(nb_class, self.nb_class) - example_indices = dataset_object.init_example_indices() - self.assertEqual(example_indices, range(self.nb_examples)) - view = dataset_object.get_v(0) - np.testing.assert_array_equal(view, self.views[0]) - - def test_init_example_indices(self): - example_indices = dataset.HDF5Dataset( - hdf5_file=self.dataset_file).init_example_indices() - self.assertEqual(example_indices, range(self.nb_examples)) - example_indices = dataset.HDF5Dataset( - hdf5_file=self.dataset_file).init_example_indices([0, 1, 2]) - self.assertEqual(example_indices, [0,1,2]) - - def test_get_v(self): - view = dataset.HDF5Dataset(hdf5_file=self.dataset_file).get_v(0) - np.testing.assert_array_equal(view, self.views[0]) - view = dataset.HDF5Dataset(hdf5_file=self.dataset_file).get_v(1, [0,1,2]) - np.testing.assert_array_equal(view, self.views[1][[0,1,2,], :]) - - def test_get_nb_class(self): - nb_class = dataset.HDF5Dataset(hdf5_file=self.dataset_file).get_nb_class() - self.assertEqual(nb_class, self.nb_class) - nb_class = dataset.HDF5Dataset(hdf5_file=self.dataset_file).get_nb_class([0]) - self.assertEqual(nb_class, 1) - - - - def test_get_view_dict(self): - dataset_object = dataset.HDF5Dataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - file_name="from_scratch" + self.file_name, - view_names=self.view_names, - path=tmp_path, - labels_names=self.labels_names) - self.assertEqual(dataset_object.get_view_dict(), {"ViewN0":0, - "ViewN1": 1, - "ViewN2": 2,}) - - def test_get_label_names(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - raw_label_names = dataset_object.get_label_names(decode=False) - decoded_label_names = dataset_object.get_label_names() - restricted_label_names = dataset_object.get_label_names(example_indices=[3,4]) - self.assertEqual(raw_label_names, [b'0', b'1', b'2']) - self.assertEqual(decoded_label_names, ['0', '1', '2']) - self.assertEqual(restricted_label_names, ['2']) - - def test_get_nb_exmaples(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - nb_examples = dataset_object.get_nb_examples() - self.assertEqual(nb_examples, self.nb_examples) - - def test_get_labels(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - labels = dataset_object.get_labels() - 
np.testing.assert_array_equal(labels, self.labels) - labels = dataset_object.get_labels([1,2,0]) - np.testing.assert_array_equal(labels, self.labels[[1,2,0]]) - - def test_copy_view(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - new_dataset = h5py.File(os.path.join(tmp_path, "test_copy.hdf5"), "w") - dataset_object.copy_view(target_dataset=new_dataset, - source_view_name="ViewN0", - target_view_index=1) - self.assertIn("View1", list(new_dataset.keys())) - np.testing.assert_array_equal(dataset_object.get_v(0), new_dataset["View1"][()]) - self.assertEqual(new_dataset["View1"].attrs["name"], "ViewN0") - new_dataset.close() - os.remove(os.path.join(tmp_path, "test_copy.hdf5")) - - def test_get_name(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - self.assertEqual("test", dataset_object.get_name()) - - def test_select_labels(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - labels, label_names, indices = dataset_object.select_labels(["0", "2"]) - np.testing.assert_array_equal(np.unique(labels), np.array([0,1])) - self.assertEqual(label_names, ["0","2"]) - - def test_check_selected_label_names(self): - dataset_object = dataset.HDF5Dataset(hdf5_file=self.dataset_file) - names = dataset_object.check_selected_label_names(nb_labels=2, random_state=self.rs) - self.assertEqual(names, ["1", "0"]) - names = dataset_object.check_selected_label_names(selected_label_names=['0', '2'], - random_state=self.rs) - self.assertEqual(names, ["0", "2"]) - - def test_select_views_and_labels(self): - file_name = "test_filter.hdf5" - dataset_file_select = h5py.File(os.path.join(tmp_path, file_name), "w") - for view_index, (view_name, view, is_sparse) in enumerate( - zip(self.view_names, self.views, self.are_sparse)): - view_dataset = dataset_file_select.create_dataset( - "View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = is_sparse - labels_dataset = dataset_file_select.create_dataset("Labels", - shape=self.labels.shape, - data=self.labels) - labels_dataset.attrs["names"] = [label_name.encode() - for label_name in self.labels_names] - meta_data_grp = dataset_file_select.create_group("Metadata") - meta_data_grp.attrs["nbView"] = len(self.views) - meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels)) - meta_data_grp.attrs["datasetLength"] = len(self.labels) - dataset_object = dataset.HDF5Dataset(hdf5_file=dataset_file_select) - names = dataset_object.select_views_and_labels(nb_labels=2, view_names=["ViewN0"], random_state=self.rs, path_for_new=tmp_path) - self.assertEqual(names, {0: '2', 1: '1'}) - self.assertEqual(dataset_object.nb_view, 1) - dataset_object.dataset.close() - os.remove(os.path.join(tmp_path, "test_filter_temp_filter.hdf5")) - os.remove(os.path.join(tmp_path, "test_filter.hdf5")) - - def test_add_gaussian_noise(self): - file_name = "test_noise.hdf5" - dataset_file_select = h5py.File(os.path.join(tmp_path, file_name), "w") - limits = np.zeros((self.nb_attr, 2)) - limits[:, 1] += 100 - meta_data_grp = dataset_file_select.create_group("Metadata") - for view_index, (view_name, view, is_sparse) in enumerate( - zip(self.view_names, self.views, self.are_sparse)): - view_dataset = dataset_file_select.create_dataset( - "View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = is_sparse - meta_data_grp.create_dataset("View"+str(view_index)+"_limits", data= limits) - labels_dataset = 
dataset_file_select.create_dataset("Labels", - shape=self.labels.shape, - data=self.labels) - labels_dataset.attrs["names"] = [label_name.encode() - for label_name in self.labels_names] - meta_data_grp.attrs["nbView"] = len(self.views) - meta_data_grp.attrs["nbClass"] = len(np.unique(self.labels)) - meta_data_grp.attrs["datasetLength"] = len(self.labels) - dataset_object = dataset.HDF5Dataset(hdf5_file=dataset_file_select) - dataset_object.add_gaussian_noise(self.rs, tmp_path) - dataset_object.dataset.close() - os.remove(os.path.join(tmp_path, "test_noise_noised.hdf5")) - os.remove(os.path.join(tmp_path, "test_noise.hdf5")) - -class TestRAMDataset(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.rs = np.random.RandomState(42) - cls.nb_view = 3 - cls.file_name = "test.hdf5" - cls.nb_examples = 5 - cls.nb_attr = 7 - cls.nb_class = 3 - cls.views = [cls.rs.randint(0, 10, size=(cls.nb_examples, cls.nb_attr)) - for _ in range(cls.nb_view)] - cls.labels = cls.rs.randint(0, cls.nb_class, cls.nb_examples) - cls.view_names = ["ViewN" + str(index) for index in - range(len(cls.views))] - cls.are_sparse = [False for _ in cls.views] - cls.labels_names = [str(index) for index in np.unique(cls.labels)] - - def test_get_view_name(self): - dataset_object = dataset.RAMDataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - view_names=self.view_names, - labels_names=self.labels_names) - self.assertEqual(dataset_object.get_view_name(0), - "ViewN0") - - def test_init_attrs(self): - dataset_object = dataset.RAMDataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - view_names=self.view_names, - labels_names=self.labels_names) - - - dataset_object.init_attrs() - self.assertEqual(dataset_object.nb_view, 3) - - def test_get_label_names(self): - dataset_object = dataset.RAMDataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - view_names=self.view_names, - labels_names=self.labels_names) - shape = dataset_object.get_label_names() - self.assertEqual(shape, ['0'.encode('utf-8'), - '1'.encode('utf-8'), - '2'.encode('utf-8')]) - shape = dataset_object.get_label_names(decode=False) - self.assertEqual(shape, ['0'.encode('utf-8'), - '1'.encode('utf-8'), - '2'.encode('utf-8')]) - - def test_get_v(self): - dataset_object = dataset.RAMDataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - view_names=self.view_names, - labels_names=self.labels_names) - data = dataset_object.get_v(0, 1) - np.testing.assert_array_equal(data, np.array([6, 7, 4, 3, 7, 7, 2])) - data = dataset_object.get_v(0, None) - np.testing.assert_array_equal(data, np.array([[6, 3, 7, 4, 6, 9, 2], - [6, 7, 4, 3, 7, 7, 2], - [5, 4, 1, 7, 5, 1, 4], - [0, 9, 5, 8, 0, 9, 2], - [6, 3, 8, 2, 4, 2, 6]])) - - def test_filter(self): - dataset_object = dataset.RAMDataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - view_names=self.view_names, - labels_names=self.labels_names) - dataset_object.filter("", "", np.array([1,2]), ["ViewN0", "ViewN1"], - path=None) - self.assertEqual(dataset_object.nb_view, 2) - self.assertEqual(dataset_object.labels.shape, (2,1)) - - def test_get_view_dict(self): - dataset_object = dataset.RAMDataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - view_names=self.view_names, - labels_names=self.labels_names) - d = dataset_object.get_view_dict() - self.assertEqual(d, {'ViewN0': 0, 'ViewN1': 1, 'ViewN2': 2}) - - def test_get_name(self): - dataset_object = 
dataset.RAMDataset(views=self.views, - labels=self.labels, - are_sparse=self.are_sparse, - view_names=self.view_names, - labels_names=self.labels_names) - n = dataset_object.get_name() - self.assertEqual(n, None) - -class Test_Functions(unittest.TestCase): - @classmethod - def setUpClass(cls): - rm_tmp() - os.mkdir(tmp_path) - cls.rs = np.random.RandomState(42) - cls.nb_view = 3 - cls.file_name = "test0.hdf5" - cls.nb_examples = 5 - cls.nb_attr = 7 - cls.nb_class = 3 - cls.views = [cls.rs.randint(0, 10, size=(cls.nb_examples, cls.nb_attr)) - for _ in range(cls.nb_view)] - cls.labels = cls.rs.randint(0, cls.nb_class, cls.nb_examples) - cls.dataset_file = h5py.File(os.path.join(tmp_path, cls.file_name), "w") - cls.view_names = ["ViewN" + str(index) for index in - range(len(cls.views))] - cls.are_sparse = [False for _ in cls.views] - for view_index, (view_name, view, is_sparse) in enumerate( - zip(cls.view_names, cls.views, cls.are_sparse)): - view_dataset = cls.dataset_file.create_dataset( - "View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = is_sparse - labels_dataset = cls.dataset_file.create_dataset("Labels", - shape=cls.labels.shape, - data=cls.labels) - cls.labels_names = [str(index) for index in np.unique(cls.labels)] - labels_dataset.attrs["names"] = [label_name.encode() - for label_name in cls.labels_names] - meta_data_grp = cls.dataset_file.create_group("Metadata") - meta_data_grp.attrs["nbView"] = len(cls.views) - meta_data_grp.attrs["nbClass"] = len(np.unique(cls.labels)) - meta_data_grp.attrs["datasetLength"] = len(cls.labels) - - @classmethod - def tearDownClass(cls): - cls.dataset_file.close() - rm_tmp() - - def test_datasets_already_exist(self): - self.assertEqual(True, dataset.datasets_already_exist(tmp_path, "test", 1)) - - def test_init_multiple_datasets(self): - dataset.init_multiple_datasets(tmp_path, "test0", 2) - self.assertTrue(os.path.isfile(os.path.join(tmp_path,'test00.hdf5'))) - dataset.delete_HDF5([{"args":{"pathf":tmp_path, "name":"test0"}}], - 2, dataset.HDF5Dataset(hdf5_file=self.dataset_file)) - self.assertFalse(os.path.isfile(os.path.join(tmp_path,'test00.hdf5'))) - - - - - - - - - - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/multiview_platform/tests/test_utils/test_execution.py b/multiview_platform/tests/test_utils/test_execution.py deleted file mode 100644 index 1e97963e0bb42fbcb6a3a8cd9d74108aa5048ca6..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_utils/test_execution.py +++ /dev/null @@ -1,361 +0,0 @@ -import os -import unittest - -import numpy as np - -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset - -from multiview_platform.mono_multi_view_classifiers.utils import execution - - -class Test_parseTheArgs(unittest.TestCase): - - def setUp(self): - self.args = [] - - def test_empty_args(self): - args = execution.parse_the_args([]) - -class Test_init_log_file(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_simple(self): - res_dir = execution.init_log_file(name="test_dataset", - views=["V1", "V2", "V3"], - cl_type="", - log=True, - debug=False, - label="No", - result_directory=tmp_path, - args={}) - self.assertTrue(res_dir.startswith(os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))),"tmp_tests", "test_dataset", "started" ))) - - def test_no_log(self): - res_dir 
= execution.init_log_file(name="test_dataset", - views=["V1", "V2", "V3"], - cl_type="", - log=False, - debug=False, - label="No1", - result_directory=tmp_path, - args={}) - self.assertTrue(res_dir.startswith(os.path.join( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))), - "tmp_tests", "test_dataset", "started"))) - - def test_debug(self): - res_dir = execution.init_log_file(name="test_dataset", - views=["V1", "V2", "V3"], - cl_type="", - log=True, - debug=True, - label="No", - result_directory=tmp_path, - args={}) - self.assertTrue(res_dir.startswith(os.path.join( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))), - "tmp_tests", "test_dataset", "debug_started"))) - -class Test_gen_k_folds(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.statsIter = 1 - - @classmethod - def tearDownClass(cls): - pass - - def test_simple(self): - folds_list = execution.gen_k_folds(stats_iter=1, - nb_folds=4, - stats_iter_random_states=np.random.RandomState(42)) - self.assertEqual(folds_list[0].n_splits, 4) - self.assertEqual(len(folds_list), 1) - - def test_multple_iters(self): - folds_list = execution.gen_k_folds(stats_iter=2, - nb_folds=4, - stats_iter_random_states=[np.random.RandomState(42), np.random.RandomState(43)]) - self.assertEqual(folds_list[0].n_splits, 4) - self.assertEqual(len(folds_list), 2) - - def test_list_rs(self): - folds_list = execution.gen_k_folds(stats_iter=1, - nb_folds=4, - stats_iter_random_states=[np.random.RandomState(42)]) - self.assertEqual(folds_list[0].n_splits, 4) - self.assertEqual(len(folds_list), 1) - - -class Test_init_views(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.statsIter = 1 - - @classmethod - def tearDownClass(cls): - pass - - def test_simple(self): - views, views_indices, all_views = execution.init_views(test_dataset, ["ViewN1", "ViewN2"]) - self.assertEqual(views, ["ViewN1", "ViewN2"]) - self.assertEqual(views_indices, [1,2]) - self.assertEqual(all_views, ["ViewN0", "ViewN1", "ViewN2"]) - - views, views_indices, all_views = execution.init_views(test_dataset,None) - self.assertEqual(views, ["ViewN0", "ViewN1", "ViewN2"]) - self.assertEqual(views_indices, range(3)) - self.assertEqual(all_views, ["ViewN0", "ViewN1", "ViewN2"]) - - -class Test_find_dataset_names(unittest.TestCase): - - @classmethod - def setUpClass(cls): - os.mkdir(tmp_path) - with open(os.path.join(tmp_path, "test.txt"), "w") as file_stream: - file_stream.write("test") - with open(os.path.join(tmp_path, "test1.txt"), "w") as file_stream: - file_stream.write("test") - - - - @classmethod - def tearDownClass(cls): - rm_tmp() - - def test_simple(self): - path, names = execution.find_dataset_names(tmp_path, ".txt", ["test"]) - self.assertEqual(path, tmp_path) - self.assertEqual(names, ["test"]) - path, names = execution.find_dataset_names(tmp_path, ".txt", ["test", 'test1']) - self.assertEqual(path, tmp_path) - self.assertIn("test1", names) - path, names = execution.find_dataset_names("examples/data", ".hdf5", ["all"]) - self.assertIn("doc_summit", names) - self.assertRaises(ValueError, execution.find_dataset_names, tmp_path+"test", ".txt", - ["test"]) - self.assertRaises(ValueError, execution.find_dataset_names, tmp_path, ".txt", ["ah"]) - - -class Test_initStatsIterRandomStates(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.statsIter = 1 - - def test_one_statiter(cls): - 
cls.state = cls.random_state.get_state()[1] - statsIterRandomStates = execution.init_stats_iter_random_states( - - cls.statsIter, cls.random_state) - np.testing.assert_array_equal(statsIterRandomStates[0].get_state()[1], - cls.state) - - def test_multiple_iter(cls): - cls.statsIter = 3 - statsIterRandomStates = execution.init_stats_iter_random_states( - - cls.statsIter, cls.random_state) - cls.assertAlmostEqual(len(statsIterRandomStates), 3) - cls.assertNotEqual(statsIterRandomStates[0].randint(5000), - statsIterRandomStates[1].randint(5000)) - cls.assertNotEqual(statsIterRandomStates[0].randint(5000), - statsIterRandomStates[2].randint(5000)) - cls.assertNotEqual(statsIterRandomStates[2].randint(5000), - statsIterRandomStates[1].randint(5000)) - - -class Test_getDatabaseFunction(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.name = "zrtTap" - cls.type = ".csv" - - def test_simple(cls): - getDB = execution.get_database_function(cls.name, cls.type) - from multiview_platform.mono_multi_view_classifiers.utils.get_multiview_db import \ - get_classic_db_csv - cls.assertEqual(getDB, get_classic_db_csv) - - def test_hdf5(cls): - cls.type = ".hdf5" - getDB = execution.get_database_function(cls.name, cls.type) - from multiview_platform.mono_multi_view_classifiers.utils.get_multiview_db import \ - get_classic_db_hdf5 - cls.assertEqual(getDB, get_classic_db_hdf5) - - def test_plausible_hdf5(cls): - cls.name = "plausible" - cls.type = ".hdf5" - getDB = execution.get_database_function(cls.name, cls.type) - from multiview_platform.mono_multi_view_classifiers.utils.get_multiview_db import \ - get_plausible_db_hdf5 - cls.assertEqual(getDB, get_plausible_db_hdf5) - - -class Test_initRandomState(unittest.TestCase): - - def setUp(self): - rm_tmp() - os.mkdir(tmp_path) - - def tearDown(self): - os.rmdir(tmp_path) - - def test_random_state_42(self): - randomState_42 = np.random.RandomState(42) - randomState = execution.init_random_state("42", - tmp_path) - os.remove(tmp_path+"random_state.pickle") - np.testing.assert_array_equal(randomState.beta(1, 100, 100), - randomState_42.beta(1, 100, 100)) - - def test_random_state_pickle(self): - randomState_to_pickle = execution.init_random_state(None, - tmp_path) - pickled_randomState = execution.init_random_state( - tmp_path+"random_state.pickle", - tmp_path) - os.remove(tmp_path+"random_state.pickle") - - np.testing.assert_array_equal(randomState_to_pickle.beta(1, 100, 100), - pickled_randomState.beta(1, 100, 100)) - - -class FakeArg(): - - def __init__(self): - self.name = "zrtTap" - self.CL_type = ["fromage", "jambon"] - self.views = ["view1", "view2"] - self.log = True - - -# Impossible to test as the main directory is notthe same for the exec and the test -# class Test_initLogFile(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# cls.fakeArgs = FakeArg() -# cls.timestr = time.strftime("%Y_%m_%d-%H_%M") -# -# def test_initLogFile(cls): -# cls.timestr = time.strftime("%Y_%m_%d-%H_%M") -# execution.initLogFile(cls.fakeArgs) -# cls.assertIn("zrtTap", os.listdir("mutliview_platform/results"), "Database directory not created") -# cls.assertIn("started_"+cls.timestr, os.listdir("mutliview_platform/results/zrtTap"),"experimentation dir not created") -# cls.assertIn(cls.timestr + "-" + ''.join(cls.fakeArgs.CL_type) + "-" + "_".join( -# cls.fakeArgs.views) + "-" + cls.fakeArgs.name + "-LOG.log", os.listdir("mutliview_platform/results/zrtTap/"+"started_"+cls.timestr), "logfile was not created") -# -# @classmethod -# def tearDownClass(cls): 
-# shutil.rmtree("multiview_platform/results/zrtTap") -# pass - - -class Test_genSplits(unittest.TestCase): - - def setUp(self): - self.stastIter = 3 - self.statsIterRandomStates = [np.random.RandomState(42 + i + 1) for i in - range(self.stastIter)] - self.random_state = np.random.RandomState(42) - self.X_indices = self.random_state.randint(0, 500, 50) - self.labels = np.zeros(500) - self.labels[self.X_indices[:10]] = 1 - self.labels[self.X_indices[11:30]] = 2 # To test multiclass - self.splitRatio = 0.2 - - def test_simple(self): - splits = execution.gen_splits(self.labels, self.splitRatio, - self.statsIterRandomStates) - self.assertEqual(len(splits), 3) - self.assertEqual(len(splits[1]), 2) - self.assertEqual(type(splits[1][0]), np.ndarray) - self.assertAlmostEqual(len(splits[1][0]), 0.8 * 500) - self.assertAlmostEqual(len(splits[1][1]), 0.2 * 500) - self.assertGreater(len(np.where(self.labels[splits[1][0]] == 0)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[1][0]] == 1)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[1][0]] == 2)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[1][1]] == 0)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[1][1]] == 1)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[1][1]] == 2)[0]), 0) - - def test_genSplits_no_iter(self): - splits = execution.gen_splits(self.labels, self.splitRatio, - self.statsIterRandomStates) - self.assertEqual(len(splits), 3) - self.assertEqual(len(splits[0]), 2) - self.assertEqual(type(splits[0][0]), np.ndarray) - self.assertAlmostEqual(len(splits[0][0]), 0.8 * 500) - self.assertAlmostEqual(len(splits[0][1]), 0.2 * 500) - self.assertGreater(len(np.where(self.labels[splits[0][0]] == 0)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[0][0]] == 1)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[0][0]] == 2)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[0][1]] == 0)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[0][1]] == 1)[0]), 0) - self.assertGreater(len(np.where(self.labels[splits[0][1]] == 2)[0]), 0) - - -class Test_genKFolds(unittest.TestCase): - - def setUp(self): - self.statsIter = 2 - self.nbFolds = 5 - self.statsIterRandomStates = [np.random.RandomState(42), - np.random.RandomState(94)] - - def test_genKFolds_iter(self): - pass - - -class Test_genDirecortiesNames(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.directory = tmp_path - cls.stats_iter = 5 - - def test_simple_ovo(cls): - directories = execution.gen_direcorties_names(cls.directory, - cls.stats_iter) - cls.assertEqual(len(directories), 5) - cls.assertEqual(directories[0], os.path.join(tmp_path, "iter_1")) - cls.assertEqual(directories[-1], os.path.join(tmp_path, "iter_5")) - - def test_ovo_no_iter(cls): - cls.stats_iter = 1 - directories = execution.gen_direcorties_names(cls.directory, - cls.stats_iter) - cls.assertEqual(len(directories), 1) - cls.assertEqual(directories[0], tmp_path) - - -class Test_genArgumentDictionaries(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.labelsDictionary = {0: "yes", 1: "No", 2: "Maybe"} - cls.direcories = ["Res/iter_1", "Res/iter_2"] - cls.multiclassLabels = [np.array([0, 1, -100, 1, 0]), - np.array([1, 0, -100, 1, 0]), - np.array([0, 1, -100, 0, 1])] - cls.labelsCombinations = [[0, 1], [0, 2], [1, 2]] - cls.indicesMulticlass = [[[[], []], [[], []], [[], []]], [[], [], []]] - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git 
a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py deleted file mode 100644 index 41287784af397b9db1246c513d257bf8c8716407..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py +++ /dev/null @@ -1,217 +0,0 @@ -import os -import unittest - -import h5py -import numpy as np -from sklearn.model_selection import StratifiedKFold -from sklearn.metrics import accuracy_score, make_scorer -from multiview_platform.tests.utils import rm_tmp, tmp_path, test_dataset -from sklearn.base import BaseEstimator -import sys - - -from multiview_platform.mono_multi_view_classifiers.utils.dataset import HDF5Dataset -from multiview_platform.mono_multi_view_classifiers.utils import hyper_parameter_search -from multiview_platform.mono_multi_view_classifiers.multiview_classifiers import weighted_linear_early_fusion - - - - - -class FakeEstim(BaseEstimator): - def __init__(self, param1=None, param2=None, random_state=None): - self.param1 = param1 - self.param2 = param2 - - def fit(self, X, y,): - return self - - def accepts_multi_class(self, rs): - return True - - def predict(self, X): - return np.zeros(X.shape[0]) - -class FakeEstimMV(BaseEstimator): - def __init__(self, param1=None, param2=None): - self.param1 = param1 - self.param2 = param2 - - def fit(self, X, y,train_indices=None, view_indices=None): - self.y = y - return self - - def predict(self, X, example_indices=None, view_indices=None): - if self.param1=="return exact": - return self.y[example_indices] - else: - return np.zeros(example_indices.shape[0]) - - - -class Test_Random(unittest.TestCase): - - @classmethod - def setUpClass(cls): - n_splits=2 - cls.estimator = FakeEstim() - cls.param_distributions = {"param1":[10,100], "param2":[11, 101]} - cls.n_iter = 4 - cls.refit = True - cls.n_jobs = 1 - cls.scoring = make_scorer(accuracy_score, ) - cls.cv = StratifiedKFold(n_splits=n_splits, ) - cls.random_state = np.random.RandomState(42) - cls.learning_indices = np.array([0,1,2, 3, 4,]) - cls.view_indices = None - cls.framework = "monoview" - cls.equivalent_draws = False - cls.X = cls.random_state.randint(0,100, (10,11)) - cls.y = cls.random_state.randint(0,2, 10) - - def test_simple(self): - hyper_parameter_search.Random( - self.estimator, self.param_distributions, n_iter=self.n_iter, - refit=self.refit, n_jobs=self.n_jobs, scoring=self.scoring, cv=self.cv, - random_state=self.random_state, - learning_indices=self.learning_indices, view_indices=self.view_indices, - framework=self.framework, - equivalent_draws=self.equivalent_draws - ) - - def test_fit(self): - RSCV = hyper_parameter_search.Random( - self.estimator, self.param_distributions, n_iter=self.n_iter, - refit=self.refit, n_jobs=self.n_jobs, scoring=self.scoring, - cv=self.cv, - random_state=self.random_state, - learning_indices=self.learning_indices, - view_indices=self.view_indices, - framework=self.framework, - equivalent_draws=self.equivalent_draws - ) - RSCV.fit(self.X, self.y, ) - tested_param1 = np.ma.masked_array(data=[10,10,100,100], - mask=[False, False, False, False]) - np.testing.assert_array_equal(RSCV.cv_results_['param_param1'], - tested_param1) - - def test_fit_multiview(self): - RSCV = hyper_parameter_search.Random( - FakeEstimMV(), self.param_distributions, n_iter=self.n_iter, - refit=self.refit, n_jobs=self.n_jobs, scoring=self.scoring, - cv=self.cv, - random_state=self.random_state, - learning_indices=self.learning_indices, - 
view_indices=self.view_indices, - framework="multiview", - equivalent_draws=self.equivalent_draws - ) - RSCV.fit(test_dataset, self.y, ) - self.assertEqual(RSCV.n_iter, self.n_iter) - - def test_fit_multiview_equiv(self): - self.n_iter=1 - RSCV = hyper_parameter_search.Random( - FakeEstimMV(), self.param_distributions, n_iter=self.n_iter, - refit=self.refit, n_jobs=self.n_jobs, scoring=self.scoring, - cv=self.cv, - random_state=self.random_state, - learning_indices=self.learning_indices, - view_indices=self.view_indices, - framework="multiview", - equivalent_draws=True - ) - RSCV.fit(test_dataset, self.y, ) - self.assertEqual(RSCV.n_iter, self.n_iter*test_dataset.nb_view) - - def test_gets_good_params(self): - self.param_distributions["param1"].append('return exact') - self.n_iter=6 - RSCV = hyper_parameter_search.Random( - FakeEstimMV(), self.param_distributions, n_iter=self.n_iter, - refit=self.refit, n_jobs=self.n_jobs, scoring=self.scoring, - cv=self.cv, - random_state=self.random_state, - learning_indices=self.learning_indices, - view_indices=self.view_indices, - framework="multiview", - equivalent_draws=False - ) - RSCV.fit(test_dataset, self.y, ) - self.assertEqual(RSCV.best_params_["param1"], "return exact") - - -class Test_Grid(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.parameter_grid = {"param1":[5,6], "param2":[7,8]} - cls.estimator = FakeEstim() - - def test_simple(self): - grid = hyper_parameter_search.Grid(self.estimator, - param_grid=self.parameter_grid) - - def test_get_candidate_params(self): - grid = hyper_parameter_search.Grid(self.estimator, - param_grid=self.parameter_grid) - grid.get_candidate_params(None) - self.assertEqual(grid.candidate_params, [{"param1": 5, "param2": 7}, - {"param1": 5, "param2": 8}, - {"param1": 6, "param2": 7}, - {"param1": 6, "param2": 8}]) - - -# if __name__ == '__main__': -# # unittest.main() -# suite = unittest.TestLoader().loadTestsFromTestCase(Test_randomized_search) -# unittest.TextTestRunner(verbosity=2).run(suite) -# class Test_randomized_search(unittest.TestCase): -# -# @classmethod -# def setUpClass(cls): -# rm_tmp() -# cls.random_state = np.random.RandomState(42) -# cls.view_weights = [0.5, 0.5] -# os.mkdir(tmp_path) -# cls.dataset_file = h5py.File( -# tmp_path+"test_file.hdf5", "w") -# cls.labels = cls.dataset_file.create_dataset("Labels", -# data=np.array( -# [0, 1, 0, 0, 1, 0, 1, 0, 0, 1, ])) -# cls.view0_data = cls.random_state.randint(1, 10, size=(10, 4)) -# view0 = cls.dataset_file.create_dataset("View0", -# data=cls.view0_data) -# view0.attrs["sparse"] = False -# view0.attrs["name"] = "ViewN0" -# cls.view1_data = cls.random_state.randint(1, 10, size=(10, 4)) -# view1 = cls.dataset_file.create_dataset("View1", -# data=cls.view1_data) -# view1.attrs["sparse"] = False -# view1.attrs["name"] = "ViewN1" -# metaDataGrp = cls.dataset_file.create_group("Metadata") -# metaDataGrp.attrs["nbView"] = 2 -# metaDataGrp.attrs["nbClass"] = 2 -# metaDataGrp.attrs["datasetLength"] = 10 -# cls.monoview_classifier_name = "decision_tree" -# cls.monoview_classifier_config = {"max_depth": 1, -# "criterion": "gini", -# "splitter": "best"} -# cls.k_folds = StratifiedKFold(n_splits=3, random_state=cls.random_state, -# shuffle=True) -# cls.learning_indices = np.array([1,2,3,4, 5,6,7,8,9]) -# cls.dataset = HDF5Dataset(hdf5_file=cls.dataset_file) -# -# @classmethod -# def tearDownClass(cls): -# cls.dataset_file.close() -# rm_tmp() -# -# -# def test_simple(self): -# best_params, _, params, scores = 
hyper_parameter_search.randomized_search( -# self.dataset, self.labels[()], "multiview", self.random_state, tmp_path, -# weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds, -# 1, ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices) -# self.assertIsInstance(best_params, dict) diff --git a/multiview_platform/tests/test_utils/test_multiclass.py b/multiview_platform/tests/test_utils/test_multiclass.py deleted file mode 100644 index 178308ad4da87818e2ff388e3c84aa44ea06fd24..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/test_utils/test_multiclass.py +++ /dev/null @@ -1,164 +0,0 @@ -import unittest - -import numpy as np -from sklearn.base import BaseEstimator - -from multiview_platform.mono_multi_view_classifiers.utils.multiclass import get_mc_estim, \ -OVRWrapper, OVOWrapper, MultiviewOVOWrapper, MultiviewOVRWrapper - -class FakeMCEstim(BaseEstimator): - - def __init__(self): - self.short_name="short_name" - - def accepts_multi_class(self, random_state): - return False - -class FakeEstimNative(FakeMCEstim): - - def accepts_multi_class(self, random_state): - return True - - -class FakeNonProbaEstim(FakeMCEstim): - pass - - -class FakeProbaEstim(FakeMCEstim): - - def predict_proba(self): - pass - - -class Test_get_mc_estim(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.y = cls.random_state.randint(0, 3, 10) - - def test_biclass(self): - y = self.random_state.randint(0,2,10) - estimator="Test" - returned_estimator = get_mc_estim(estimator, self.random_state, y=y) - self.assertEqual(returned_estimator, estimator) - - def test_multiclass_native(self): - estimator = FakeEstimNative() - returned_estimator = get_mc_estim(estimator, self.random_state, y=self.y) - self.assertIsInstance(returned_estimator, FakeEstimNative) - - def test_multiclass_ovo(self): - estimator = FakeNonProbaEstim() - returned_estimator = get_mc_estim(estimator, self.random_state, y=self.y) - self.assertIsInstance(returned_estimator, OVOWrapper) - - def test_multiclass_ovr(self): - estimator = FakeProbaEstim() - returned_estimator = get_mc_estim(estimator, self.random_state, y=self.y) - self.assertIsInstance(returned_estimator, OVRWrapper) - - def test_multiclass_ovo_multiview(self): - estimator = FakeNonProbaEstim() - returned_estimator = get_mc_estim(estimator, self.random_state, - multiview=True, y=self.y, ) - self.assertIsInstance(returned_estimator, MultiviewOVOWrapper) - - def test_multiclass_ovr_multiview(self): - estimator = FakeProbaEstim() - returned_estimator = get_mc_estim(estimator, self.random_state, - multiview=True, y=self.y,) - self.assertIsInstance(returned_estimator, MultiviewOVRWrapper) - -class FakeMVClassifier(BaseEstimator): - - def __init__(self, short_name="None"): - self.short_name = short_name - - def fit(self, X, y, train_indices=None, view_indices=None): - self.n_classes = np.unique(y[train_indices]).shape[0] - self.views_indices = view_indices - - def predict(self, X, example_indices=None, view_indices=None): - self.example_indices = example_indices - self.views_indices = view_indices - return np.zeros((example_indices.shape[0])) - -class FakeMVClassifierProb(FakeMVClassifier): - - def predict_proba(self, X, example_indices=None, view_indices=None): - self.example_indices = example_indices - self.views_indices = view_indices - return np.zeros((example_indices.shape[0], 2)) - -class Test_MultiviewOVRWrapper_fit(unittest.TestCase): - - @classmethod - def setUpClass(cls): - 
cls.random_state = np.random.RandomState(42) - cls.X = "dataset" - cls.n_classes=3 - cls.y = cls.random_state.randint(0,cls.n_classes,50) - cls.train_indices = np.arange(25) - cls.example_indices = np.arange(25)+25 - cls.view_indices="None" - cls.wrapper = MultiviewOVRWrapper(FakeMVClassifierProb(), ) - - def test_fit(self): - fitted = self.wrapper.fit(self.X, self.y, train_indices=self.train_indices, - view_indices=self.view_indices) - for estimator in fitted.estimators_: - self.assertEqual(estimator.n_classes,2) - self.assertEqual(estimator.views_indices, "None") - - def test_predict(self): - fitted = self.wrapper.fit(self.X, self.y, train_indices=self.train_indices, - view_indices=self.view_indices) - pred = fitted.predict(self.X, example_indices=self.example_indices, - view_indices=self.view_indices) - for estimator in fitted.estimators_: - np.testing.assert_array_equal(estimator.example_indices, - self.example_indices) - - -class FakeDset: - - def __init__(self, n_examples): - self.n_examples = n_examples - - def get_nb_examples(self): - return self.n_examples - -class Test_MultiviewOVOWrapper_fit(unittest.TestCase): - - @classmethod - def setUpClass(cls): - cls.random_state = np.random.RandomState(42) - cls.n_examples=50 - cls.X = FakeDset(n_examples=cls.n_examples) - cls.n_classes=3 - cls.y = cls.random_state.randint(0,cls.n_classes,cls.n_examples) - cls.train_indices = np.arange(int(cls.n_examples/2)) - cls.example_indices = np.arange(int(cls.n_examples/2))+int(cls.n_examples/2) - cls.view_indices="None" - cls.wrapper = MultiviewOVOWrapper(FakeMVClassifier(), ) - - def test_fit(self): - fitted = self.wrapper.fit(self.X, self.y, train_indices=self.train_indices, - view_indices=self.view_indices) - for estimator in fitted.estimators_: - self.assertEqual(estimator.n_classes,2) - self.assertEqual(estimator.views_indices, "None") - - def test_predict(self): - fitted = self.wrapper.fit(self.X, self.y, train_indices=self.train_indices, - view_indices=self.view_indices) - pred = fitted.predict(self.X, example_indices=self.example_indices, - view_indices=self.view_indices) - for estimator in fitted.estimators_: - np.testing.assert_array_equal(estimator.example_indices, - self.example_indices) - - -if __name__ == '__main__': - unittest.main() \ No newline at end of file diff --git a/multiview_platform/tests/utils.py b/multiview_platform/tests/utils.py deleted file mode 100644 index 9a3f04cb0aecb9ba34e7f5318d7f7bab4c81478d..0000000000000000000000000000000000000000 --- a/multiview_platform/tests/utils.py +++ /dev/null @@ -1,53 +0,0 @@ -import os -import numpy as np -import h5py - -from ..mono_multi_view_classifiers.utils.dataset import HDF5Dataset - - -tmp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "tmp_tests/") -# TODO Convert to ram dataset -test_dataset = HDF5Dataset(hdf5_file=h5py.File(os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_database.hdf5"), "r")) - -def rm_tmp(path=tmp_path): - try: - for file_name in os.listdir(path): - if os.path.isdir(os.path.join(path, file_name)): - rm_tmp(os.path.join(path, file_name)) - else: - os.remove(os.path.join(path, file_name)) - os.rmdir(path) - except: - pass - - -def gen_test_dataset(random_state=np.random.RandomState(42)): - dataset_file = h5py.File("test_database.hdf5", "w") - view_names = ["ViewN0", "ViewN1", "ViewN2"] - views = [random_state.randint(0,100,(5,6)) - for _ in range(len(view_names))] - labels = random_state.randint(0,2, 5) - label_names = ["yes", "no"] - for view_index, (view_name, view) in 
enumerate( - zip(view_names, views)): - view_dataset = dataset_file.create_dataset("View" + str(view_index), - view.shape, - data=view) - view_dataset.attrs["name"] = view_name - view_dataset.attrs["sparse"] = False - labels_dataset = dataset_file.create_dataset("Labels", - shape=labels.shape, - data=labels) - labels_dataset.attrs["names"] = [label_name.encode() - if not isinstance(label_name, bytes) - else label_name - for label_name in label_names] - meta_data_grp = dataset_file.create_group("Metadata") - meta_data_grp.attrs["nbView"] = len(views) - meta_data_grp.attrs["nbClass"] = len(np.unique(labels)) - meta_data_grp.attrs["datasetLength"] = len(labels) - dataset_file.close() - - -if __name__ == "__main__": - gen_test_dataset() diff --git a/setup.cfg b/setup.cfg index 43c4b4ed043e81147260b3d5e1bdec505d7f19c9..5241fde359a3844ec560340808b7dbf99b16b170 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,14 +1,14 @@ [tool:pytest] -testpaths = multiview_platform +testpaths = summit addopts = --cov-report=html --verbose - --cov=multiview_platform + --cov=summit --cov-report=term-missing ; --cov-config setup.cfg --cache-clear [coverage:run] -source = multiview_platform +source = summit include = */mono_multi_view_classifiers/* omit = */tests/* */examples/* diff --git a/setup.py b/setup.py index 8c9bfe2f38516d714047e561c22eae08e0e2be5c..d477504eee57f5391ef6b3a7f34d07421b4d70a9 100644 --- a/setup.py +++ b/setup.py @@ -85,7 +85,7 @@ def setup_package(): # La syntaxe est "nom-de-commande-a-creer = package.module:fonction". entry_points={ 'console_scripts': [ - 'exec_multiview = multiview_platform.execute:exec', + 'exec_multiview = summit.execute:exec', ], }, @@ -96,7 +96,7 @@ def setup_package(): # Il y a encore une chiée de paramètres possibles, mais avec ça vous # couvrez 90% des besoins # ext_modules=cythonize( - # "multiview_platform/mono_multi_view_classifiers/monoview/additions/_custom_criterion.pyx"), + # "summit/mono_multi_view_classifiers/monoview/additions/_custom_criterion.pyx"), ) if __name__ == "__main__":
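The final setup.py hunk repoints the `exec_multiview` console script at `summit.execute:exec`, and the setup.cfg hunk makes pytest and coverage collect from `summit` instead of `multiview_platform`. A minimal, hypothetical smoke check such as the sketch below (not part of this patch; it assumes the renamed package is installed in the current environment, e.g. with `pip install -e .`) can confirm that the declared entry-point target still resolves after the rename:

```python
# Hypothetical post-rename smoke check (not part of this patch).
# It only relies on the entry point declared in setup.py: 'summit.execute:exec'.
import importlib

entry_module, entry_attr = "summit.execute", "exec"

# Import the module named in the console-script declaration and fetch its target.
module = importlib.import_module(entry_module)
target = getattr(module, entry_attr)

assert callable(target), f"{entry_module}:{entry_attr} is not callable"
print(f"{entry_module}:{entry_attr} resolves correctly")
```

Run right after installation, a check like this would catch a stale `multiview_platform` reference in the entry-point declaration before the packaged command is ever invoked.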