From 00cecb450065e41a52db27ddc8c80fd488a2b5b0 Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Thu, 27 Feb 2020 16:11:52 +0100 Subject: [PATCH] Bug correction in randomized search multiview, secure_file_path added --- config_files/config_test.yml | 8 ++-- .../exec_classif.py | 18 ++------ .../monoview/exec_classif_mono_view.py | 12 ++---- .../multiview/exec_multiview.py | 32 ++++++--------- .../multiview/multiview_utils.py | 8 ++-- .../result_analysis.py | 9 +--- .../mono_multi_view_classifiers/utils/base.py | 2 + .../utils/dataset.py | 12 +----- .../utils/get_multiview_db.py | 13 ++---- .../utils/hyper_parameter_search.py | 41 ++++++++++++++----- .../utils/organization.py | 11 +++++ .../test_utils/test_hyper_parameter_search.py | 30 +++++++++++--- 12 files changed, 104 insertions(+), 92 deletions(-) create mode 100644 multiview_platform/mono_multi_view_classifiers/utils/organization.py diff --git a/config_files/config_test.yml b/config_files/config_test.yml index fcd7ddf7..4eb88667 100644 --- a/config_files/config_test.yml +++ b/config_files/config_test.yml @@ -1,10 +1,10 @@ # The base configuration of the benchmark -log: False -name: ["plausible",] +log: True +name: ["digits",] label: "_" file_type: ".hdf5" views: -pathf: "../data/" +pathf: "/home/baptiste/Documents/Datasets/Digits/" nice: 0 random_state: 42 nb_cores: 1 @@ -28,7 +28,7 @@ stats_iter: 2 metrics: ["accuracy_score", "f1_score"] metric_princ: "accuracy_score" hps_type: "randomized_search-equiv" -hps_iter: 1 +hps_iter: 10 ###################################### diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py index 88bcf279..39e8a09a 100644 --- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py +++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py @@ -1,4 +1,3 @@ -import errno import itertools import logging import os @@ -17,6 +16,7 @@ from .monoview.exec_classif_mono_view import exec_monoview from .multiview.exec_multiview import exec_multiview from .result_analysis import get_results, plot_results_noise, analyze_iterations from .utils import execution, dataset, configuration +from .utils.organization import secure_file_path from .utils.dataset import delete_HDF5 matplotlib.use( @@ -513,14 +513,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary, """ logging.debug("Start:\t Benchmark initialization") - if not os.path.exists( - os.path.dirname(os.path.join(directory, "train_labels.csv"))): - try: - os.makedirs( - os.path.dirname(os.path.join(directory, "train_labels.csv"))) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise + secure_file_path(os.path.join(directory, "train_labels.csv")) train_indices = classification_indices[0] train_labels = dataset_var.get_labels(example_indices=train_indices) np.savetxt(os.path.join(directory, "train_labels.csv"), train_labels, @@ -534,12 +527,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary, for fold_index, (train_cv_indices, test_cv_indices) in enumerate(folds): file_name = os.path.join(directory, "folds", "test_labels_fold_" + str( fold_index) + ".csv") - if not os.path.exists(os.path.dirname(file_name)): - try: - os.makedirs(os.path.dirname(file_name)) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise + secure_file_path(file_name) np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]], delimiter=",") labels_names = 
list(labels_dictionary.values()) diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py index f673f063..ef0bf719 100644 --- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py +++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py @@ -2,7 +2,6 @@ """ Execution: Script to perform a MonoView classification """ -import errno import logging # To create Log-Files # Import built-in modules import os # to geth path of the running script @@ -18,6 +17,7 @@ from .. import monoview_classifiers from ..utils import hyper_parameter_search from ..utils.dataset import extract_subset, HDF5Dataset from ..utils.multiclass import get_mc_estim +from ..utils.organization import secure_file_path # Author-Info __author__ = "Nikolas Huelsmann, Baptiste BAUVIN" @@ -175,12 +175,7 @@ def init_constants(args, X, classification_indices, labels_names, output_file_name = os.path.join(directory, cl_type_string, view_name, cl_type_string + '-' + name + "-" + view_name + "-") - if not os.path.exists(os.path.dirname(output_file_name)): - try: - os.makedirs(os.path.dirname(output_file_name)) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise + secure_file_path(output_file_name) return kwargs, t_start, view_name, cl_type, X, learning_rate, labels_string, output_file_name @@ -203,7 +198,7 @@ def get_hyper_params(classifier_module, hyper_param_search, nIter, classifier_mo nIter) + " iterations for " + classifier_module_name) classifier_hp_search = getattr(hyper_parameter_search, hyper_param_search.split("-")[0]) - cl_kwargs, test_folds_preds = classifier_hp_search(X_train, y_train, + cl_kwargs, test_folds_preds, scores, params = classifier_hp_search(X_train, y_train, "monoview", random_state, output_file_name, @@ -216,6 +211,7 @@ def get_hyper_params(classifier_module, hyper_param_search, nIter, classifier_mo classifier_kwargs= kwargs[ classifier_module_name]) + hyper_parameter_search.gen_report(params, scores, output_file_name) logging.debug("Done:\t " + hyper_param_search + " best settings") else: cl_kwargs = kwargs[classifier_module_name] diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py index 59fc7c78..4fc5ca7f 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py @@ -1,4 +1,3 @@ -import errno import logging import os import os.path @@ -11,6 +10,7 @@ from .multiview_utils import MultiviewResult, MultiviewResultAnalyzer from .. 
import multiview_classifiers from ..utils import hyper_parameter_search from ..utils.multiclass import get_mc_estim +from ..utils.organization import secure_file_path # Author-Info __author__ = "Baptiste Bauvin" @@ -19,7 +19,7 @@ __status__ = "Prototype" # Production, Development, Prototype def init_constants(kwargs, classification_indices, metrics, name, nb_cores, k_folds, - dataset_var): + dataset_var, directory): """ Used to init the constants Parameters @@ -63,11 +63,13 @@ def init_constants(kwargs, classification_indices, metrics, logging.info("Info:\t Shape of " + str(view_name) + " :" + str( dataset_var.get_shape())) labels = dataset_var.get_labels() + output_file_name = os.path.join(directory, classifier_name, + classifier_name+"-"+dataset_var.get_name()+"-") return classifier_name, t_start, views_indices, \ - classifier_config, views, learning_rate, labels + classifier_config, views, learning_rate, labels, output_file_name -def save_results(classifier, string_analysis, directory, name, images_analysis): +def save_results(string_analysis, images_analysis, output_file_name): """ Save results in derectory @@ -96,16 +98,7 @@ def save_results(classifier, string_analysis, directory, name, images_analysis): """ logging.info(string_analysis) - views_string = "mv" - cl_type_string = classifier.short_name - output_file_name = os.path.join(directory, cl_type_string, - cl_type_string + "-" + views_string + '-' + name) - if not os.path.exists(os.path.dirname(output_file_name)): - try: - os.makedirs(os.path.dirname(output_file_name)) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise + secure_file_path(output_file_name) output_text_file = open(output_file_name + 'summary.txt', 'w') output_text_file.write(string_analysis) output_text_file.close() @@ -244,8 +237,9 @@ def exec_multiview(directory, dataset_var, name, classification_indices, classifier_config, \ views, \ learning_rate, \ - labels = init_constants(kwargs, classification_indices, metrics, name, - nb_cores, k_folds, dataset_var) + labels, \ + output_file_name = init_constants(kwargs, classification_indices, metrics, name, + nb_cores, k_folds, dataset_var, directory) logging.debug("Done:\t Initialize constants") extraction_time = time.time() - t_start @@ -269,7 +263,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, dataset_var, dataset_var.get_labels(), classifier_module, classifier_name, metrics[0], learning_indices, k_folds, random_state, - directory, nb_cores=nb_cores, views_indices=views_indices, + output_file_name, nb_cores=nb_cores, views_indices=views_indices, searching_tool=hyper_param_search, n_iter=n_iter, classifier_config=classifier_config) classifier = get_mc_estim( @@ -314,7 +308,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, class_label_names=list(labels_dictionary.values()), train_pred=train_pred, test_pred=test_pred, - directory=directory, + output_file_name=output_file_name, labels=labels, database_name=dataset_var.get_name(), nb_cores=nb_cores, @@ -323,7 +317,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, logging.info("Done:\t Result Analysis for " + cl_type) logging.debug("Start:\t Saving preds") - save_results(classifier, string_analysis, directory, name, images_analysis) + save_results(string_analysis, images_analysis, output_file_name) logging.debug("Start:\t Saving preds") return MultiviewResult(cl_type, classifier_config, metrics_scores, diff --git 
a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py index 644a9f9e..3d78d991 100644 --- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py +++ b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py @@ -168,12 +168,12 @@ class MultiviewResultAnalyzer(ResultAnalyser): def __init__(self, view_names, classifier, classification_indices, k_folds, hps_method, metrics_list, n_iter, class_label_names, - train_pred, test_pred, directory, labels, database_name, + train_pred, test_pred, output_file_name, labels, database_name, nb_cores, duration): ResultAnalyser.__init__(self, classifier, classification_indices, k_folds, - hps_method, metrics_list, n_iter, class_label_names, - train_pred, test_pred, directory, labels, database_name, - nb_cores, duration) + hps_method, metrics_list, n_iter, class_label_names, + train_pred, test_pred, output_file_name, labels, database_name, + nb_cores, duration) self.classifier_name = classifier.short_name self.view_names = view_names diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis.py index a5a9a205..a473ffc1 100644 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis.py +++ b/multiview_platform/mono_multi_view_classifiers/result_analysis.py @@ -1,5 +1,4 @@ # Import built-in modules -import errno import logging import os @@ -13,6 +12,7 @@ from matplotlib.patches import Patch # Import own Modules from .monoview.monoview_utils import MonoviewResult +from .utils.organization import secure_file_path # Author-Info __author__ = "Baptiste Bauvin" @@ -961,12 +961,7 @@ def publish_all_metrics_scores(iter_results, directory, data_base_name, stats_iter, min_size=10): results = [] - if not os.path.exists(os.path.dirname(os.path.join(directory, "a"))): - try: - os.makedirs(os.path.dirname(os.path.join(directory, "a"))) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise + secure_file_path(os.path.join(directory, "a")) for metric_name, scores in iter_results.items(): train = np.array(scores["mean"].loc["train"]) diff --git a/multiview_platform/mono_multi_view_classifiers/utils/base.py b/multiview_platform/mono_multi_view_classifiers/utils/base.py index 82e5cc23..0f786270 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/base.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/base.py @@ -64,6 +64,8 @@ class BaseClassifier(BaseEstimator, ): return self.__class__.__name__ + "with no config." def get_base_estimator(self, base_estimator, estimator_config): + if estimator_config is None: + estimator_config = {} if base_estimator is None: return DecisionTreeClassifier(**estimator_config) if isinstance(base_estimator, str): diff --git a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py index f107b2ea..f98a325d 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py @@ -1,4 +1,3 @@ -import errno import logging import os import select @@ -9,8 +8,7 @@ import h5py import numpy as np from scipy import sparse - -# from . 
import get_multiview_db as DB +from .organization import secure_file_path class Dataset(): @@ -305,13 +303,7 @@ class HDF5Dataset(Dataset): self.dataset = hdf5_file self.init_attrs() else: - if not os.path.exists( - os.path.dirname(os.path.join(path, file_name))): - try: - os.makedirs(os.path.dirname(os.path.join(path, file_name))) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise + secure_file_path(os.path.join(path, file_name)) dataset_file = h5py.File(os.path.join(path, file_name), "w") if view_names is None: view_names = ["View" + str(index) for index in diff --git a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py index 4b061122..b3d2a24c 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py @@ -1,10 +1,10 @@ -import errno import os import h5py import numpy as np -from ..utils.dataset import RAMDataset, HDF5Dataset +from .dataset import RAMDataset, HDF5Dataset +from .organization import secure_file_path # Author-Info __author__ = "Baptiste Bauvin" @@ -33,14 +33,7 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, noise_std=0.15, nb_view=3, nb_examples=100, nb_features=10): """Used to generate a plausible dataset to test the algorithms""" - - if not os.path.exists( - os.path.dirname(os.path.join(path, "plausible.hdf5"))): - try: - os.makedirs(os.path.dirname(os.path.join(path, "plausible.hdf5"))) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise + secure_file_path(os.path.join(path, "plausible.hdf5")) example_ids = ["exmaple_id_" + str(i) for i in range(nb_examples)] views = [] view_names = [] diff --git a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py index a483b339..654870bb 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py @@ -8,6 +8,7 @@ from scipy.stats import randint, uniform from sklearn.model_selection import RandomizedSearchCV from .multiclass import get_mc_estim +from .organization import secure_file_path from .. import metrics @@ -26,12 +27,13 @@ def search_best_settings(dataset_var, labels, classifier_module, if searching_tool is not "None": searching_tool_method = getattr(thismodule, searching_tool.split("-")[0]) - best_settings, test_folds_preds = searching_tool_method( + best_settings, test_folds_preds, scores, params = searching_tool_method( dataset_var, labels, "multiview", random_state, output_file_name, classifier_module, classifier_name, i_k_folds, nb_cores, metrics, n_iter, classifier_config, learning_indices=learning_indices, view_indices=views_indices, equivalent_draws=searching_tool.endswith("equiv")) + gen_report(params, scores, directory, ) else: best_settings = classifier_config return best_settings # or well set clasifier ? 
@@ -159,17 +161,21 @@ def randomized_search(X, y, framework, random_state, output_file_name, if "random_state" in best_params: best_params.pop("random_state") - scoresArray = random_search.cv_results_['mean_test_score'] - params = [(key[6:], value) for key, value in - random_search.cv_results_.items() if key.startswith("param_")] + scores_array = random_search.cv_results_['mean_test_score'] + sorted_indices = np.argsort(-scores_array) + params = [random_search.cv_results_["params"][score_index] + for score_index in sorted_indices] + scores_array = scores_array[sorted_indices] # gen_heat_maps(params, scores_array, output_file_name) best_estimator = random_search.best_estimator_ else: best_estimator = estimator best_params = {} - testFoldsPreds = get_test_folds_preds(X, y, folds, best_estimator, + scores_array = {} + params = {} + test_folds_preds = get_test_folds_preds(X, y, folds, best_estimator, framework, learning_indices) - return best_params, testFoldsPreds + return best_params, test_folds_preds, scores_array, params from sklearn.base import clone @@ -222,6 +228,7 @@ class MultiviewCompatibleRandomizedSearchCV(RandomizedSearchCV): self.cv_results_ = dict(("param_" + param_name, []) for param_name in candidate_params[0].keys()) self.cv_results_["mean_test_score"] = [] + self.cv_results_["params"]=[] n_failed = 0 tracebacks = [] for candidate_param_idx, candidate_param in enumerate(candidate_params): @@ -243,13 +250,12 @@ class MultiviewCompatibleRandomizedSearchCV(RandomizedSearchCV): test_prediction, **self.scoring._kwargs) test_scores[fold_idx] = test_score - for param_name, param in candidate_param.items(): - self.cv_results_["param_" + param_name].append(param) + self.cv_results_['params'].append(current_estimator.get_params()) cross_validation_score = np.mean(test_scores) self.cv_results_["mean_test_score"].append( cross_validation_score) results[candidate_param_idx] = cross_validation_score - if cross_validation_score <= min(results.values()): + if cross_validation_score >= min(results.values()): self.best_params_ = candidate_params[candidate_param_idx] self.best_score_ = cross_validation_score except: @@ -262,7 +268,10 @@ class MultiviewCompatibleRandomizedSearchCV(RandomizedSearchCV): raise ValueError( 'No fits were performed. All HP combination returned errors \n\n' + '\n'.join( tracebacks)) - + self.cv_results_["mean_test_score"] = np.array(self.cv_results_["mean_test_score"]) + # for key, value in self.cv_results_.items(): + # if key.startswith("param_"): + # self.cv_results_[key] = np.ma.array(data=value, mask=[False for _ in value]) if self.refit: self.best_estimator_ = clone(base_estimator).set_params( **self.best_params_) @@ -417,6 +426,18 @@ def gen_heat_maps(params, scores_array, output_file_name): transparent=True) plt.close() + +def gen_report(params, scores_array, output_file_name): + output_string = "" + for parameters, score in zip(params, scores_array): + if "random_state" in parameters: + parameters.pop("random_state") + output_string+="\n{}\t\t{}".format(parameters, score) + secure_file_path(output_file_name + "hps_report.txt") + with open(output_file_name+"hps_report.txt", "w") as output_file: + output_file.write(output_string) + + # nohup python ~/dev/git/spearmint/spearmint/main.py . 
& # import json diff --git a/multiview_platform/mono_multi_view_classifiers/utils/organization.py b/multiview_platform/mono_multi_view_classifiers/utils/organization.py new file mode 100644 index 00000000..663536eb --- /dev/null +++ b/multiview_platform/mono_multi_view_classifiers/utils/organization.py @@ -0,0 +1,11 @@ +import os +import errno + + +def secure_file_path(file_name): + if not os.path.exists(os.path.dirname(file_name)): + try: + os.makedirs(os.path.dirname(file_name)) + except OSError as exc: + if exc.errno != errno.EEXIST: + raise diff --git a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py index 5ee7c0bb..edbfb93f 100644 --- a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py +++ b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py @@ -55,7 +55,7 @@ class Test_randomized_search(unittest.TestCase): def test_simple(self): - best_params, _ = hyper_parameter_search.randomized_search( + best_params, _, params, scores = hyper_parameter_search.randomized_search( self.dataset, self.labels[()], "multiview", self.random_state, tmp_path, weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds, 1, ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices) @@ -80,10 +80,14 @@ class FakeEstimMV(BaseEstimator): self.param2 = param2 def fit(self, X, y,train_indices=None, view_indices=None): + self.y = y return self def predict(self, X, example_indices=None, view_indices=None): - return np.zeros(example_indices.shape[0]) + if self.param1=="return exact": + return self.y[example_indices] + else: + return np.zeros(example_indices.shape[0]) from sklearn.metrics import accuracy_score, make_scorer from sklearn.model_selection import StratifiedKFold @@ -101,12 +105,12 @@ class Test_MultiviewCompatibleRandomizedSearchCV(unittest.TestCase): cls.scoring = make_scorer(accuracy_score, ) cls.cv = StratifiedKFold(n_splits=n_splits, ) cls.random_state = np.random.RandomState(42) - cls.learning_indices = np.array([0,1,2]) + cls.learning_indices = np.array([0,1,2, 3, 4,]) cls.view_indices = None cls.framework = "monoview" cls.equivalent_draws = False - cls.X = cls.random_state.randint(0,100, (5,11)) - cls.y = cls.random_state.randint(0,1, 5) + cls.X = cls.random_state.randint(0,100, (10,11)) + cls.y = cls.random_state.randint(0,2, 10) def test_simple(self): hyper_parameter_search.MultiviewCompatibleRandomizedSearchCV( @@ -164,6 +168,22 @@ class Test_MultiviewCompatibleRandomizedSearchCV(unittest.TestCase): RSCV.fit(test_dataset, self.y, ) self.assertEqual(RSCV.n_iter, self.n_iter*test_dataset.nb_view) + def test_gets_good_params(self): + self.param_distributions["param1"].append('return exact') + self.n_iter=6 + RSCV = hyper_parameter_search.MultiviewCompatibleRandomizedSearchCV( + FakeEstimMV(), self.param_distributions, n_iter=self.n_iter, + refit=self.refit, n_jobs=self.n_jobs, scoring=self.scoring, + cv=self.cv, + random_state=self.random_state, + learning_indices=self.learning_indices, + view_indices=self.view_indices, + framework="multiview", + equivalent_draws=False + ) + RSCV.fit(test_dataset, self.y, ) + self.assertEqual(RSCV.best_params_["param1"], "return exact") + # if __name__ == '__main__': # # unittest.main() -- GitLab
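Usage note: the two helpers this patch introduces can be exercised on their own. Below is a minimal sketch, assuming the platform is importable as the multiview_platform package; the result prefix and the toy parameter/score values are illustrative only, not taken from the patch.

    import os

    import numpy as np

    from multiview_platform.mono_multi_view_classifiers.utils.organization import \
        secure_file_path
    from multiview_platform.mono_multi_view_classifiers.utils import \
        hyper_parameter_search

    # Hypothetical result prefix, mirroring the "<classifier>-<dataset>-"
    # prefixes built in init_constants().
    output_prefix = os.path.join("results", "digits", "decision_tree-digits-")

    # secure_file_path() creates the parent directory of the target file if it
    # does not exist yet, so the file can be written without makedirs/EEXIST
    # boilerplate at every call site.
    secure_file_path(output_prefix + "train_labels.csv")
    np.savetxt(output_prefix + "train_labels.csv", np.array([0, 1, 1, 0]),
               delimiter=",")

    # gen_report() takes the parameter dicts and their mean test scores already
    # sorted by decreasing score, as randomized_search() now returns them, and
    # writes one "<params>\t\t<score>" line per draw to <prefix>hps_report.txt.
    params = [{"max_depth": 3}, {"max_depth": 1}]
    scores = [0.92, 0.85]
    hyper_parameter_search.gen_report(params, scores, output_prefix)

On Python >= 3.2 the body of secure_file_path is equivalent to
os.makedirs(os.path.dirname(file_name), exist_ok=True); keeping it as a single
shared helper is what lets the patch drop the repeated try/except blocks from
exec_classif, exec_multiview, result_analysis, dataset and get_multiview_db.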