diff --git a/config_files/config_test.yml b/config_files/config_test.yml
index fcd7ddf74d9e8c5867ae1a5169d525b23e26a5f2..4eb8866752b17d41258ae6135ee4f4334d8fbea5 100644
--- a/config_files/config_test.yml
+++ b/config_files/config_test.yml
@@ -1,10 +1,10 @@
 # The base configuration of the benchmark
-log: False
-name: ["plausible",]
+log: True
+name: ["digits",]
 label: "_"
 file_type: ".hdf5"
 views:
-pathf: "../data/"
+pathf: "/home/baptiste/Documents/Datasets/Digits/"
 nice: 0
 random_state: 42
 nb_cores: 1
@@ -28,7 +28,7 @@ stats_iter: 2
 metrics: ["accuracy_score", "f1_score"]
 metric_princ: "accuracy_score"
 hps_type: "randomized_search-equiv"
-hps_iter: 1
+hps_iter: 10


 ######################################
diff --git a/multiview_platform/mono_multi_view_classifiers/exec_classif.py b/multiview_platform/mono_multi_view_classifiers/exec_classif.py
index 88bcf2799c86f38e053ef00fe23ecfeeb61384e7..39e8a09a4b3d391adfc2ffa38ce0ae7f418c7c19 100644
--- a/multiview_platform/mono_multi_view_classifiers/exec_classif.py
+++ b/multiview_platform/mono_multi_view_classifiers/exec_classif.py
@@ -1,4 +1,3 @@
-import errno
 import itertools
 import logging
 import os
@@ -17,6 +16,7 @@ from .monoview.exec_classif_mono_view import exec_monoview
 from .multiview.exec_multiview import exec_multiview
 from .result_analysis import get_results, plot_results_noise, analyze_iterations
 from .utils import execution, dataset, configuration
+from .utils.organization import secure_file_path
 from .utils.dataset import delete_HDF5

 matplotlib.use(
@@ -513,14 +513,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary,
     """
     logging.debug("Start:\t Benchmark initialization")
-    if not os.path.exists(
-            os.path.dirname(os.path.join(directory, "train_labels.csv"))):
-        try:
-            os.makedirs(
-                os.path.dirname(os.path.join(directory, "train_labels.csv")))
-        except OSError as exc:
-            if exc.errno != errno.EEXIST:
-                raise
+    secure_file_path(os.path.join(directory, "train_labels.csv"))
     train_indices = classification_indices[0]
     train_labels = dataset_var.get_labels(example_indices=train_indices)
     np.savetxt(os.path.join(directory, "train_labels.csv"), train_labels,
@@ -534,12 +527,7 @@
     for fold_index, (train_cv_indices, test_cv_indices) in enumerate(folds):
         file_name = os.path.join(directory, "folds", "test_labels_fold_" + str(
             fold_index) + ".csv")
-        if not os.path.exists(os.path.dirname(file_name)):
-            try:
-                os.makedirs(os.path.dirname(file_name))
-            except OSError as exc:
-                if exc.errno != errno.EEXIST:
-                    raise
+        secure_file_path(file_name)
         np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]],
                    delimiter=",")
     labels_names = list(labels_dictionary.values())
diff --git a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
index f673f0631798dfd6e4b698f0f080caafc10b649b..ef0bf719ca8ca4420cdea9bbbb0b214e3351d2a4 100644
--- a/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
+++ b/multiview_platform/mono_multi_view_classifiers/monoview/exec_classif_mono_view.py
@@ -2,7 +2,6 @@
 """ Execution: Script to perform a MonoView classification """

-import errno
 import logging  # To create Log-Files
 # Import built-in modules
 import os  # to geth path of the running script
@@ -18,6 +17,7 @@ from .. import monoview_classifiers
 from ..utils import hyper_parameter_search
 from ..utils.dataset import extract_subset, HDF5Dataset
 from ..utils.multiclass import get_mc_estim
+from ..utils.organization import secure_file_path

 # Author-Info
 __author__ = "Nikolas Huelsmann, Baptiste BAUVIN"
@@ -175,12 +175,7 @@ def init_constants(args, X, classification_indices, labels_names,
     output_file_name = os.path.join(directory, cl_type_string, view_name,
                                     cl_type_string + '-' + name + "-" + view_name + "-")
-    if not os.path.exists(os.path.dirname(output_file_name)):
-        try:
-            os.makedirs(os.path.dirname(output_file_name))
-        except OSError as exc:
-            if exc.errno != errno.EEXIST:
-                raise
+    secure_file_path(output_file_name)
     return kwargs, t_start, view_name, cl_type, X, learning_rate, labels_string, output_file_name
@@ -203,7 +198,7 @@ def get_hyper_params(classifier_module, hyper_param_search, nIter, classifier_mo
             nIter) + " iterations for " + classifier_module_name)
         classifier_hp_search = getattr(hyper_parameter_search,
                                        hyper_param_search.split("-")[0])
-        cl_kwargs, test_folds_preds = classifier_hp_search(X_train, y_train,
+        cl_kwargs, test_folds_preds, scores, params = classifier_hp_search(X_train, y_train,
                                                            "monoview",
                                                            random_state,
                                                            output_file_name,
@@ -216,6 +211,7 @@
                                                            classifier_kwargs=
                                                            kwargs[
                                                                classifier_module_name])
+        hyper_parameter_search.gen_report(params, scores, output_file_name)
         logging.debug("Done:\t " + hyper_param_search + " best settings")
     else:
         cl_kwargs = kwargs[classifier_module_name]
diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
index 59fc7c78402965770a30146d07f5d6b57c1830c4..4fc5ca7f5ec8987a42f5f87dc1b9a64c3a211ed9 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview/exec_multiview.py
@@ -1,4 +1,3 @@
-import errno
 import logging
 import os
 import os.path
@@ -11,6 +10,7 @@ from .multiview_utils import MultiviewResult, MultiviewResultAnalyzer
 from .. import multiview_classifiers
 from ..utils import hyper_parameter_search
 from ..utils.multiclass import get_mc_estim
+from ..utils.organization import secure_file_path

 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -19,7 +19,7 @@ __status__ = "Prototype"  # Production, Development, Prototype

 def init_constants(kwargs, classification_indices, metrics,
                    name, nb_cores, k_folds,
-                   dataset_var):
+                   dataset_var, directory):
     """
     Used to init the constants
     Parameters
@@ -63,11 +63,13 @@ def init_constants(kwargs, classification_indices, metrics,
         logging.info("Info:\t Shape of " + str(view_name) + " :" + str(
             dataset_var.get_shape()))
     labels = dataset_var.get_labels()
+    output_file_name = os.path.join(directory, classifier_name,
+                                    classifier_name+"-"+dataset_var.get_name()+"-")
     return classifier_name, t_start, views_indices, \
-           classifier_config, views, learning_rate, labels
+           classifier_config, views, learning_rate, labels, output_file_name


-def save_results(classifier, string_analysis, directory, name, images_analysis):
+def save_results(string_analysis, images_analysis, output_file_name):
     """
     Save results in derectory
@@ -96,16 +98,7 @@
     """
     logging.info(string_analysis)
-    views_string = "mv"
-    cl_type_string = classifier.short_name
-    output_file_name = os.path.join(directory, cl_type_string,
-                                    cl_type_string + "-" + views_string + '-' + name)
-    if not os.path.exists(os.path.dirname(output_file_name)):
-        try:
-            os.makedirs(os.path.dirname(output_file_name))
-        except OSError as exc:
-            if exc.errno != errno.EEXIST:
-                raise
+    secure_file_path(output_file_name)
     output_text_file = open(output_file_name + 'summary.txt', 'w')
     output_text_file.write(string_analysis)
     output_text_file.close()
@@ -244,8 +237,9 @@ def exec_multiview(directory, dataset_var, name, classification_indices,
     classifier_config, \
     views, \
     learning_rate, \
-    labels = init_constants(kwargs, classification_indices, metrics, name,
-                            nb_cores, k_folds, dataset_var)
+    labels, \
+    output_file_name = init_constants(kwargs, classification_indices, metrics, name,
+                                      nb_cores, k_folds, dataset_var, directory)
     logging.debug("Done:\t Initialize constants")
     extraction_time = time.time() - t_start
@@ -269,7 +263,7 @@
         dataset_var,
         dataset_var.get_labels(), classifier_module,
         classifier_name, metrics[0], learning_indices, k_folds, random_state,
-        directory, nb_cores=nb_cores, views_indices=views_indices,
+        output_file_name, nb_cores=nb_cores, views_indices=views_indices,
         searching_tool=hyper_param_search, n_iter=n_iter,
         classifier_config=classifier_config)
     classifier = get_mc_estim(
@@ -314,7 +308,7 @@
         class_label_names=list(labels_dictionary.values()),
         train_pred=train_pred,
         test_pred=test_pred,
-        directory=directory,
+        output_file_name=output_file_name,
         labels=labels,
         database_name=dataset_var.get_name(),
         nb_cores=nb_cores,
@@ -323,7 +317,7 @@
     logging.info("Done:\t Result Analysis for " + cl_type)

     logging.debug("Start:\t Saving preds")
-    save_results(classifier, string_analysis, directory, name, images_analysis)
+    save_results(string_analysis, images_analysis, output_file_name)
     logging.debug("Start:\t Saving preds")

     return MultiviewResult(cl_type, classifier_config, metrics_scores,
diff --git a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py
index 644a9f9ef9a1ca56c5ba48763836df5a6b69b77d..3d78d991bd40565fbf4da3aec1b81b547d5c7bb7 100644
--- a/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py
+++ b/multiview_platform/mono_multi_view_classifiers/multiview/multiview_utils.py
@@ -168,12 +168,12 @@ class MultiviewResultAnalyzer(ResultAnalyser):

     def __init__(self, view_names, classifier, classification_indices, k_folds,
                  hps_method, metrics_list, n_iter, class_label_names,
-                 train_pred, test_pred, directory, labels, database_name,
+                 train_pred, test_pred, output_file_name, labels, database_name,
                  nb_cores, duration):
         ResultAnalyser.__init__(self, classifier, classification_indices, k_folds,
-                            hps_method, metrics_list, n_iter, class_label_names,
-                            train_pred, test_pred, directory, labels, database_name,
-                            nb_cores, duration)
+                                hps_method, metrics_list, n_iter, class_label_names,
+                                train_pred, test_pred, output_file_name, labels, database_name,
+                                nb_cores, duration)
         self.classifier_name = classifier.short_name
         self.view_names = view_names
diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis.py
index a5a9a205effd2ba1b008d8745ebec35d6bd349f2..a473ffc16815cf525f7b2cab2eafa6bcac69d037 100644
--- a/multiview_platform/mono_multi_view_classifiers/result_analysis.py
+++ b/multiview_platform/mono_multi_view_classifiers/result_analysis.py
@@ -1,5 +1,4 @@
 # Import built-in modules
-import errno
 import logging
 import os

@@ -13,6 +12,7 @@ from matplotlib.patches import Patch

 # Import own Modules
 from .monoview.monoview_utils import MonoviewResult
+from .utils.organization import secure_file_path

 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -961,12 +961,7 @@ def publish_all_metrics_scores(iter_results, directory, data_base_name,
                                stats_iter,
                                min_size=10):
     results = []
-    if not os.path.exists(os.path.dirname(os.path.join(directory, "a"))):
-        try:
-            os.makedirs(os.path.dirname(os.path.join(directory, "a")))
-        except OSError as exc:
-            if exc.errno != errno.EEXIST:
-                raise
+    secure_file_path(os.path.join(directory, "a"))

     for metric_name, scores in iter_results.items():
         train = np.array(scores["mean"].loc["train"])
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/base.py b/multiview_platform/mono_multi_view_classifiers/utils/base.py
index 82e5cc236f569c189a42fb303d6e4e3cf24e09ef..0f786270c130414ca3e427ed1989a1310bb3670e 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/base.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/base.py
@@ -64,6 +64,8 @@ class BaseClassifier(BaseEstimator, ):
             return self.__class__.__name__ + "with no config."

     def get_base_estimator(self, base_estimator, estimator_config):
+        if estimator_config is None:
+            estimator_config = {}
         if base_estimator is None:
             return DecisionTreeClassifier(**estimator_config)
         if isinstance(base_estimator, str):
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py
index f107b2ea848e1febeeade4b7df85711fd0780b86..f98a325d86726e1c45f33516f4823eea1183add8 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py
@@ -1,4 +1,3 @@
-import errno
 import logging
 import os
 import select
@@ -9,8 +8,7 @@ import h5py
 import numpy as np
 from scipy import sparse

-
-# from . import get_multiview_db as DB
+from .organization import secure_file_path


 class Dataset():
@@ -305,13 +303,7 @@ class HDF5Dataset(Dataset):
             self.dataset = hdf5_file
             self.init_attrs()
         else:
-            if not os.path.exists(
-                    os.path.dirname(os.path.join(path, file_name))):
-                try:
-                    os.makedirs(os.path.dirname(os.path.join(path, file_name)))
-                except OSError as exc:
-                    if exc.errno != errno.EEXIST:
-                        raise
+            secure_file_path(os.path.join(path, file_name))
             dataset_file = h5py.File(os.path.join(path, file_name), "w")
             if view_names is None:
                 view_names = ["View" + str(index) for index in
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py
index 4b061122d8cc2516c9166e5422d708df1ede31d9..b3d2a24c7acb043eb43360b63e098a49319cd275 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py
@@ -1,10 +1,10 @@
-import errno
 import os

 import h5py
 import numpy as np

-from ..utils.dataset import RAMDataset, HDF5Dataset
+from .dataset import RAMDataset, HDF5Dataset
+from .organization import secure_file_path

 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -33,14 +33,7 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3,
                           noise_std=0.15, nb_view=3,
                           nb_examples=100, nb_features=10):
     """Used to generate a plausible dataset to test the algorithms"""
-
-    if not os.path.exists(
-            os.path.dirname(os.path.join(path, "plausible.hdf5"))):
-        try:
-            os.makedirs(os.path.dirname(os.path.join(path, "plausible.hdf5")))
-        except OSError as exc:
-            if exc.errno != errno.EEXIST:
-                raise
+    secure_file_path(os.path.join(path, "plausible.hdf5"))
     example_ids = ["exmaple_id_" + str(i) for i in range(nb_examples)]
     views = []
     view_names = []
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
index a483b339aa0bfd7b74b39469a7e0c61e4dd2bf86..654870bb55659a6a0e94a107a55a8b9cc3eaaf37 100644
--- a/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
+++ b/multiview_platform/mono_multi_view_classifiers/utils/hyper_parameter_search.py
@@ -8,6 +8,7 @@ from scipy.stats import randint, uniform
 from sklearn.model_selection import RandomizedSearchCV

 from .multiclass import get_mc_estim
+from .organization import secure_file_path
 from .. import metrics

@@ -26,12 +27,13 @@ def search_best_settings(dataset_var, labels, classifier_module,
     if searching_tool is not "None":
         searching_tool_method = getattr(thismodule,
                                         searching_tool.split("-")[0])
-        best_settings, test_folds_preds = searching_tool_method(
+        best_settings, test_folds_preds, scores, params = searching_tool_method(
             dataset_var, labels, "multiview", random_state, output_file_name,
             classifier_module, classifier_name, i_k_folds, nb_cores, metrics,
             n_iter, classifier_config, learning_indices=learning_indices,
             view_indices=views_indices,
             equivalent_draws=searching_tool.endswith("equiv"))
+        gen_report(params, scores, directory, )
     else:
         best_settings = classifier_config
     return best_settings  # or well set clasifier ?
@@ -159,17 +161,21 @@ def randomized_search(X, y, framework, random_state, output_file_name,
         if "random_state" in best_params:
             best_params.pop("random_state")

-        scoresArray = random_search.cv_results_['mean_test_score']
-        params = [(key[6:], value) for key, value in
-                  random_search.cv_results_.items() if key.startswith("param_")]
+        scores_array = random_search.cv_results_['mean_test_score']
+        sorted_indices = np.argsort(-scores_array)
+        params = [random_search.cv_results_["params"][score_index]
+                  for score_index in sorted_indices]
+        scores_array = scores_array[sorted_indices]
         # gen_heat_maps(params, scores_array, output_file_name)
         best_estimator = random_search.best_estimator_
     else:
         best_estimator = estimator
         best_params = {}
-    testFoldsPreds = get_test_folds_preds(X, y, folds, best_estimator,
+        scores_array = {}
+        params = {}
+    test_folds_preds = get_test_folds_preds(X, y, folds, best_estimator,
                                           framework, learning_indices)
-    return best_params, testFoldsPreds
+    return best_params, test_folds_preds, scores_array, params


 from sklearn.base import clone
@@ -222,6 +228,7 @@ class MultiviewCompatibleRandomizedSearchCV(RandomizedSearchCV):
         self.cv_results_ = dict(("param_" + param_name, [])
                                 for param_name in candidate_params[0].keys())
         self.cv_results_["mean_test_score"] = []
+        self.cv_results_["params"]=[]
         n_failed = 0
         tracebacks = []
         for candidate_param_idx, candidate_param in enumerate(candidate_params):
@@ -243,13 +250,12 @@
                                                          test_prediction,
                                                          **self.scoring._kwargs)
                     test_scores[fold_idx] = test_score
-                for param_name, param in candidate_param.items():
-                    self.cv_results_["param_" + param_name].append(param)
+                self.cv_results_['params'].append(current_estimator.get_params())
                 cross_validation_score = np.mean(test_scores)
                 self.cv_results_["mean_test_score"].append(
                     cross_validation_score)
                 results[candidate_param_idx] = cross_validation_score
-                if cross_validation_score <= min(results.values()):
+                if cross_validation_score >= min(results.values()):
                     self.best_params_ = candidate_params[candidate_param_idx]
                     self.best_score_ = cross_validation_score
             except:
@@ -262,7 +268,10 @@
             raise ValueError(
                 'No fits were performed. All HP combination returned errors \n\n' + '\n'.join(
                     tracebacks))
-
+        self.cv_results_["mean_test_score"] = np.array(self.cv_results_["mean_test_score"])
+        # for key, value in self.cv_results_.items():
+        #     if key.startswith("param_"):
+        #         self.cv_results_[key] = np.ma.array(data=value, mask=[False for _ in value])
         if self.refit:
             self.best_estimator_ = clone(base_estimator).set_params(
                 **self.best_params_)
@@ -417,6 +426,18 @@ def gen_heat_maps(params, scores_array, output_file_name):
                     transparent=True)
         plt.close()
+
+def gen_report(params, scores_array, output_file_name):
+    output_string = ""
+    for parameters, score in zip(params, scores_array):
+        if "random_state" in parameters:
+            parameters.pop("random_state")
+        output_string+="\n{}\t\t{}".format(parameters, score)
+    secure_file_path(output_file_name + "hps_report.txt")
+    with open(output_file_name+"hps_report.txt", "w") as output_file:
+        output_file.write(output_string)
+
+
 # nohup python ~/dev/git/spearmint/spearmint/main.py . &

 # import json
diff --git a/multiview_platform/mono_multi_view_classifiers/utils/organization.py b/multiview_platform/mono_multi_view_classifiers/utils/organization.py
new file mode 100644
index 0000000000000000000000000000000000000000..663536eb56488a451787060992c63e366e999f57
--- /dev/null
+++ b/multiview_platform/mono_multi_view_classifiers/utils/organization.py
@@ -0,0 +1,11 @@
+import os
+import errno
+
+
+def secure_file_path(file_name):
+    if not os.path.exists(os.path.dirname(file_name)):
+        try:
+            os.makedirs(os.path.dirname(file_name))
+        except OSError as exc:
+            if exc.errno != errno.EEXIST:
+                raise
diff --git a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py
index 5ee7c0bb62be208923493b10235c1a45c38cd106..edbfb93f755336956cdf483f1b8b513819e95f6a 100644
--- a/multiview_platform/tests/test_utils/test_hyper_parameter_search.py
+++ b/multiview_platform/tests/test_utils/test_hyper_parameter_search.py
@@ -55,7 +55,7 @@ class Test_randomized_search(unittest.TestCase):

     def test_simple(self):
-        best_params, _ = hyper_parameter_search.randomized_search(
+        best_params, _, params, scores = hyper_parameter_search.randomized_search(
             self.dataset, self.labels[()], "multiview", self.random_state, tmp_path,
             weighted_linear_early_fusion, "WeightedLinearEarlyFusion", self.k_folds, 1,
             ["accuracy_score", None], 2, {}, learning_indices=self.learning_indices)
@@ -80,10 +80,14 @@ class FakeEstimMV(BaseEstimator):
         self.param2 = param2

     def fit(self, X, y,train_indices=None, view_indices=None):
+        self.y = y
         return self

     def predict(self, X, example_indices=None, view_indices=None):
-        return np.zeros(example_indices.shape[0])
+        if self.param1=="return exact":
+            return self.y[example_indices]
+        else:
+            return np.zeros(example_indices.shape[0])


 from sklearn.metrics import accuracy_score, make_scorer
 from sklearn.model_selection import StratifiedKFold
@@ -101,12 +105,12 @@ class Test_MultiviewCompatibleRandomizedSearchCV(unittest.TestCase):
         cls.scoring = make_scorer(accuracy_score, )
         cls.cv = StratifiedKFold(n_splits=n_splits, )
         cls.random_state = np.random.RandomState(42)
-        cls.learning_indices = np.array([0,1,2])
+        cls.learning_indices = np.array([0,1,2, 3, 4,])
         cls.view_indices = None
         cls.framework = "monoview"
         cls.equivalent_draws = False
-        cls.X = cls.random_state.randint(0,100, (5,11))
-        cls.y = cls.random_state.randint(0,1, 5)
+        cls.X = cls.random_state.randint(0,100, (10,11))
+        cls.y = cls.random_state.randint(0,2, 10)

     def test_simple(self):
         hyper_parameter_search.MultiviewCompatibleRandomizedSearchCV(
@@ -164,6 +168,22 @@
         RSCV.fit(test_dataset, self.y, )
         self.assertEqual(RSCV.n_iter, self.n_iter*test_dataset.nb_view)

+    def test_gets_good_params(self):
+        self.param_distributions["param1"].append('return exact')
+        self.n_iter=6
+        RSCV = hyper_parameter_search.MultiviewCompatibleRandomizedSearchCV(
+            FakeEstimMV(), self.param_distributions, n_iter=self.n_iter,
+            refit=self.refit, n_jobs=self.n_jobs, scoring=self.scoring,
+            cv=self.cv,
+            random_state=self.random_state,
+            learning_indices=self.learning_indices,
+            view_indices=self.view_indices,
+            framework="multiview",
+            equivalent_draws=False
+        )
+        RSCV.fit(test_dataset, self.y, )
+        self.assertEqual(RSCV.best_params_["param1"], "return exact")
+

 # if __name__ == '__main__':
 #     # unittest.main()
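
The new utils/organization.py helper above replaces every repeated makedirs/EEXIST block. A minimal usage sketch, assuming the package is importable as in the test suite and using a hypothetical output path:

import os

from multiview_platform.mono_multi_view_classifiers.utils.organization import \
    secure_file_path

# Hypothetical result path; only its directory part matters to the helper.
output_file_name = os.path.join("results", "digits", "train_labels.csv")
secure_file_path(output_file_name)  # creates results/digits/ only if it is missing
with open(output_file_name, "w") as labels_file:
    labels_file.write("0,1,1,0\n")

On Python 3, os.makedirs(os.path.dirname(file_name), exist_ok=True) would be an equivalent one-liner; the explicit errno.EEXIST check simply keeps the helper identical to the blocks it replaces.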
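
exec_multiview.init_constants now builds a single output_file_name prefix (directory/classifier_name/classifier_name-dataset-) that save_results, the MultiviewResultAnalyzer and the hyper-parameter report all reuse. An illustration with hypothetical values (the result-directory layout below is only an example, not taken from the patch):

import os

directory = os.path.join("results", "digits", "started_2020_01_01-00_00")  # hypothetical
classifier_name = "weighted_linear_early_fusion"
dataset_name = "digits"

output_file_name = os.path.join(directory, classifier_name,
                                classifier_name + "-" + dataset_name + "-")
print(output_file_name + "summary.txt")     # ...-digits-summary.txt
print(output_file_name + "hps_report.txt")  # ...-digits-hps_report.txt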
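
gen_report writes a hps_report.txt next to the other outputs, one parameter draw per line with its mean cross-validation test score, best draw first. A self-contained sketch of the sorting and formatting logic added in randomized_search and gen_report, with toy values standing in for random_search.cv_results_:

import numpy as np

# Toy stand-ins for cv_results_["mean_test_score"] and cv_results_["params"].
scores_array = np.array([0.71, 0.93, 0.88])
params = [{"max_depth": 1}, {"max_depth": 5}, {"max_depth": 3}]

sorted_indices = np.argsort(-scores_array)  # descending score order
params = [params[score_index] for score_index in sorted_indices]
scores_array = scores_array[sorted_indices]

output_string = ""
for parameters, score in zip(params, scores_array):
    output_string += "\n{}\t\t{}".format(parameters, score)
print(output_string)
# {'max_depth': 5}		0.93
# {'max_depth': 3}		0.88
# {'max_depth': 1}		0.71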
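
BaseClassifier.get_base_estimator now tolerates estimator_config=None. A small standalone sketch of the guarded default (the helper name below is only for illustration; the real method also handles string and estimator inputs):

from sklearn.tree import DecisionTreeClassifier

def build_default_estimator(estimator_config=None):
    # Mirrors the new guard: a missing config becomes an empty dict
    # instead of crashing on DecisionTreeClassifier(**None).
    if estimator_config is None:
        estimator_config = {}
    return DecisionTreeClassifier(**estimator_config)

print(build_default_estimator())                  # default decision tree
print(build_default_estimator({"max_depth": 3}))  # configured decision tree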