diff --git a/config_files/config_test.yml b/config_files/config_test.yml index 4c075335fbce6711adeeb6f84ced4b571153f60e..478e0e08c5f343e9a929d8a3f9c5c0501ce73041 100644 --- a/config_files/config_test.yml +++ b/config_files/config_test.yml @@ -21,8 +21,8 @@ split: 0.8 nb_folds: 2 nb_class: 3 classes: -type: [ "multiview"] -algos_monoview: ["decision_tree", "adaboost", ] +type: [ "monoview"] +algos_monoview: ["decision_tree", ] algos_multiview: ["weighted_linear_late_fusion"] stats_iter: 3 metrics: @@ -33,6 +33,11 @@ hps_type: "Random" hps_args: n_iter: 4 equivalent_draws: False + decision_tree: + max_depth: + Randint: + low: 1 + high: 10 weighted_linear_early_fusion: diff --git a/setup.py b/setup.py index 820b86ec20608ae3293a131f50f4f47e8fd1fa66..14527011e53270d9926d4fafa5da4af06fb912ac 100644 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ def setup_package(): install_requires=requirements, extras_require={ 'dev': ['pytest', 'pytest-cov'], - 'doc': ['sphinx', 'numpydoc', 'docutils', 'sphinx-autoapi']}, + 'doc': ['sphinx >= 3.0.2', 'numpydoc', 'docutils', 'sphinx-autoapi']}, # Il est d'usage de mettre quelques metadata à propos de sa lib # Pour que les robots puissent facilement la classer. diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py index efaae89152deabd28d2071fc8055e134b346451e..6c75194aaf2c10a58cfdd38ecd9e2fa8e071b27b 100644 --- a/summit/multiview_platform/exec_classif.py +++ b/summit/multiview_platform/exec_classif.py @@ -107,9 +107,7 @@ def init_multiview_exps(classifier_names, views_dictionary, nb_class, classifier_name]}, views_dictionary=views_dictionary)] elif hps_method == "Random": - hps_kwargs = dict((key, value) - for key, value in hps_kwargs.items() - if key in ["n_iter", "equivalent_draws"]) + hps_kwargs = get_random_hps_args(hps_kwargs, classifier_name) multiview_arguments += [ gen_single_multiview_arg_dictionary(classifier_name, arguments, @@ -171,9 +169,7 @@ def init_monoview_exps(classifier_names, hps_kwargs[ classifier_name]}) elif hps_method == "Random": - hps_kwargs = dict((key, value) - for key, value in hps_kwargs.items() - if key in ["n_iter", "equivalent_draws"]) + hps_kwargs = get_random_hps_args(hps_kwargs, classifier_name) arguments = gen_single_monoview_arg_dictionary(classifier_name, kwargs_init, nb_class, @@ -198,6 +194,15 @@ def init_monoview_exps(classifier_names, return monoview_arguments +def get_random_hps_args(hps_args, classifier_name): + hps_dict = {} + for key, value in hps_args.items(): + if key in ["n_iter", "equivalent_draws"]: + hps_dict[key] = value + if key==classifier_name: + hps_dict["param_distributions"] = value + return hps_dict + def gen_single_monoview_arg_dictionary(classifier_name, arguments, nb_class, view_index, view_name, hps_kwargs): if classifier_name in arguments: @@ -300,7 +305,7 @@ def init_kwargs(args, classifiers_names, framework="monoview"): For example, for Adaboost, the KWARGS will be `{"n_estimators":<value>, "base_estimator":<value>}`""" - logging.debug("Start:\t Initializing monoview classifiers arguments") + logging.info("Start:\t Initializing monoview classifiers arguments") kwargs = {} for classifiers_name in classifiers_names: try: @@ -316,7 +321,7 @@ def init_kwargs(args, classifiers_names, framework="monoview"): kwargs[classifiers_name] = args[classifiers_name] else: kwargs[classifiers_name] = {} - logging.debug("Done:\t Initializing monoview classifiers arguments") + logging.info("Done:\t Initializing monoview classifiers arguments") return kwargs @@ -402,7 
+407,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary, ------- """ - logging.debug("Start:\t Benchmark initialization") + logging.info("Start:\t Benchmark initialization") secure_file_path(os.path.join(directory, "train_labels.csv")) train_indices = classification_indices[0] train_labels = dataset_var.get_labels(sample_indices=train_indices) @@ -421,7 +426,7 @@ def benchmark_init(directory, classification_indices, labels, labels_dictionary, np.savetxt(file_name, train_labels[test_cv_indices[:min_fold_len]], delimiter=",") labels_names = list(labels_dictionary.values()) - logging.debug("Done:\t Benchmark initialization") + logging.info("Done:\t Benchmark initialization") return results_monoview, labels_names @@ -550,7 +555,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, labels_dictionary, k_folds, dataset_var) logging.getLogger('matplotlib.font_manager').disabled = True - logging.debug("Start:\t monoview benchmark") + logging.info("Start:\t monoview benchmark") traceback_outputs = {} for arguments in argument_dictionaries["monoview"]: try: @@ -571,9 +576,9 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, else: raise - logging.debug("Done:\t monoview benchmark") + logging.info("Done:\t monoview benchmark") - logging.debug("Start:\t multiview arguments initialization") + logging.info("Start:\t multiview arguments initialization") # argument_dictionaries = initMultiviewArguments(args, benchmark, views, # views_indices, @@ -581,9 +586,9 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, # random_state, directory, # resultsMonoview, # classification_indices) - logging.debug("Done:\t multiview arguments initialization") + logging.info("Done:\t multiview arguments initialization") - logging.debug("Start:\t multiview benchmark") + logging.info("Start:\t multiview benchmark") results_multiview = [] for arguments in argument_dictionaries["multiview"]: try: @@ -602,7 +607,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, arguments["classifier_name"]] = traceback.format_exc() else: raise - logging.debug("Done:\t multiview benchmark") + logging.info("Done:\t multiview benchmark") return [flag, results_monoview + results_multiview, traceback_outputs] @@ -653,7 +658,7 @@ def exec_benchmark(nb_cores, stats_iter, results : list of lists The results of the benchmark. 
""" - logging.debug("Start:\t Executing all the needed benchmarks") + logging.info("Start:\t Executing all the needed benchmarks") results = [] # if nb_cores > 1: # if stats_iter > 1 or nb_multiclass > 1: @@ -681,17 +686,17 @@ def exec_benchmark(nb_cores, stats_iter, metrics, sample_ids=dataset_var.sample_ids, labels=dataset_var.get_labels()) results += [benchmark_results] - logging.debug("Done:\t Executing all the needed benchmarks") + logging.info("Done:\t Executing all the needed benchmarks") # Do everything with flagging - logging.debug("Start:\t Analyzing predictions") + logging.info("Start:\t Analyzing predictions") results_mean_stds = analyze(results, stats_iter, benchmark_arguments_dictionaries, metrics, directory, dataset_var.sample_ids, dataset_var.get_labels()) - logging.debug("Done:\t Analyzing predictions") + logging.info("Done:\t Analyzing predictions") delete(benchmark_arguments_dictionaries, nb_cores, dataset_var) return results_mean_stds diff --git a/summit/multiview_platform/monoview/exec_classif_mono_view.py b/summit/multiview_platform/monoview/exec_classif_mono_view.py index 324fdc38c65ca7348e9867596bb0e4a709aa4488..18d896bc353664555135950c21ffa4a137948b89 100644 --- a/summit/multiview_platform/monoview/exec_classif_mono_view.py +++ b/summit/multiview_platform/monoview/exec_classif_mono_view.py @@ -55,7 +55,7 @@ def exec_monoview(directory, X, Y, database_name, labels_names, random_state, hyper_param_search="Random", metrics={"accuracy_score*": {}}, n_iter=30, view_name="", hps_kwargs={}, **args): - logging.debug("Start:\t Loading data") + logging.info("Start:\t Loading data") kwargs, \ t_start, \ view_name, \ @@ -68,9 +68,9 @@ def exec_monoview(directory, X, Y, database_name, labels_names, base_file_name = init_constants(args, X, classification_indices, labels_names, database_name, directory, view_name, ) - logging.debug("Done:\t Loading data") + logging.info("Done:\t Loading data") - logging.debug( + logging.info( "Info:\t Classification - Database:" + str( database_name) + " View:" + str( view_name) + " train ratio:" @@ -78,17 +78,17 @@ def exec_monoview(directory, X, Y, database_name, labels_names, k_folds.n_splits) + ", cores:" + str(nb_cores) + ", algorithm : " + classifier_name) - logging.debug("Start:\t Determine Train/Test split") + logging.info("Start:\t Determine Train/Test split") X_train, y_train, X_test, y_test = init_train_test(X, Y, classification_indices) - logging.debug("Info:\t Shape X_train:" + str( + logging.info("Info:\t Shape X_train:" + str( X_train.shape) + ", Length of y_train:" + str(len(y_train))) - logging.debug("Info:\t Shape X_test:" + str( + logging.info("Info:\t Shape X_test:" + str( X_test.shape) + ", Length of y_test:" + str(len(y_test))) - logging.debug("Done:\t Determine Train/Test split") + logging.info("Done:\t Determine Train/Test split") - logging.debug("Start:\t Generate classifier args") + logging.info("Start:\t Generate classifier args") classifier_module = getattr(monoview_classifiers, classifier_name) classifier_class_name = classifier_module.classifier_class_name hyper_param_beg = time.monotonic() @@ -100,9 +100,9 @@ def exec_monoview(directory, X, Y, database_name, labels_names, k_folds, nb_cores, metrics, kwargs, **hps_kwargs) hyper_param_duration = time.monotonic() - hyper_param_beg - logging.debug("Done:\t Generate classifier args") + logging.info("Done:\t Generate classifier args") - logging.debug("Start:\t Training") + logging.info("Start:\t Training") classifier = get_mc_estim(getattr(classifier_module, 
classifier_class_name) @@ -112,9 +112,9 @@ def exec_monoview(directory, X, Y, database_name, labels_names, fit_beg = time.monotonic() classifier.fit(X_train, y_train) # NB_CORES=nbCores, fit_duration = time.monotonic() - fit_beg - logging.debug("Done:\t Training") + logging.info("Done:\t Training") - logging.debug("Start:\t Predicting") + logging.info("Start:\t Predicting") train_pred = classifier.predict(X_train) pred_beg = time.monotonic() test_pred = classifier.predict(X_test) @@ -127,14 +127,14 @@ def exec_monoview(directory, X, Y, database_name, labels_names, for testIndex, index in enumerate(classification_indices[1]): full_pred[index] = test_pred[testIndex] - logging.debug("Done:\t Predicting") + logging.info("Done:\t Predicting") whole_duration = time.monotonic() - t_start - logging.debug( + logging.info( "Info:\t Duration for training and predicting: " + str( whole_duration) + "[s]") - logging.debug("Start:\t Getting results") + logging.info("Start:\t Getting results") result_analyzer = MonoviewResultAnalyzer(view_name=view_name, classifier_name=classifier_name, shape=X.shape, @@ -154,9 +154,9 @@ def exec_monoview(directory, X, Y, database_name, labels_names, duration=whole_duration) string_analysis, images_analysis, metrics_scores, class_metrics_scores, \ confusion_matrix = result_analyzer.analyze() - logging.debug("Done:\t Getting results") + logging.info("Done:\t Getting results") - logging.debug("Start:\t Saving preds") + logging.info("Start:\t Saving preds") save_results(string_analysis, output_file_name, full_pred, train_pred, y_train, images_analysis, y_test, confusion_matrix) logging.info("Done:\t Saving results") @@ -203,7 +203,7 @@ def get_hyper_params(classifier_module, search_method, classifier_module_name, output_file_name, k_folds, nb_cores, metrics, kwargs, **hps_kwargs): if search_method != "None": - logging.debug( + logging.info( "Start:\t " + search_method + " best settings for " + classifier_module_name) classifier_hp_search = getattr(hyper_parameter_search, search_method) estimator = getattr(classifier_module, classifier_class_name)( @@ -215,10 +215,10 @@ def get_hyper_params(classifier_module, search_method, classifier_module_name, random_state=random_state, framework="monoview", n_jobs=nb_cores, **hps_kwargs) - hps.fit(X_train, y_train, **kwargs[classifier_module_name]) + hps.fit(X_train, y_train) cl_kwargs = hps.get_best_params() hps.gen_report(output_file_name) - logging.debug("Done:\t " + search_method + " best settings") + logging.info("Done:\t " + search_method + " best settings") else: cl_kwargs = kwargs[classifier_module_name] return cl_kwargs diff --git a/summit/multiview_platform/monoview_classifiers/adaboost.py b/summit/multiview_platform/monoview_classifiers/adaboost.py index 82b380f7c93198128064cd2b290c2d7690bcaf17..cd8ce3db0b769e7ad99032487d94da010988138b 100644 --- a/summit/multiview_platform/monoview_classifiers/adaboost.py +++ b/summit/multiview_platform/monoview_classifiers/adaboost.py @@ -40,6 +40,7 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier): self.weird_strings = {"base_estimator": "class_name"} self.plotted_metric = metrics.zero_one_loss self.plotted_metric_name = "zero_one_loss" + self.base_estimator_config = base_estimator_config self.step_predictions = None def fit(self, X, y, sample_weight=None): diff --git a/summit/multiview_platform/multiview/exec_multiview.py b/summit/multiview_platform/multiview/exec_multiview.py index bed8317a1656137838ecc093e42fb088fca668d6..1f3dcdc39b11f0d0e79c3f1629068bbfd72973b4 100644 --- 
a/summit/multiview_platform/multiview/exec_multiview.py +++ b/summit/multiview_platform/multiview/exec_multiview.py @@ -237,7 +237,7 @@ def exec_multiview(directory, dataset_var, name, classification_indices, ``MultiviewResult`` """ - logging.debug("Start:\t Initialize constants") + logging.info("Start:\t Initialize constants") cl_type, \ t_start, \ views_indices, \ @@ -250,24 +250,24 @@ def exec_multiview(directory, dataset_var, name, classification_indices, base_file_name, \ metrics = init_constants(kwargs, classification_indices, metrics, name, nb_cores, k_folds, dataset_var, directory) - logging.debug("Done:\t Initialize constants") + logging.info("Done:\t Initialize constants") extraction_time = time.time() - t_start logging.info("Info:\t Extraction duration " + str(extraction_time) + "s") - logging.debug("Start:\t Getting train/test split") + logging.info("Start:\t Getting train/test split") learning_indices, validation_indices = classification_indices - logging.debug("Done:\t Getting train/test split") + logging.info("Done:\t Getting train/test split") - logging.debug("Start:\t Getting classifiers modules") + logging.info("Start:\t Getting classifiers modules") # classifierPackage = getattr(multiview_classifiers, # CL_type) # Permet d'appeler un module avec une string classifier_module = getattr(multiview_classifiers, cl_type) classifier_name = classifier_module.classifier_class_name # classifierClass = getattr(classifierModule, CL_type + "Class") - logging.debug("Done:\t Getting classifiers modules") + logging.info("Done:\t Getting classifiers modules") - logging.debug("Start:\t Optimizing hyperparameters") + logging.info("Start:\t Optimizing hyperparameters") hps_beg = time.monotonic() if hps_method != "None": hps_method_class = getattr(hyper_parameter_search, hps_method) @@ -298,16 +298,16 @@ def exec_multiview(directory, dataset_var, name, classification_indices, **classifier_config), random_state, multiview=True, y=dataset_var.get_labels()) - logging.debug("Done:\t Optimizing hyperparameters") - logging.debug("Start:\t Fitting classifier") + logging.info("Done:\t Optimizing hyperparameters") + logging.info("Start:\t Fitting classifier") fit_beg = time.monotonic() classifier.fit(dataset_var, dataset_var.get_labels(), train_indices=learning_indices, view_indices=views_indices) fit_duration = time.monotonic() - fit_beg - logging.debug("Done:\t Fitting classifier") + logging.info("Done:\t Fitting classifier") - logging.debug("Start:\t Predicting") + logging.info("Start:\t Predicting") train_pred = classifier.predict(dataset_var, sample_indices=learning_indices, view_indices=views_indices) @@ -349,10 +349,10 @@ def exec_multiview(directory, dataset_var, name, classification_indices, confusion_matrix = result_analyzer.analyze() logging.info("Done:\t Result Analysis for " + cl_type) - logging.debug("Start:\t Saving preds") + logging.info("Start:\t Saving preds") save_results(string_analysis, images_analysis, output_file_name, confusion_matrix) - logging.debug("Start:\t Saving preds") + logging.info("Start:\t Saving preds") return MultiviewResult(cl_type, classifier_config, metrics_scores, full_pred, hps_duration, fit_duration, diff --git a/summit/multiview_platform/result_analysis/error_analysis.py b/summit/multiview_platform/result_analysis/error_analysis.py index 12f018072c6ffbd099f304bb0a17c9ba7d6fadf7..7ff5f06b9471ba08a15d2487e334285e1c6c5ef1 100644 --- a/summit/multiview_platform/result_analysis/error_analysis.py +++ b/summit/multiview_platform/result_analysis/error_analysis.py 
@@ -45,11 +45,11 @@ def get_sample_errors(groud_truth, results): return sample_errors -def publish_sample_errors(sample_errors, directory, databaseName, +def publish_sample_errors(sample_errors, directory, database_name, labels_names, sample_ids, labels): # pragma: no cover - logging.debug("Start:\t Label analysis figure generation") + logging.info("Start:\t Label analysis figure generation") - base_file_name = os.path.join(directory, databaseName + "-") + base_file_name = os.path.join(directory, database_name + "-") nb_classifiers, nb_samples, classifiers_names, \ data_2d, error_on_samples = gen_error_data(sample_errors) @@ -58,19 +58,19 @@ def publish_sample_errors(sample_errors, directory, databaseName, np.savetxt(base_file_name + "bar_plot_data.csv", error_on_samples, delimiter=",") - plot_2d(data_2d, classifiers_names, nb_classifiers, base_file_name, + plot_2d(data_2d, classifiers_names, nb_classifiers, base_file_name, database_name, sample_ids=sample_ids, labels=labels) plot_errors_bar(error_on_samples, nb_samples, - base_file_name, sample_ids=sample_ids) + base_file_name, database_name, sample_ids=sample_ids) - logging.debug("Done:\t Label analysis figures generation") + logging.info("Done:\t Label analysis figures generation") def publish_all_sample_errors(iter_results, directory, stats_iter, - sample_ids, labels): # pragma: no cover - logging.debug( + sample_ids, labels, data_base_name): # pragma: no cover + logging.info( "Start:\t Global label analysis figure generation") nb_samples, nb_classifiers, data, \ @@ -82,12 +82,12 @@ def publish_all_sample_errors(iter_results, directory, delimiter=",") plot_2d(data, classifier_names, nb_classifiers, - os.path.join(directory, ""), stats_iter=stats_iter, + os.path.join(directory, ""), data_base_name, stats_iter=stats_iter, sample_ids=sample_ids, labels=labels) - plot_errors_bar(error_on_samples, nb_samples, os.path.join(directory, ""), + plot_errors_bar(error_on_samples, nb_samples, os.path.join(directory, ""), data_base_name, sample_ids=sample_ids) - logging.debug( + logging.info( "Done:\t Global label analysis figures generation") @@ -151,7 +151,7 @@ def gen_error_data_glob(iter_results, stats_iter): classifier_names -def plot_2d(data, classifiers_names, nb_classifiers, file_name, labels=None, +def plot_2d(data, classifiers_names, nb_classifiers, file_name, dataset_name, labels=None, stats_iter=1, use_plotly=True, sample_ids=None): # pragma: no cover r"""Used to generate a 2D plot of the errors. 
@@ -218,6 +218,9 @@ def plot_2d(data, classifiers_names, nb_classifiers, file_name, labels=None, ticktext=["Always Wrong", "Always Right"]), reversescale=True), ) fig.update_yaxes(title_text="Examples", showticklabels=True) + fig.update_layout( + title="Dataset : {} <br> Errors for each classifier <br> Generated on <a href='https://baptiste.bauvin.pages.lis-lab.fr/summit'>SuMMIT</a>.".format( + dataset_name)) fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)') fig.update_xaxes(showticklabels=True, ) @@ -226,7 +229,7 @@ def plot_2d(data, classifiers_names, nb_classifiers, file_name, labels=None, del fig -def plot_errors_bar(error_on_samples, nb_samples, file_name, +def plot_errors_bar(error_on_samples, nb_samples, file_name, dataset_name, use_plotly=True, sample_ids=None): # pragma: no cover r"""Used to generate a barplot of the muber of classifiers that failed to classify each samples @@ -257,6 +260,8 @@ def plot_errors_bar(error_on_samples, nb_samples, file_name, [plotly.graph_objs.Bar(x=sample_ids, y=1 - error_on_samples)]) fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)') + title = "Dataset : {} <br> Error % for each sample <br> Generated on <a href='https://baptiste.bauvin.pages.lis-lab.fr/summit'>SuMMIT</a>.".format( + dataset_name) plotly.offline.plot(fig, filename=file_name + "error_analysis_bar.html", auto_open=False) diff --git a/summit/multiview_platform/result_analysis/execution.py b/summit/multiview_platform/result_analysis/execution.py index 7d3c9c6fe80db4b9cb51f62683840019fcb46882..20ff793b2279430d5b4918c38320d1c6b4861654 100644 --- a/summit/multiview_platform/result_analysis/execution.py +++ b/summit/multiview_platform/result_analysis/execution.py @@ -65,7 +65,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, label combination, regrouping the scores for each metrics and the information useful to plot errors on samples. 
""" - logging.debug("Start:\t Analyzing all results") + logging.info("Start:\t Analyzing all results") iter_results = {"metrics_scores": [i for i in range(stats_iter)], "class_metrics_scores": [i for i in range(stats_iter)], "sample_errors": [i for i in range(stats_iter)], @@ -105,7 +105,7 @@ def analyze_iterations(results, benchmark_argument_dictionaries, stats_iter, iter_results["labels"] = labels iter_results["durations"][iter_index] = durations - logging.debug("Done:\t Analyzing all results") + logging.info("Done:\t Analyzing all results") return res, iter_results, flagged_tracebacks_list, labels_names @@ -124,7 +124,7 @@ def analyze_all(iter_results, stats_iter, directory, data_base_name, data_base_name, stats_iter, label_names) publish_all_sample_errors(error_analysis, directory, stats_iter, - sample_ids, labels) + sample_ids, labels, data_base_name) publish_feature_importances(feature_importances, directory, data_base_name, feature_importances_stds) plot_durations(duration_means, directory, data_base_name, duration_stds) diff --git a/summit/multiview_platform/result_analysis/metric_analysis.py b/summit/multiview_platform/result_analysis/metric_analysis.py index c2db7b26d9d933f2981e809a2cf7ee5505c4ab7a..560976d591434ba736f256efad26b153c919b1f6 100644 --- a/summit/multiview_platform/result_analysis/metric_analysis.py +++ b/summit/multiview_platform/result_analysis/metric_analysis.py @@ -95,7 +95,7 @@ def publish_metrics_graphs(metrics_scores, directory, database_name, """ results = [] for metric_name in metrics_scores.keys(): - logging.debug( + logging.info( "Start:\t Score graph generation for " + metric_name) train_scores, test_scores, classifier_names, \ file_name, nb_results, results, \ @@ -106,13 +106,13 @@ def publish_metrics_graphs(metrics_scores, directory, database_name, class_metric_scores[metric_name]) plot_metric_scores(train_scores, test_scores, classifier_names, - nb_results, metric_name, file_name, - tag=" " + " vs ".join(labels_names)) + nb_results, metric_name, file_name, database_name, + tag=" vs ".join(labels_names)) class_file_name = file_name+"-class" plot_class_metric_scores(class_test_scores, class_file_name, labels_names, classifier_names, metric_name) - logging.debug( + logging.info( "Done:\t Score graph generation for " + metric_name) return results @@ -137,7 +137,7 @@ def publish_all_metrics_scores(iter_results, class_iter_results, directory, nb_results = classifier_names.shape[0] plot_metric_scores(train, test, classifier_names, nb_results, - metric_name, file_name, tag=" averaged", + metric_name, file_name, data_base_name, tag="Averaged", train_STDs=train_std, test_STDs=test_std) results += [[classifier_name, metric_name, test_mean, test_std] for classifier_name, test_mean, test_std @@ -186,7 +186,7 @@ def init_plot(results, metric_name, metric_dataframe, def plot_metric_scores(train_scores, test_scores, names, nb_results, metric_name, - file_name, + file_name, dataset_name, tag="", train_STDs=None, test_STDs=None, use_plotly=True): # pragma: no cover r"""Used to plot and save the score barplot for a specific metric. 
@@ -272,7 +272,7 @@ def plot_metric_scores(train_scores, test_scores, names, nb_results, )) fig.update_layout( - title=metric_name + "<br>" + tag + " scores for each classifier") + title="Dataset : {}, metric : {}, task : {} <br> Scores for each classifier <br> Generated on <a href='https://baptiste.bauvin.pages.lis-lab.fr/summit'>SuMMIT</a>.".format(dataset_name, metric_name, tag)) fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)') plotly.offline.plot(fig, filename=file_name + ".html", auto_open=False) diff --git a/summit/multiview_platform/utils/base.py b/summit/multiview_platform/utils/base.py index 8dcaaf819ba346757e5fbe620d8ca9a490033cc9..c88294c59977b3160ddada9b1a343f8fa8f12909 100644 --- a/summit/multiview_platform/utils/base.py +++ b/summit/multiview_platform/utils/base.py @@ -99,6 +99,9 @@ class BaseClassifier(BaseEstimator, ): if "config" in self.weird_strings[param_name]: string += "( with " + self.get_params()[ param_name].params_to_string() + ")" + return string + elif self.get_params()[param_name] is None: + return "None" else: return str(self.get_params()[param_name]) diff --git a/summit/multiview_platform/utils/configuration.py b/summit/multiview_platform/utils/configuration.py index 75bd4b0254d23a6a4669787cc8d24cccbd5cf3f4..9c79b83b3dd07d64b228b637ba901bf177d16284 100644 --- a/summit/multiview_platform/utils/configuration.py +++ b/summit/multiview_platform/utils/configuration.py @@ -3,7 +3,6 @@ import os import yaml package_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -print(package_path) def get_the_args(path_to_config_file=os.path.join(os.path.dirname(package_path), "config_files", "config.yml")): """ diff --git a/summit/multiview_platform/utils/dataset.py b/summit/multiview_platform/utils/dataset.py index 25c396fa8fcb84fcb0e4e7152bd266fb824f7c77..07023b756fea909cc01e821de05f1f28febf61b8 100644 --- a/summit/multiview_platform/utils/dataset.py +++ b/summit/multiview_platform/utils/dataset.py @@ -693,14 +693,14 @@ def init_multiple_datasets(path_f, name, nb_cores): # pragma: no cover """ if nb_cores > 1: if datasets_already_exist(path_f, name, nb_cores): - logging.debug( + logging.info( "Info:\t Enough copies of the dataset are already available") pass else: if os.path.getsize( os.path.join(path_f, name + ".hdf5")) * nb_cores / float( 1024) / 1000 / 1000 > 0.1: - logging.debug("Start:\t Creating " + str( + logging.info("Start:\t Creating " + str( nb_cores) + " temporary datasets for multiprocessing") logging.warning( r" WARNING : /!\ This may use a lot of HDD storage space : " + @@ -715,7 +715,7 @@ def init_multiple_datasets(path_f, name, nb_cores): # pragma: no cover else: pass dataset_files = copy_hdf5(path_f, name, nb_cores) - logging.debug("Start:\t Creating datasets for multiprocessing") + logging.info("Start:\t Creating datasets for multiprocessing") return dataset_files @@ -732,10 +732,10 @@ def copy_hdf5(pathF, name, nbCores): def delete_HDF5(benchmarkArgumentsDictionaries, nbCores, dataset): """Used to delete temporary copies at the end of the benchmark""" if nbCores > 1: - logging.debug("Start:\t Deleting " + str( + logging.info("Start:\t Deleting " + str( nbCores) + " temporary datasets for multiprocessing") args = benchmarkArgumentsDictionaries[0]["args"] - logging.debug("Start:\t Deleting datasets for multiprocessing") + logging.info("Start:\t Deleting datasets for multiprocessing") for coreIndex in range(nbCores): os.remove(args["pathf"] + args["name"] + str(coreIndex) + ".hdf5") diff --git 
a/summit/multiview_platform/utils/execution.py b/summit/multiview_platform/utils/execution.py index 0ce9886406018fabe7c30f0264106009bac857a5..4c2e94b7a91a1cc00ccb4f2a46ac5c404504534e 100644 --- a/summit/multiview_platform/utils/execution.py +++ b/summit/multiview_platform/utils/execution.py @@ -171,7 +171,7 @@ def init_log_file(name, views, cl_type, log, debug, label, log_file_path = os.path.join(result_directory, log_file_name) os.makedirs(os.path.dirname(log_file_path)) logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', - filename=log_file_path, level=logging.DEBUG, + filename=log_file_path, level=logging.INFO, filemode='w') if log: logging.getLogger().addHandler(logging.StreamHandler()) @@ -321,8 +321,6 @@ def find_dataset_names(path, type, names): the needed dataset names.""" package_path = os.path.dirname( os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) - print(package_path, os.path.isdir(path), - os.path.isdir(os.path.join(package_path, path)), ) if os.path.isdir(path): pass elif os.path.isdir(os.path.join(package_path, path)): diff --git a/summit/multiview_platform/utils/hyper_parameter_search.py b/summit/multiview_platform/utils/hyper_parameter_search.py index 0fd65b9309939b23a8530d5ba116fda38ae7a340..84211acbf79ba909298814528597ab8657fa4653 100644 --- a/summit/multiview_platform/utils/hyper_parameter_search.py +++ b/summit/multiview_platform/utils/hyper_parameter_search.py @@ -28,6 +28,25 @@ from .organization import secure_file_path class HPSearch: + def translate_param_distribs(self, param_distribs): + translated_params = {} + if param_distribs is None: + return translated_params + for param_name, value in param_distribs.items(): + if type(value) == list: + translated_params[param_name] = value + elif type(value)==dict: + if "Uniform" in value.keys(): + distrib = self.translate_uniform(value["Uniform"]) + elif "Randint" in value.keys(): + distrib = self.translate_randint(value["Randint"]) + else: + distrib=value + translated_params[param_name] = distrib + else: + translated_params[param_name] = value + return translated_params + def get_scoring(self, metric): if isinstance(metric, dict): metric_module, metric_kwargs = get_metric(metric) @@ -138,13 +157,15 @@ class Random(RandomizedSearchCV, HPSearch): random_state=None, learning_indices=None, view_indices=None, framework="monoview", equivalent_draws=True, track_tracebacks=True): - if param_distributions is None: - param_distributions = self.get_param_distribs(estimator) + param_distributions = self.get_param_distribs(estimator, param_distributions) + + scoring = HPSearch.get_scoring(self, scoring) RandomizedSearchCV.__init__(self, estimator, n_iter=n_iter, param_distributions=param_distributions, refit=refit, n_jobs=n_jobs, scoring=scoring, cv=cv, random_state=random_state) + self.framework = framework self.available_indices = learning_indices self.view_indices = view_indices @@ -152,11 +173,22 @@ class Random(RandomizedSearchCV, HPSearch): self.track_tracebacks = track_tracebacks self.tracebacks = [] - def get_param_distribs(self, estimator): + def translate_uniform(self, args): + return CustomUniform(**args) + + def translate_randint(self, args): + return CustomRandint(**args) + + + def get_param_distribs(self, estimator, user_distribs): + user_distribs = self.translate_param_distribs(user_distribs) if isinstance(estimator, MultiClassWrapper): - return estimator.estimator.gen_distribs() + base_distribs = estimator.estimator.gen_distribs() else: - return estimator.gen_distribs() + base_distribs = 
estimator.gen_distribs() + for key, value in user_distribs.items(): + base_distribs[key] = value + return base_distribs def fit(self, X, y=None, groups=None, **fit_params): # pragma: no cover if self.framework == "monoview": @@ -174,10 +206,6 @@ class Random(RandomizedSearchCV, HPSearch): ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state)) - # def fit_multiview(self, X, y=None, groups=None, track_tracebacks=True, - # **fit_params): - # n_splits = self.cv.get_n_splits(self.available_indices, - # y[self.available_indices]) class Grid(GridSearchCV, HPSearch): @@ -208,153 +236,19 @@ class Grid(GridSearchCV, HPSearch): self.candidate_params = list(ParameterGrid(self.param_grid)) self.n_iter = len(self.candidate_params) +class CustomDist: -# class ParameterSamplerGrid: -# -# def __init__(self, param_distributions, n_iter): -# from math import floor -# n_points_per_param = int(n_iter **(1/len(param_distributions))) -# selected_params = dict((param_name, []) -# for param_name in param_distributions.keys()) -# for param_name, distribution in param_distributions.items(): -# if isinstance(distribution, list): -# if len(distribution)<n_points_per_param: -# selected_params[param_name] = distribution -# else: -# index_step = floor(len(distribution)/n_points_per_param-2) -# selected_params[param_name] = distribution[0]+[distribution[index*index_step+1] -# for index -# in range(n_points_per_param)] - - -# -# def hps_search(): -# pass -# -# def grid_search(X, y, framework, random_state, output_file_name, -# classifier_module, -# classifier_name, folds=4, nb_cores=1, -# metric=["accuracy_score", None], -# n_iter=30, classifier_kwargs={}, learning_indices=None, -# view_indices=None, -# equivalent_draws=True, grid_search_config=None): -# """Used to perfom gridsearch on the classifiers""" -# pass - - -# class RS(HPSSearch): -# -# def __init__(self, X, y, framework, random_state, output_file_name, -# classifier_module, -# classifier_name, folds=4, nb_cores=1, -# metric=["accuracy_score", None], -# n_iter=30, classifier_kwargs={}, learning_indices=None, -# view_indices=None, -# equivalent_draws=True): -# HPSSearch.__init__() - - -# def randomized_search(X, y, framework, random_state, output_file_name, -# classifier_module, -# classifier_name, folds=4, nb_cores=1, -# metric=["accuracy_score", None], -# n_iter=30, classifier_kwargs={}, learning_indices=None, -# view_indices=None, -# equivalent_draws=True): -# estimator = getattr(classifier_module, classifier_name)( -# random_state=random_state, -# **classifier_kwargs) -# params_dict = estimator.gen_distribs() -# estimator = get_mc_estim(estimator, random_state, -# multiview=(framework == "multiview"), -# y=y) -# if params_dict: -# metric_module, metric_kwargs = get_metric(metric) -# scorer = metric_module.get_scorer(**metric_kwargs) -# # nb_possible_combinations = compute_possible_combinations(params_dict) -# # n_iter_real = min(n_iter, nb_possible_combinations) -# -# random_search = MultiviewCompatibleRandomizedSearchCV(estimator, -# n_iter=n_iter, -# param_distributions=params_dict, -# refit=True, -# n_jobs=nb_cores, -# scoring=scorer, -# cv=folds, -# random_state=random_state, -# learning_indices=learning_indices, -# view_indices=view_indices, -# framework=framework, -# equivalent_draws=equivalent_draws) -# random_search.fit(X, y) -# return random_search.transform_results() -# else: -# best_estimator = estimator -# best_params = {} -# scores_array = {} -# params = {} -# test_folds_preds = np.zeros(10)#get_test_folds_preds(X, 
y, folds, best_estimator, -# # framework, learning_indices) -# return best_params, scores_array, params - - -# -# def spear_mint(dataset, classifier_name, views_indices=None, k_folds=None, -# n_iter=1, -# **kwargs): -# """Used to perform spearmint on the classifiers to optimize hyper parameters, -# longer than randomsearch (can't be parallelized)""" -# pass -# -# -# def gen_heat_maps(params, scores_array, output_file_name): -# """Used to generate a heat map for each doublet of hyperparms -# optimized on the previous function""" -# nb_params = len(params) -# if nb_params > 2: -# combinations = itertools.combinations(range(nb_params), 2) -# elif nb_params == 2: -# combinations = [(0, 1)] -# else: -# combinations = [()] -# for combination in combinations: -# if combination: -# param_name1, param_array1 = params[combination[0]] -# param_name2, param_array2 = params[combination[1]] -# else: -# param_name1, param_array1 = params[0] -# param_name2, param_array2 = ("Control", np.array([0])) -# -# param_array1_set = np.sort(np.array(list(set(param_array1)))) -# param_array2_set = np.sort(np.array(list(set(param_array2)))) -# -# scores_matrix = np.zeros( -# (len(param_array2_set), len(param_array1_set))) - 0.1 -# for param1, param2, score in zip(param_array1, param_array2, -# scores_array): -# param1_index, = np.where(param_array1_set == param1) -# param2_index, = np.where(param_array2_set == param2) -# scores_matrix[int(param2_index), int(param1_index)] = score -# -# plt.figure(figsize=(8, 6)) -# plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95) -# plt.imshow(scores_matrix, interpolation='nearest', cmap=plt.cm.hot, -# ) -# plt.xlabel(param_name1) -# plt.ylabel(param_name2) -# plt.colorbar() -# plt.xticks(np.arange(len(param_array1_set)), param_array1_set) -# plt.yticks(np.arange(len(param_array2_set)), param_array2_set, -# rotation=45) -# plt.title('Validation metric') -# plt.savefig( -# output_file_name + "heat_map-" + param_name1 + "-" + param_name2 + ".png", -# transparent=True) -# plt.close() -# - + def multiply(self, random_number): + if self.multiplier == "e-": + return 10 ** -random_number + elif self.multiplier =="e": + return 10**random_number + elif type(self.multiplier) in [int, float]: + return self.multiplier*random_number + else: + return random_number -class CustomRandint: +class CustomRandint(CustomDist): """Used as a distribution returning a integer between low and high-1. It can be used with a multiplier agrument to be able to perform more complex generation for example 10 e -(randint)""" @@ -366,20 +260,14 @@ class CustomRandint: self.multiplier = multiplier def rvs(self, random_state=None): - randinteger = self.randint.rvs(random_state=random_state) - if self.multiplier == "e-": - return 10 ** -randinteger - else: - return randinteger + rand_integer = self.randint.rvs(random_state=random_state) + return self.multiply(rand_integer) def get_nb_possibilities(self): - if self.multiplier == "e-": - return abs(10 ** -self.low - 10 ** -self.high) - else: - return self.high - self.low + return self.high - self.low -class CustomUniform: +class CustomUniform(CustomDist): """Used as a distribution returning a float between loc and loc + scale.. 
It can be used with a multiplier agrument to be able to perform more complex generation for example 10 e -(float)""" @@ -390,10 +278,9 @@ class CustomUniform: def rvs(self, random_state=None): unif = self.uniform.rvs(random_state=random_state) - if self.multiplier == 'e-': - return 10 ** -unif - else: - return unif + return self.multiply(unif) + + def format_params(params, pref=""): diff --git a/summit/tests/test_utils/test_base.py b/summit/tests/test_utils/test_base.py index dc4ccb680affd98391338ebecfe2afde9a1e079a..981118e9df6b88f35c49e9e8e4b23cb51b280456 100644 --- a/summit/tests/test_utils/test_base.py +++ b/summit/tests/test_utils/test_base.py @@ -143,7 +143,6 @@ class Test_ResultAnalyzer(unittest.TestCase): self.nb_cores, self.duration) RA.get_all_metrics_scores() string = RA.print_metric_score() - print(repr(string)) self.assertEqual(string, '\n\n\tFor Accuracy score using {}, (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\n\tFor F1 score using average: micro, {} (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\nTest set confusion matrix : \n\n╒════════╤══════════╤══════════╤══════════╕\n│ │ class1 │ class2 │ class3 │\n╞════════╪══════════╪══════════╪══════════╡\n│ class1 │ 3 │ 1 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class2 │ 3 │ 2 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class3 │ 3 │ 8 │ 2 │\n╘════════╧══════════╧══════════╧══════════╛\n\n') def test_get_db_config_string(self): @@ -182,7 +181,6 @@ class Test_ResultAnalyzer(unittest.TestCase): self.labels, self.database_name, self.nb_cores, self.duration) str_analysis, img_analysis, metric_scores, class_metric_scores, conf_mat = RA.analyze() - print(repr(str_analysis)) self.assertEqual(str_analysis, 'test2Database configuration : \n\t- Database name : test_database\ntest\t- Learning Rate : 0.48\n\t- Labels used : class1, class2, class3\n\t- Number of cross validation folds : 5\n\nClassifier configuration : \n\t- FakeClassifier with test1 : 10, test2 : test\n\t- Executed on 0.5 core(s) \n\t- Got configuration using randomized search with 6 iterations \n\n\n\tFor Accuracy score using {}, (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\n\tFor F1 score using average: micro, {} (higher is better) : \n\t\t- Score on train : 0.25\n\t\t- Score on test : 0.2692307692307692\n\nTest set confusion matrix : \n\n╒════════╤══════════╤══════════╤══════════╕\n│ │ class1 │ class2 │ class3 │\n╞════════╪══════════╪══════════╪══════════╡\n│ class1 │ 3 │ 1 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class2 │ 3 │ 2 │ 2 │\n├────────┼──────────┼──────────┼──────────┤\n│ class3 │ 3 │ 8 │ 2 │\n╘════════╧══════════╧══════════╧══════════╛\n\n\n\n Classification took -1 day, 23:59:56\n\n Classifier Interpretation : \n') diff --git a/summit/tests/test_utils/test_hyper_parameter_search.py b/summit/tests/test_utils/test_hyper_parameter_search.py index e1b848c5fff55faceb549b83e5108712bd0747b1..5b635b077f9ab1a011c9f368c6854a31b65febcd 100644 --- a/summit/tests/test_utils/test_hyper_parameter_search.py +++ b/summit/tests/test_utils/test_hyper_parameter_search.py @@ -29,6 +29,9 @@ class FakeEstim(BaseEstimator): def predict(self, X): return np.zeros(X.shape[0]) + def gen_distribs(self): + return {"param1":"", "param2":""} + class FakeEstimMV(BaseEstimator): def __init__(self, param1=None, param2=None): @@ -45,6 +48,9 @@ class FakeEstimMV(BaseEstimator): else: return 
np.zeros(sample_indices.shape[0]) + def gen_distribs(self): + return {"param1":"", "param2":""} + class Test_Random(unittest.TestCase):
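
Supplementary illustration (not part of the patch above): with these changes, per-classifier search spaces can be declared in the YAML `hps_args` section. `get_random_hps_args` keeps the generic keys (`n_iter`, `equivalent_draws`) and re-exposes the entry matching the current classifier as `param_distributions`, which `Random.get_param_distribs` then translates (`Randint`/`Uniform` entries become `CustomRandint`/`CustomUniform`) and merges over the classifier's default `gen_distribs()`. A minimal sketch of that flow, assuming the summit package is importable and that `CustomRandint` keeps its `low`/`high`/`multiplier` keywords:

# Python equivalent of the YAML block added to config_test.yml:
#   hps_args:
#     n_iter: 4
#     equivalent_draws: False
#     decision_tree:
#       max_depth:
#         Randint: {low: 1, high: 10}
from summit.multiview_platform.exec_classif import get_random_hps_args
from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint

hps_args = {
    "n_iter": 4,
    "equivalent_draws": False,
    "decision_tree": {"max_depth": {"Randint": {"low": 1, "high": 10}}},
}

# Generic keys are kept as-is; the entry named after the classifier becomes
# "param_distributions" (entries for other classifiers are dropped).
print(get_random_hps_args(hps_args, "decision_tree"))

# A "Randint" entry is translated into a CustomRandint; the shared
# CustomDist.multiply helper also accepts "e", "e-" or a numeric factor.
depth = CustomRandint(low=1, high=10)
print(depth.rvs(random_state=42))                 # integer in [1, 10)
rate = CustomRandint(low=1, high=5, multiplier="e-")
print(rate.rvs(random_state=42))                  # 10**-k, with k in [1, 5)

The nested dict above is what the YAML parser produces from the `decision_tree:` block added to config_test.yml, so distributions declared in the config file end up overriding the classifier's built-in defaults during the random search.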