From e21ca1062e0c394948ea50c228605285adf8abf1 Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Mon, 11 Nov 2019 17:33:54 -0500 Subject: [PATCH] ID examples tracking on and working --- config_files/config_test.yml | 8 +- .../result_analysis.py | 67 +++++++++----- .../utils/dataset.py | 27 +++++- .../utils/get_multiview_db.py | 2 + .../tests/test_ResultAnalysis.py | 90 ++++++++++++++++++- 5 files changed, 166 insertions(+), 28 deletions(-) diff --git a/config_files/config_test.yml b/config_files/config_test.yml index 7f1aa89e..719b2e3c 100644 --- a/config_files/config_test.yml +++ b/config_files/config_test.yml @@ -1,15 +1,15 @@ # The base configuration of the benchmark Base : log: true - name: ["plausible"] + name: ["control_vs_malade"] label: "_" type: ".hdf5" - views: + views: ["300nm", "350nm"] pathf: "../data/" nice: 0 random_state: 42 nb_cores: 1 - full: False + full: True debug: True add_noise: False noise_std: 0.0 @@ -23,7 +23,7 @@ Classification: nb_class: 2 classes: type: ["monoview",] - algos_monoview: ["adaboost", "decision_tree"] + algos_monoview: ["decision_tree"] algos_multiview: ["all"] stats_iter: 2 metrics: ["accuracy_score", "f1_score"] diff --git a/multiview_platform/mono_multi_view_classifiers/result_analysis.py b/multiview_platform/mono_multi_view_classifiers/result_analysis.py index ac4e8a3b..f93d72a6 100644 --- a/multiview_platform/mono_multi_view_classifiers/result_analysis.py +++ b/multiview_platform/mono_multi_view_classifiers/result_analysis.py @@ -108,7 +108,7 @@ def plot_metric_scores(train_scores, test_scores, names, nb_results, metric_name autolabel(rect2, ax, set=2, std=train_STDs) ax.legend((rects[0], rect2[0]), ('Test', 'Train')) ax.set_ylim(-0.1, 1.1) - ax.set_xticks(np.arange(nb_results) + barWidth) + ax.set_xticks(np.arange(nb_results) + barWidth/2) ax.set_xticklabels(names, rotation="vertical") try: @@ -187,7 +187,7 @@ def plot_2d(data, classifiers_names, nbClassifiers, nbExamples, for i in range(data.shape[0]) ] fig = plotly.graph_objs.Figure(data=plotly.graph_objs.Heatmap( x=list(classifiers_names), - y=example_ids, + y=[_ for _ in example_ids], z=data, text=hover_text, hoverinfo=["y", "x", "text"], @@ -712,7 +712,7 @@ def publishMulticlassScores(multiclass_results, metrics, stats_iter, direcories, nbResults = classifiers_names.shape[0] fileName = directory + time.strftime( "%Y_%m_%d-%H_%M_%S") + "-" + databaseName + "-" + metric[ - 0] + ".png" + 0] plot_metric_scores(train_scores, validationScores, classifiers_names, nbResults, metric[0], fileName, tag=" multiclass") @@ -844,7 +844,7 @@ def publish_iter_biclass_metrics_scores(iter_results, directory, labels_dictiona return results -def gen_error_dat_glob(combi_results, stats_iter): +def gen_error_data_glob(combi_results, stats_iter): nb_examples = next(iter(combi_results.values())).shape[0] nb_classifiers = len(combi_results) data = np.zeros((nb_examples, nb_classifiers), dtype=int) @@ -870,8 +870,8 @@ def publish_iter_biclass_example_errors(iter_results, directory, "Start:\t Global biclass label analysis figure generation") nbExamples, nbClassifiers, data, \ - error_on_examples, classifier_names = gen_error_dat_glob(combi_results, - stats_iter) + error_on_examples, classifier_names = gen_error_data_glob(combi_results, + stats_iter) np.savetxt(base_file_name + "clf_errors.csv", data, delimiter=",") np.savetxt(base_file_name + "example_errors.csv", error_on_examples, @@ -914,7 +914,7 @@ def publish_iter_multiclass_example_errors(iter_multiclass_results, directory, 
"Start:\t Global multiclass label analysis figures generation") base_file_name = directory + time.strftime("%Y_%m_%d-%H_%M_%S") + "-" - nb_examples, nb_classifiers, data, error_on_examples = gen_error_dat_glob( + nb_examples, nb_classifiers, data, error_on_examples = gen_error_data_glob( iter_multiclass_results, stats_iter, base_file_name) plot_2d(data, classifiers_names, nb_classifiers, nb_examples, 1, @@ -958,27 +958,47 @@ def add_new_metric(iter_biclass_results, metric, labels_combination, nb_classifi return iter_biclass_results -def analyzebiclass_iter(biclass_results, metrics, stats_iter, directory, - labels_dictionary, data_base_name, nb_examples, example_ids): - """Used to format the results in order to plot the mean results on the iterations""" - classifiers_dict = gen_classifiers_dict(biclass_results, - metrics) - metrics_analysis = dict((key,{}) for key in biclass_results.keys()) - error_analysis = dict((key,{}) for key in biclass_results.keys()) +def format_previous_results(biclass_results): + """ + Formats each statistical iteration's result into a mean/std analysis for + the metrics and adds the errors of each statistical iteration. + + Parameters + ---------- + biclass_results : The raw results, for each statistical iteration i contains + - biclass_results[i]["metrics_scores"] is a dictionary with a pd.dataframe + for each metrics + - biclass_results[i]["example_errors"], a dicaitonary with a np.array + for each classifier. + + Returns + ------- + metrics_analysis : The mean and std dataframes for each metrics + + error_analysis : A dictionary containing the added errors + arrays for each classifier + + """ + metrics_analysis = dict((key, {}) for key in biclass_results.keys()) + error_analysis = dict((key, {}) for key in biclass_results.keys()) for label_combination, biclass_result in biclass_results.items(): concat_dict = {} - for iter_index, metrics_score in enumerate(biclass_result["metrics_scores"]): + for iter_index, metrics_score in enumerate( + biclass_result["metrics_scores"]): for metric_name, dataframe in metrics_score.items(): if metric_name not in concat_dict: concat_dict[metric_name] = dataframe else: - concat_dict[metric_name] = pd.concat([concat_dict[metric_name], dataframe]) + concat_dict[metric_name] = pd.concat( + [concat_dict[metric_name], dataframe]) for metric_name, dataframe in concat_dict.items(): metrics_analysis[label_combination][metric_name] = {} - metrics_analysis[label_combination][metric_name]["mean"] = dataframe.groupby(dataframe.index).mean() - metrics_analysis[label_combination][metric_name]["std"] = dataframe.groupby(dataframe.index).std() + metrics_analysis[label_combination][metric_name][ + "mean"] = dataframe.groupby(dataframe.index).mean() + metrics_analysis[label_combination][metric_name][ + "std"] = dataframe.groupby(dataframe.index).std(ddof=0) added_example_errors = {} for example_errors in biclass_result["example_errors"]: @@ -988,6 +1008,13 @@ def analyzebiclass_iter(biclass_results, metrics, stats_iter, directory, else: added_example_errors[classifier_name] += errors error_analysis[label_combination] = added_example_errors + return metrics_analysis, error_analysis + + +def analyzebiclass_iter(biclass_results, stats_iter, directory, + labels_dictionary, data_base_name, example_ids): + """Used to format the results in order to plot the mean results on the iterations""" + metrics_analysis, error_analysis = format_previous_results(biclass_results) results = publish_iter_biclass_metrics_scores(metrics_analysis, directory, 
labels_dictionary, @@ -1058,8 +1085,8 @@ def get_results(results, stats_iter, nb_multiclass, benchmark_argument_dictionar directories, example_ids) if stats_iter > 1: results_means_std = analyzebiclass_iter( - biclass_results, metrics, stats_iter, directory, - labels_dictionary, data_base_name, nb_examples, example_ids) + biclass_results, stats_iter, directory, + labels_dictionary, data_base_name, example_ids) if nb_multiclass > 1: results_means_std = analyze_iter_multiclass(multiclass_results, directory, stats_iter, metrics, data_base_name, nb_examples, example_ids) diff --git a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py index 1c3e5961..85666b66 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/dataset.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/dataset.py @@ -106,9 +106,12 @@ class Dataset(): dataset_file.close() self.update_hdf5_dataset(os.path.join(path, file_name)) if example_ids is not None: + example_ids = [example_id if not is_just_number(example_id) + else "ID_"+example_id for example_id in example_ids] self.example_ids = example_ids else: - self.example_ids = [str(i) for i in range(labels.shape[0])] + self.example_ids = ["ID_"+str(i) + for i in range(labels.shape[0])] def rm(self): """ @@ -151,8 +154,11 @@ class Dataset(): """ self.nb_view = self.dataset["Metadata"].attrs["nbView"] self.view_dict = self.get_view_dict() - if "example_ids" in self.dataset["Metadata"].keys(): - self.example_ids = self.dataset["Metadata"]["example_ids"] + if "example_ids" in self.dataset["Metadata"].keys(): + self.example_ids = [example_id.decode() + if not is_just_number(example_id.decode()) + else "ID_"+example_id.decode() + for example_id in self.dataset["Metadata"]["example_ids"]] else: self.example_ids = [str(i) for i in range(self.dataset["Labels"].shape[0])] @@ -284,6 +290,14 @@ class Dataset(): dataset_file_path = os.path.join(path,self.get_name()+"_temp_filter.hdf5") new_dataset_file = h5py.File(dataset_file_path,"w") self.dataset.copy("Metadata", new_dataset_file) + if "example_ids" in self.dataset["Metadata"].keys(): + ex_ids = new_dataset_file["Metadata"]["example_ids"] + ex_ids = np.array([self.example_ids[example_indices]]).astype(np.dtype("S10")) + else: + new_dataset_file["Metadata"].create_dataset("example_ids", + (len(self.example_ids), ), + data=np.array(self.example_ids).astype(np.dtype("S10")), + dtype=np.dtype("S10")) new_dataset_file["Metadata"].attrs["datasetLength"] = len(example_indices) new_dataset_file["Metadata"].attrs["nbClass"] = np.unique(labels) new_dataset_file.create_dataset("Labels", data=labels) @@ -433,6 +447,13 @@ class Dataset(): return selected_label_names +def is_just_number(string): + try: + float(string) + return True + except ValueError: + return False + def datasets_already_exist(pathF, name, nbCores): """Used to check if it's necessary to copy datasets""" allDatasetExist = True diff --git a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py index 4ba4e24b..11e7bd3b 100644 --- a/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py +++ b/multiview_platform/mono_multi_view_classifiers/utils/get_multiview_db.py @@ -63,6 +63,8 @@ def get_plausible_db_hdf5(features, path, file_name, nb_class=3, fake_zero_indices = random_state.randint(int(nb_examples / 2), nb_examples, int(nb_examples / 12)) + for index in 
np.concatenate((fake_one_indices, fake_zero_indices)): + example_ids[index]+="noised" view_data[fake_one_indices] = np.ones( (len(fake_one_indices), nb_features)) diff --git a/multiview_platform/tests/test_ResultAnalysis.py b/multiview_platform/tests/test_ResultAnalysis.py index 04531a20..bcf63fc7 100644 --- a/multiview_platform/tests/test_ResultAnalysis.py +++ b/multiview_platform/tests/test_ResultAnalysis.py @@ -176,4 +176,92 @@ class Test_gen_error_data(unittest.TestCase): self.assertEqual(nb_examples, 7) self.assertEqual(classifiers_names, ["ada-1", "mv"]) np.testing.assert_array_equal(data_2d, np.array([ada_data, mv_data]).transpose()) - np.testing.assert_array_equal(error_on_examples, -1*(ada_data+mv_data)/nb_classifiers) \ No newline at end of file + np.testing.assert_array_equal(error_on_examples, -1*(ada_data+mv_data)/nb_classifiers) + + +class Test_format_previous_results(unittest.TestCase): + + def test_simple(self): + biclass_results = {"01":{"metrics_scores":[], "example_errors":[]}} + random_state = np.random.RandomState(42) + + # Gen metrics data + metrics_1_data = random_state.uniform(size=(2,2)) + metrics_2_data = random_state.uniform(size=(2,2)) + metric_1_df = pd.DataFrame(data=metrics_1_data, index=["train", "test"], + columns=["ada-1", "mv"]) + metric_2_df = pd.DataFrame(data=metrics_2_data, index=["train", "test"], + columns=["ada-1", "mv"]) + biclass_results["01"]["metrics_scores"].append({"acc": metric_1_df}) + biclass_results["01"]["metrics_scores"].append({"acc": metric_2_df}) + + # Gen error data + ada_error_data_1 = random_state.randint(0,2,7) + ada_error_data_2 = random_state.randint(0, 2, 7) + ada_sum = ada_error_data_1+ada_error_data_2 + mv_error_data_1 = random_state.randint(0, 2, 7) + mv_error_data_2 = random_state.randint(0, 2, 7) + mv_sum = mv_error_data_1+mv_error_data_2 + biclass_results["01"]["example_errors"].append({}) + biclass_results["01"]["example_errors"].append({}) + biclass_results["01"]["example_errors"][0]["ada-1"] = ada_error_data_1 + biclass_results["01"]["example_errors"][0]["mv"] = mv_error_data_1 + biclass_results["01"]["example_errors"][1]["ada-1"] = ada_error_data_2 + biclass_results["01"]["example_errors"][1]["mv"] = mv_error_data_2 + + # Running the function + metric_analysis, error_analysis = result_analysis.format_previous_results(biclass_results) + mean_df = pd.DataFrame(data=np.mean(np.array([metrics_1_data, + metrics_2_data]), + axis=0), + index=["train", "test"], + columns=["ada-1", "mvm"]) + std_df = pd.DataFrame(data=np.std(np.array([metrics_1_data, + metrics_2_data]), + axis=0), + index=["train", "test"], + columns=["ada-1", "mvm"]) + + # Testing + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["train"], + mean_df.loc["train"]) + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["mean"].loc["test"], + mean_df.loc["test"]) + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["train"], + std_df.loc["train"]) + np.testing.assert_array_equal(metric_analysis["01"]["acc"]["std"].loc["test"], + std_df.loc["test"]) + np.testing.assert_array_equal(ada_sum, error_analysis["01"]["ada-1"]) + np.testing.assert_array_equal(mv_sum, error_analysis["01"]["mv"]) + + +class Test_gen_error_data_glob(unittest.TestCase): + + def test_simple(self): + random_state = np.random.RandomState(42) + + ada_error_data_1 = random_state.randint(0,2,7) + ada_error_data_2 = random_state.randint(0, 2, 7) + ada_sum = ada_error_data_1+ada_error_data_2 + mv_error_data_1 = random_state.randint(0, 2, 7) + 
mv_error_data_2 = random_state.randint(0, 2, 7) + mv_sum = mv_error_data_1+mv_error_data_2 + + combi_results = {"ada-1":ada_sum, "mv": mv_sum} + + stats_iter = 2 + + nb_examples, nb_classifiers, \ + data, error_on_examples, \ + classifier_names = result_analysis.gen_error_data_glob(combi_results, + stats_iter) + self.assertEqual(nb_examples, 7) + self.assertEqual(nb_classifiers, 2) + np.testing.assert_array_equal(data, np.array([ada_sum, mv_sum]).transpose()) + np.testing.assert_array_equal(error_on_examples, -1*np.sum(np.array([ada_sum, mv_sum]), axis=0)+(nb_classifiers*stats_iter)) + self.assertEqual(classifier_names, ["ada-1", "mv"]) + + + + + -- GitLab
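
Note on the new format_previous_results() in result_analysis.py: it concatenates the per-iteration metric dataframes and then groups by the dataframe index to get the mean and the population standard deviation (ddof=0, which matches np.std in the test above). Below is a minimal standalone sketch of that aggregation pattern with made-up scores; only the concat/groupby/std(ddof=0) calls are taken from the patch.

# Sketch of the concat + groupby aggregation used by format_previous_results.
# The score values below are invented for illustration only.
import pandas as pd

iteration_scores = [
    pd.DataFrame([[0.90, 0.80], [0.70, 0.60]],
                 index=["train", "test"], columns=["ada-1", "mv"]),
    pd.DataFrame([[0.80, 0.90], [0.60, 0.70]],
                 index=["train", "test"], columns=["ada-1", "mv"]),
]
# Stack the per-iteration dataframes, then average/deviate over identical index labels.
stacked = pd.concat(iteration_scores)
mean_scores = stacked.groupby(stacked.index).mean()
std_scores = stacked.groupby(stacked.index).std(ddof=0)  # population std, like np.std
print(mean_scores)
print(std_scores)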
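
Note on the example-ID handling added to utils/dataset.py: purely numeric identifiers are prefixed with "ID_" (via is_just_number) and byte strings read back from HDF5 are decoded. The sketch below exercises that rule in isolation; the wrapper normalize_example_ids is an illustrative helper and is not part of the patch.

# Sketch of the ID normalisation rule introduced in utils/dataset.py.
# is_just_number mirrors the patch; normalize_example_ids is a hypothetical helper.
def is_just_number(string):
    try:
        float(string)
        return True
    except ValueError:
        return False


def normalize_example_ids(example_ids):
    """Decode HDF5 byte strings and prefix purely numeric IDs with "ID_"."""
    normalized = []
    for example_id in example_ids:
        if isinstance(example_id, bytes):
            example_id = example_id.decode()
        if is_just_number(example_id):
            example_id = "ID_" + example_id
        normalized.append(example_id)
    return normalized


print(normalize_example_ids([b"42", "350nm", "7.5"]))
# ['ID_42', '350nm', 'ID_7.5']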
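
Note on the filtered-dataset copy in utils/dataset.py: example IDs are kept in the "Metadata" group as fixed-width byte strings (dtype S10). The sketch below shows one way sliced IDs could be written to and read back from such a group; the file name, the explicit write-back of the sliced array, and the indexing are assumptions for illustration, not a description of what the patch itself does.

# Sketch: persisting a filtered list of example IDs as S10 byte strings in an
# HDF5 "Metadata" group, then reading them back.  The file name and the
# explicit write-back step are illustrative assumptions.
import h5py
import numpy as np

example_ids = ["ID_0", "ID_1", "patient_a", "patient_b"]
example_indices = np.array([1, 3])

with h5py.File("temp_filter_sketch.hdf5", "w") as new_dataset_file:
    metadata = new_dataset_file.create_group("Metadata")
    # Slice the in-memory IDs and store them as fixed-width byte strings.
    sliced_ids = np.array([example_ids[i] for i in example_indices],
                          dtype=np.dtype("S10"))
    metadata.create_dataset("example_ids", data=sliced_ids,
                            dtype=np.dtype("S10"))

with h5py.File("temp_filter_sketch.hdf5", "r") as dataset_file:
    restored = [eid.decode()
                for eid in dataset_file["Metadata"]["example_ids"][()]]
print(restored)  # ['ID_1', 'patient_b']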