From eb638666de10583ce5251bd9a9d2ff9127fa2a8a Mon Sep 17 00:00:00 2001
From: Charly Lamothe <charly.lamothe@univ-amu.fr>
Date: Wed, 25 Mar 2020 20:33:01 +0100
Subject: [PATCH] Latest changes to compute_results. TODO: clean up the code

---
 code/bolsonaro/visualization/plotter.py |  6 +-
 code/compute_results.py                 | 86 +++++++++++++++++++++----
 code/prepare_models.py                  | 31 +++++++++
 3 files changed, 108 insertions(+), 15 deletions(-)
 create mode 100644 code/prepare_models.py

diff --git a/code/bolsonaro/visualization/plotter.py b/code/bolsonaro/visualization/plotter.py
index 5a5f72a..7d3154e 100644
--- a/code/bolsonaro/visualization/plotter.py
+++ b/code/bolsonaro/visualization/plotter.py
@@ -51,6 +51,7 @@ class Plotter(object):
 
     @staticmethod
     def plot_mean_and_CI(ax, mean, lb, ub, x_value, color_mean=None, facecolor=None, label=None):
         # plot the shaded range of the confidence intervals
         ax.fill_between(x_value, ub, lb, facecolor=facecolor, alpha=.5)
         # plot the mean on top
@@ -105,7 +106,7 @@ class Plotter(object):
 
     @staticmethod
     def plot_stage2_losses(file_path, all_experiment_scores, x_value,
-        xlabel, ylabel, all_labels, title):
+        xlabel, ylabel, all_labels, title, filter_num=-1):
 
         fig, ax = plt.subplots()
 
@@ -124,13 +125,14 @@ class Plotter(object):
             # Compute the mean and the std for the CI
             mean_experiment_scores = np.average(experiment_scores, axis=0)
             std_experiment_scores = np.std(experiment_scores, axis=0)
+
             # Plot the score curve with the CI
             Plotter.plot_mean_and_CI(
                 ax=ax,
                 mean=mean_experiment_scores,
                 lb=mean_experiment_scores - std_experiment_scores,
                 ub=mean_experiment_scores + std_experiment_scores,
-                x_value=x_value,
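+                # If the scores were cut to the first filter_num points, truncate the x axis to match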
+                x_value=x_value[:filter_num] if len(mean_experiment_scores) == filter_num else x_value,
                 color_mean=colors[i],
                 facecolor=colors[i],
                 label=all_labels[i]
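For reference, plot_mean_and_CI draws the mean curve on top of a shaded +/-1 std band, and plot_stage2_losses now cuts the x axis down to filter_num points when a score vector was truncated. Below is a minimal standalone sketch of the same fill_between pattern; the data and file name are synthetic, illustrative values, not taken from the repository:

import matplotlib.pyplot as plt
import numpy as np

# Synthetic scores: 5 seeds x 10 extracted forest sizes (made-up data)
rng = np.random.default_rng(0)
scores = rng.normal(loc=1.0, scale=0.1, size=(5, 10)) + np.linspace(0.5, 0.0, 10)
x_value = np.arange(1, 11)

mean_scores = np.average(scores, axis=0)
std_scores = np.std(scores, axis=0)

fig, ax = plt.subplots()
# Plot the shaded range of the confidence interval
ax.fill_between(x_value, mean_scores + std_scores, mean_scores - std_scores, alpha=.5)
# Plot the mean curve on top
ax.plot(x_value, mean_scores, label='omp')
ax.legend()
fig.savefig('ci_band_sketch.png')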
diff --git a/code/compute_results.py b/code/compute_results.py
index 28b08ac..0d38509 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -150,6 +150,35 @@ def extract_weights_across_seeds(models_dir, results_dir, experiment_id):
 
     return experiment_weights
 
+def extract_correlations_across_seeds(models_dir, results_dir, experiment_id):
+    experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
+    experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
+    experiment_correlations = dict()
+
+    # For each seed results stored in models/{experiment_id}/seeds
+    seeds = os.listdir(experiment_seed_root_path)
+    seeds.sort(key=int)
+    for seed in seeds:
+        experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
+        extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
+
+        # Initialize this seed's entry: {seed: []}
+        experiment_correlations[seed] = list()
+
+        # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
+        extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
+        extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if 'no_weights' not in nb_tree]
+        extracted_forest_sizes.sort(key=int)
+        for extracted_forest_size in extracted_forest_sizes:
+            # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
+            extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
+            # Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
+            model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+            # Save the correlation
+            experiment_correlations[seed].append(model_raw_results.correlation)
+
+    return experiment_correlations
+
 def extract_coherences_across_seeds(models_dir, results_dir, experiment_id):
     experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
     experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
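Each extract_*_across_seeds helper returns a dict mapping a seed (as a string) to one value per extracted forest size, in increasing size order. Downstream, plot_stage2_losses averages across seeds with np.average(..., axis=0), which presumes the per-seed lists are stacked into a (n_seeds, n_sizes) array. A minimal sketch of that aggregation, with made-up correlation values:

import numpy as np

# Hypothetical output of extract_correlations_across_seeds: two seeds,
# three extracted forest sizes each (the values are made up)
experiment_correlations = {'1': [0.91, 0.88, 0.85], '2': [0.93, 0.87, 0.84]}

# Stack the per-seed lists into a (n_seeds, n_sizes) array, then
# compute the statistics used for the confidence-interval band
scores = np.array([experiment_correlations[seed] for seed in sorted(experiment_correlations, key=int)])
mean_scores = np.average(scores, axis=0)
std_scores = np.std(scores, axis=0)
print(mean_scores, std_scores)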
@@ -179,7 +208,6 @@ def extract_coherences_across_seeds(models_dir, results_dir, experiment_id):
 
     return experiment_coherences
 
-
 if __name__ == "__main__":
     # get environment variables in .env
     load_dotenv(find_dotenv('.env'))
@@ -203,6 +231,7 @@ if __name__ == "__main__":
     parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.')
     parser.add_argument('--wo_loss_plots', action='store_true', default=DEFAULT_WO_LOSS_PLOTS, help='Do not compute the loss plots.')
     parser.add_argument('--plot_preds_coherence', action='store_true', default=DEFAULT_PLOT_PREDS_COHERENCE, help='Plot the coherence of the prediction trees.')
+    parser.add_argument('--plot_preds_correlation', action='store_true', default=False, help='Plot the correlation of the prediction trees.')
     args = parser.parse_args()
 
     if args.stage not in list(range(1, 6)):
@@ -501,8 +530,20 @@ if __name__ == "__main__":
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
                 args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False, extracted_forest_sizes=extracted_forest_sizes)
 
         all_labels = ['base', 'random', 'omp']
-        all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores]
+        # Optionally truncate every curve to its first filter_num points (-1 keeps everything)
+        filter_num = -1
+        all_scores = [base_with_params_dev_scores, random_with_params_dev_scores, omp_with_params_dev_scores]
         #all_scores = [base_with_params_train_scores, random_with_params_train_scores, omp_with_params_train_scores,
         #    omp_with_params_without_weights_train_scores]
 
@@ -515,29 +556,32 @@ if __name__ == "__main__":
                 label = 'similarity_predictions'
             elif 'ensemble' in args.experiment_ids[i]:
                 label = 'ensemble'
+            elif 'omp_distillation' in args.experiment_ids[i]:
+                label = 'omp_distillation'
             else:
                 logger.error('Invalid value encountered')
                 continue
 
             logger.info(f'Loading {label} experiment scores...')
             current_experiment_id = int(args.experiment_ids[i].split('=')[1])
-            current_train_scores, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
+            current_train_scores, current_dev_scores, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
                 args.models_dir, args.results_dir, current_experiment_id)
             all_labels.append(label)
-            all_scores.append(current_test_scores)
             #all_scores.append(current_train_scores)
+            all_scores.append(current_dev_scores)
 
-        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_new')
         pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
 
         Plotter.plot_stage2_losses(
-            file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_test_train,dev.png",
+            file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_dev_clean.png",
             all_experiment_scores=all_scores,
             all_labels=all_labels,
             x_value=with_params_extracted_forest_sizes,
             xlabel='Number of trees extracted',
             ylabel=base_with_params_experiment_score_metric,
-            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
+            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name),
+            filter_num=filter_num)
 
     """if args.plot_weight_density:
         root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage{args.stage}')
@@ -581,14 +625,14 @@ if __name__ == "__main__":
         experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id)
         Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))
     if args.plot_preds_coherence:
-        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5')
-        all_labels = ['random', 'omp', 'omp_normalize_D']
-        random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
-            with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
+        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_new')
+        pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
+        all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble']
+        _, _, _, with_params_extracted_forest_sizes, _ = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
-        coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 4]]
+        coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]]
         Plotter.plot_stage2_losses(
-            file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}_30_all.png",
+            file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}.png",
             all_experiment_scores=coherence_values,
             all_labels=all_labels,
             x_value=with_params_extracted_forest_sizes,
@@ -596,5 +640,21 @@ if __name__ == "__main__":
             ylabel='Coherence',
             title='Coherence values of {}'.format(args.dataset_name))
         logger.info(f'Computing preds coherence plot...')
+    if args.plot_preds_correlation:
+        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_new')
+        pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
+        all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble']
+        _, _, _, with_params_extracted_forest_sizes, _ = \
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
+        correlation_values = [extract_correlations_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]]
+        Plotter.plot_stage2_losses(
+            file_path=root_output_path + os.sep + f"correlations_{'-'.join(all_labels)}.png",
+            all_experiment_scores=correlation_values,
+            all_labels=all_labels,
+            x_value=with_params_extracted_forest_sizes,
+            xlabel='Number of trees extracted',
+            ylabel='Correlation',
+            title='Correlation values of {}'.format(args.dataset_name))
+        logger.info('Computing preds correlation plot...')
 
     logger.info('Done.')
diff --git a/code/prepare_models.py b/code/prepare_models.py
new file mode 100644
index 0000000..04a2ec3
--- /dev/null
+++ b/code/prepare_models.py
@@ -0,0 +1,31 @@
+import pathlib
+import glob
+import os
+import shutil
+from tqdm import tqdm
+
+
+if __name__ == "__main__":
+    models_source_path = 'models'
+    models_destination_path = 'bolsonaro_models_25-03-20'
+    datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
+        'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
+
+    pathlib.Path(models_destination_path).mkdir(parents=True, exist_ok=True)
+
+    with tqdm(datasets) as dataset_bar:
+        for dataset in dataset_bar:
+            dataset_bar.set_description(dataset)
+            found_paths = glob.glob(os.path.join(models_source_path, dataset, 'stage5_new',
+                '**', 'model_raw_results.pickle'), recursive=True)
+            pathlib.Path(os.path.join(models_destination_path, dataset)).mkdir(parents=True, exist_ok=True)
+            with tqdm(found_paths) as found_paths_bar:
+                for path in found_paths_bar:
+                    found_paths_bar.set_description(path)
+                    new_path = path.replace(os.path.join(models_source_path, dataset, 'stage5_new') + os.sep, '')
+                    (new_path, filename) = os.path.split(new_path)
+                    new_path = os.path.join(models_destination_path, dataset, new_path)
+                    pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
+                    shutil.copyfile(src=path, dst=os.path.join(new_path, filename))
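To make the copy step above concrete: a raw-results file found under models/{dataset}/stage5_new/ keeps its relative layout but is rehomed under the destination root. A small sketch of the path rewrite; the experiment id, seed, and forest size below are made-up example values:

import os

src = os.path.join('models', 'boston', 'stage5_new',
    '2', 'seeds', '1', 'extracted_forest_sizes', '10', 'model_raw_results.pickle')
rest = src.replace(os.path.join('models', 'boston', 'stage5_new') + os.sep, '')
# -> bolsonaro_models_25-03-20/boston/2/seeds/1/extracted_forest_sizes/10/model_raw_results.pickle
print(os.path.join('bolsonaro_models_25-03-20', 'boston', rest))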
-- 
GitLab