Last changes in compute_results. TODO: clean the code

eb638666 · Charly Lamothe · 95f543a1 · eb638666 · eb638666 · eb638666
Commit eb638666 authored 5 years ago by Charly Lamothe
--- a/code/bolsonaro/visualization/plotter.py
+++ b/code/bolsonaro/visualization/plotter.py
@@ -51,6 +51,7 @@ class Plotter(object):
    @staticmethod
    def plot_mean_and_CI(ax, mean, lb, ub, x_value, color_mean=None, facecolor=None, label=None):
+        #print(x_value, mean, lb, ub)
        # plot the shaded range of the confidence intervals
        ax.fill_between(x_value, ub, lb, facecolor=facecolor, alpha=.5)
        # plot the mean on top
@@ -105,7 +106,7 @@ class Plotter(object):
    @staticmethod
    def plot_stage2_losses(file_path, all_experiment_scores, x_value,
-        xlabel, ylabel, all_labels, title):
+        xlabel, ylabel, all_labels, title, filter_num=-1):
        fig, ax = plt.subplots()
@@ -124,13 +125,14 @@ class Plotter(object):
            # Compute the mean and the std for the CI
            mean_experiment_scores = np.average(experiment_scores, axis=0)
            std_experiment_scores = np.std(experiment_scores, axis=0)
            # Plot the score curve with the CI
            Plotter.plot_mean_and_CI(
                ax=ax,
                mean=mean_experiment_scores,
                lb=mean_experiment_scores + std_experiment_scores,
                ub=mean_experiment_scores - std_experiment_scores,
-                x_value=x_value,
+                x_value=x_value[:filter_num] if len(mean_experiment_scores) == filter_num else x_value,
                color_mean=colors[i],
                facecolor=colors[i],
                label=all_labels[i]

--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -150,6 +150,35 @@ def extract_weights_across_seeds(models_dir, results_dir, experiment_id):
    return experiment_weights
+def extract_correlations_across_seeds(models_dir, results_dir, experiment_id):
+    """Collect the stored correlation of every extracted forest size, per seed.
+
+    Walks ``{models_dir}/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{size}``
+    and loads a ``ModelRawResults`` from each ``{size}`` directory.
+
+    :param models_dir: root directory that contains one sub-directory per experiment id.
+    :param results_dir: not used by this function; kept so the signature stays
+        parallel to the other ``extract_*_across_seeds`` helpers.
+    :param experiment_id: identifier of the experiment (converted with ``str``).
+    :return: dict mapping each seed directory name to the list of
+        ``model_raw_results.correlation`` values, ordered by increasing forest size.
+    :raises ValueError: if a seed directory name, or a kept forest size directory
+        name, cannot be parsed by ``int`` (both listings are sorted with ``key=int``).
+    """
+    # models/{experiment_id}/seeds
+    experiment_seed_root_path = os.path.join(models_dir, str(experiment_id), 'seeds')
+    experiment_correlations = dict()
+    # Sort numerically (key=int) so that e.g. '10' comes after '2'.
+    for seed in sorted(os.listdir(experiment_seed_root_path), key=int):
+        # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
+        extracted_forest_sizes_root_path = os.path.join(
+            experiment_seed_root_path, seed, 'extracted_forest_sizes')
+        # Drop the entries whose name contains 'no_weights': they are not plain
+        # integers and would break the numeric sort below.
+        extracted_forest_sizes = sorted(
+            (nb_tree for nb_tree in os.listdir(extracted_forest_sizes_root_path)
+                if 'no_weights' not in nb_tree),
+            key=int)
+        # Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
+        # and keep only the correlation of each loaded result.
+        experiment_correlations[seed] = [
+            ModelRawResults.load(
+                os.path.join(extracted_forest_sizes_root_path, extracted_forest_size)).correlation
+            for extracted_forest_size in extracted_forest_sizes
+        ]
+    return experiment_correlations
 def extract_coherences_across_seeds(models_dir, results_dir, experiment_id):
    experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
    experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
@@ -179,7 +208,6 @@ def extract_coherences_across_seeds(models_dir, results_dir, experiment_id):
    return experiment_coherences
 if __name__ == "__main__":
    # get environment variables in .env
    load_dotenv(find_dotenv('.env'))
@@ -203,6 +231,7 @@ if __name__ == "__main__":
    parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.')
    parser.add_argument('--wo_loss_plots', action='store_true', default=DEFAULT_WO_LOSS_PLOTS, help='Do not compute the loss plots.')
    parser.add_argument('--plot_preds_coherence', action='store_true', default=DEFAULT_PLOT_PREDS_COHERENCE, help='Plot the coherence of the prediction trees.')
+    parser.add_argument('--plot_preds_correlation', action='store_true', default=DEFAULT_PLOT_PREDS_COHERENCE, help='Plot the correlation of the prediction trees.')
    args = parser.parse_args()
    if args.stage not in list(range(1, 6)):
@@ -501,8 +530,20 @@ if __name__ == "__main__":
            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
                args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False, extracted_forest_sizes=extracted_forest_sizes)
+        """print(omp_with_params_dev_scores)
+        import sys
+        sys.exit(0)"""
+        #all_labels = ['base', 'random', 'omp', 'omp_wo_weights']
        all_labels = ['base', 'random', 'omp']
-        all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores]
+        omp_with_params_test_scores_new = dict()
+        filter_num = -1
+        """filter_num = 9
+        for key, value in omp_with_params_test_scores.items():
+            omp_with_params_test_scores_new[key] = value[:filter_num]"""
+        #all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+        #    omp_with_params_without_weights_test_scores]
+        all_scores = [base_with_params_dev_scores, random_with_params_dev_scores, omp_with_params_dev_scores]
        #all_scores = [base_with_params_train_scores, random_with_params_train_scores, omp_with_params_train_scores,
        #    omp_with_params_without_weights_train_scores]
@@ -515,29 +556,32 @@ if __name__ == "__main__":
                label = 'similarity_predictions'
            elif 'ensemble' in args.experiment_ids[i]:
                label = 'ensemble'
+            elif 'omp_distillation' in args.experiment_ids[i]:
+                label = 'omp_distillation'
            else:
                logger.error('Invalid value encountered')
                continue
            logger.info(f'Loading {label} experiment scores...')
            current_experiment_id = int(args.experiment_ids[i].split('=')[1])
-            current_train_scores, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
+            current_train_scores, current_dev_scores, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
                args.models_dir, args.results_dir, current_experiment_id)
            all_labels.append(label)
-            all_scores.append(current_test_scores)
+            #all_scores.append(current_test_scores)
            #all_scores.append(current_train_scores)
+            all_scores.append(current_dev_scores)
-        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_new')
        pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
        Plotter.plot_stage2_losses(
-            file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_test_train,dev.png",
+            file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_dev_clean.png",
            all_experiment_scores=all_scores,
            all_labels=all_labels,
            x_value=with_params_extracted_forest_sizes,
            xlabel='Number of trees extracted',
            ylabel=base_with_params_experiment_score_metric,
-            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
+            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name), filter_num=filter_num)
    """if args.plot_weight_density:
        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage{args.stage}')
@@ -581,14 +625,14 @@ if __name__ == "__main__":
        experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id)
        Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))
    if args.plot_preds_coherence:
-        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5')
+        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_new')
-        all_labels = ['random', 'omp', 'omp_normalize_D']
+        pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
-        random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
+        all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble']
-            with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
+        _, _, _, with_params_extracted_forest_sizes, _ = \
            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
-        coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 4]]
+        coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]]
        Plotter.plot_stage2_losses(
-            file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}_30_all.png",
+            file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}.png",
            all_experiment_scores=coherence_values,
            all_labels=all_labels,
            x_value=with_params_extracted_forest_sizes,
@@ -596,5 +640,21 @@ if __name__ == "__main__":
            ylabel='Coherence',
            title='Coherence values of {}'.format(args.dataset_name))
        logger.info(f'Computing preds coherence plot...')
+    if args.plot_preds_correlation:
+        root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_new')
+        pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
+        all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble']
+        _, _, _, with_params_extracted_forest_sizes, _ = \
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
+        correlation_values = [extract_correlations_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]]
+        Plotter.plot_stage2_losses(
+            file_path=root_output_path + os.sep + f"correlations_{'-'.join(all_labels)}.png",
+            all_experiment_scores=correlation_values,
+            all_labels=all_labels,
+            x_value=with_params_extracted_forest_sizes,
+            xlabel='Number of trees extracted',
+            ylabel='correlation',
+            title='correlation values of {}'.format(args.dataset_name))
+        logger.info(f'Computing preds correlation plot...')
    logger.info('Done.')
--- a/code/prepare_models.py
+++ b/code/prepare_models.py
+import pathlib
+import glob2
+import os
+import shutil
+from tqdm import tqdm
+if __name__ == "__main__":
+    # Copy every model_raw_results.pickle found under
+    # {models_source_path}/{dataset}/stage5_new into
+    # {models_destination_path}/{dataset}, keeping the same relative sub-directories.
+    models_source_path = 'models'
+    models_destination_path = 'bolsonaro_models_25-03-20'
+    datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
+        'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
+    pathlib.Path(models_destination_path).mkdir(parents=True, exist_ok=True)
+    with tqdm(datasets) as dataset_bar:
+        for dataset in dataset_bar:
+            dataset_bar.set_description(dataset)
+            # Directory the pickles of this dataset are searched in.
+            dataset_source_path = os.path.join(models_source_path, dataset, 'stage5_new')
+            found_paths = glob2.glob(os.path.join(dataset_source_path,
+                '**', 'model_raw_results.pickle'), recursive=True)
+            pathlib.Path(os.path.join(models_destination_path, dataset)).mkdir(parents=True, exist_ok=True)
+            with tqdm(found_paths) as found_paths_bar:
+                for path in found_paths_bar:
+                    found_paths_bar.set_description(path)
+                    # Location of the pickle relative to the dataset source directory.
+                    # relpath is used instead of stripping a hard-coded 'models/.../'
+                    # prefix so that it follows models_source_path and the OS separator.
+                    relative_path = os.path.relpath(path, dataset_source_path)
+                    (relative_dir, filename) = os.path.split(relative_path)
+                    new_path = os.path.join(models_destination_path, dataset, relative_dir)
+                    pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
+                    shutil.copyfile(src=path, dst=os.path.join(new_path, filename))
+                    # No manual update(1) here or on dataset_bar: iterating over a tqdm
+                    # object already advances its bar, so an extra update would count
+                    # every item twice.