def extract_correlations_across_seeds(models_dir, results_dir, experiment_id):
    """Collect the stored correlation of every extracted forest, grouped by seed.

    The function walks the layout
    ``{models_dir}/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{size}``
    and loads the ``ModelRawResults`` saved in each ``{size}`` directory.

    :param models_dir: root directory holding one sub-directory per experiment id.
    :param results_dir: unused here; accepted so the signature stays parallel to
        ``extract_coherences_across_seeds`` and existing callers keep working.
    :param experiment_id: experiment identifier; converted with ``str`` to build the path.
    :return: dict mapping each seed directory name (a ``str``) to the list of
        ``model_raw_results.correlation`` values, ordered by increasing forest
        size. Seeds are inserted in increasing numeric order.
    :raises ValueError: if a seed name, or a retained forest-size name, cannot be
        parsed by ``int`` (both are sorted numerically).
    :raises OSError: if one of the expected directories cannot be listed.
    """
    # models/{experiment_id}/seeds
    experiment_seed_root_path = os.path.join(models_dir, str(experiment_id), 'seeds')
    experiment_correlations = dict()

    # Seed directories are named after integers: sort them numerically, not lexically.
    for seed in sorted(os.listdir(experiment_seed_root_path), key=int):
        # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
        extracted_forest_sizes_root_path = os.path.join(
            experiment_seed_root_path, seed, 'extracted_forest_sizes')

        # Entries whose name contains 'no_weights' are skipped; they would also
        # break the numeric sort below.
        extracted_forest_sizes = sorted(
            (nb_tree for nb_tree in os.listdir(extracted_forest_sizes_root_path)
             if 'no_weights' not in nb_tree),
            key=int)

        # {seed: [correlation for the smallest forest, ..., correlation for the largest]}
        experiment_correlations[seed] = [
            # Loads the raw results stored under
            # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
            # and keeps only their correlation.
            ModelRawResults.load(
                os.path.join(extracted_forest_sizes_root_path, extracted_forest_size)
            ).correlation
            for extracted_forest_size in extracted_forest_sizes
        ]

    return experiment_correlations
default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.') parser.add_argument('--wo_loss_plots', action='store_true', default=DEFAULT_WO_LOSS_PLOTS, help='Do not compute the loss plots.') parser.add_argument('--plot_preds_coherence', action='store_true', default=DEFAULT_PLOT_PREDS_COHERENCE, help='Plot the coherence of the prediction trees.') + parser.add_argument('--plot_preds_correlation', action='store_true', default=DEFAULT_PLOT_PREDS_COHERENCE, help='Plot the correlation of the prediction trees.') args = parser.parse_args() if args.stage not in list(range(1, 6)): @@ -501,8 +530,20 @@ if __name__ == "__main__": omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False, extracted_forest_sizes=extracted_forest_sizes) + """print(omp_with_params_dev_scores) + import sys + sys.exit(0)""" + + #all_labels = ['base', 'random', 'omp', 'omp_wo_weights'] all_labels = ['base', 'random', 'omp'] - all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores] + omp_with_params_test_scores_new = dict() + filter_num = -1 + """filter_num = 9 + for key, value in omp_with_params_test_scores.items(): + omp_with_params_test_scores_new[key] = value[:filter_num]""" + #all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores, + # omp_with_params_without_weights_test_scores] + all_scores = [base_with_params_dev_scores, random_with_params_dev_scores, omp_with_params_dev_scores] #all_scores = [base_with_params_train_scores, random_with_params_train_scores, omp_with_params_train_scores, # omp_with_params_without_weights_train_scores] @@ -515,29 +556,32 @@ if __name__ == "__main__": label = 'similarity_predictions' elif 'ensemble' in args.experiment_ids[i]: label = 'ensemble' + elif 'omp_distillation' in args.experiment_ids[i]: + 
label = 'omp_distillation' else: logger.error('Invalid value encountered') continue logger.info(f'Loading {label} experiment scores...') current_experiment_id = int(args.experiment_ids[i].split('=')[1]) - current_train_scores, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes( + current_train_scores, current_dev_scores, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes( args.models_dir, args.results_dir, current_experiment_id) all_labels.append(label) - all_scores.append(current_test_scores) + #all_scores.append(current_test_scores) #all_scores.append(current_train_scores) + all_scores.append(current_dev_scores) - output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5') + output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_new') pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) Plotter.plot_stage2_losses( - file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_test_train,dev.png", + file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_dev_clean.png", all_experiment_scores=all_scores, all_labels=all_labels, x_value=with_params_extracted_forest_sizes, xlabel='Number of trees extracted', ylabel=base_with_params_experiment_score_metric, - title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name)) + title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name), filter_num=filter_num) """if args.plot_weight_density: root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage{args.stage}') @@ -581,14 +625,14 @@ if __name__ == "__main__": experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id) Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png')) if args.plot_preds_coherence: - root_output_path = os.path.join(args.results_dir, args.dataset_name, 
f'stage5') - all_labels = ['random', 'omp', 'omp_normalize_D'] - random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \ - with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \ + root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_new') + pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True) + all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble'] + _, _, _, with_params_extracted_forest_sizes, _ = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2) - coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 4]] + coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]] Plotter.plot_stage2_losses( - file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}_30_all.png", + file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}.png", all_experiment_scores=coherence_values, all_labels=all_labels, x_value=with_params_extracted_forest_sizes, @@ -596,5 +640,21 @@ if __name__ == "__main__": ylabel='Coherence', title='Coherence values of {}'.format(args.dataset_name)) logger.info(f'Computing preds coherence plot...') + if args.plot_preds_correlation: + root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_new') + pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True) + all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble'] + _, _, _, with_params_extracted_forest_sizes, _ = \ + extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2) + correlation_values = [extract_correlations_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]] + Plotter.plot_stage2_losses( + file_path=root_output_path 
"""Copy every stage-5 ``model_raw_results.pickle`` into a standalone directory tree.

For each dataset, the files found under ``{models_source_path}/{dataset}/stage5_new``
are copied to ``{models_destination_path}/{dataset}``, keeping the sub-directory
structure they had below ``stage5_new``.
"""
import os
import pathlib
import shutil

import glob2
from tqdm import tqdm


if __name__ == "__main__":
    models_source_path = 'models'
    models_destination_path = 'bolsonaro_models_25-03-20'
    datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
                'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']

    pathlib.Path(models_destination_path).mkdir(parents=True, exist_ok=True)

    # Iterating over a tqdm wrapper already advances its bar once per item, so
    # no manual update(1) is issued inside the loops (it would count items twice).
    with tqdm(datasets) as dataset_bar:
        for dataset in dataset_bar:
            dataset_bar.set_description(dataset)
            # Root under which this dataset's stage-5 results are searched.
            stage_root = os.path.join(models_source_path, dataset, 'stage5_new')
            found_paths = glob2.glob(os.path.join(stage_root, '**', 'model_raw_results.pickle'),
                                     recursive=True)
            pathlib.Path(os.path.join(models_destination_path, dataset)).mkdir(parents=True, exist_ok=True)
            with tqdm(found_paths) as found_paths_bar:
                for path in found_paths_bar:
                    found_paths_bar.set_description(path)
                    # Path of the file relative to the stage root, computed from
                    # the configured source path and independent of the
                    # platform's path separator.
                    relative_dir, filename = os.path.split(os.path.relpath(path, stage_root))
                    new_path = os.path.join(models_destination_path, dataset, relative_dir)
                    pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
                    shutil.copyfile(src=path, dst=os.path.join(new_path, filename))