diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 96fbb06217b2771c1cf906f7ee5d89097296bcb1..7e39c13add645401a4953a74025add113a153e13 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -14,5 +14,5 @@ def resolve_experiment_id(models_dir):
            if os.path.isdir(models_dir + os.sep + x)]
     if len(ids) > 0:
         ids.sort(key=int)
-        return int(max(ids)) + 1
+        return int(max([int(i) for i in ids])) + 1
     return 1
diff --git a/code/bolsonaro/visualization/plotter.py b/code/bolsonaro/visualization/plotter.py
index c119d47431b25d49ec15b0a147228cc5ed9b92cd..a548e1b1072529eac230cc2a7f0d8ccbc921240b 100644
--- a/code/bolsonaro/visualization/plotter.py
+++ b/code/bolsonaro/visualization/plotter.py
@@ -6,17 +6,14 @@ from sklearn.neighbors.kde import KernelDensity
 class Plotter(object):
 
     @staticmethod
-    def weight_density(weights):
-        """
-        TODO: to complete
-        """
+    def weight_density(weights, X, file_path):
         X_plot = [np.exp(elem) for elem in weights]
         fig, ax = plt.subplots()
 
         for kernel in ['gaussian', 'tophat', 'epanechnikov']:
             kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X_plot)
             log_dens = kde.score_samples(X_plot)
-            ax.plot(X_plot[:, 0], np.exp(log_dens), '-',
+            ax.plot(X_plot, np.exp(log_dens), '-',
                     label="kernel = '{0}'".format(kernel))
 
         ax.legend(loc='upper left')
@@ -24,4 +21,46 @@ class Plotter(object):
         ax.set_xlim(-4, 9)
         ax.set_ylim(-0.02, 0.4)
 
-        plt.show()
+        fig.savefig(file_path, dpi=fig.dpi)
+        plt.close(fig)
+
+    @staticmethod
+    def plot_mean_and_CI(ax, mean, lb, ub, x_value, color_mean=None, facecolor=None, label=None):
+        # plot the shaded range of the confidence intervals
+        ax.fill_between(x_value, ub, lb, facecolor=facecolor, alpha=.5)
+        # plot the mean on top
+        ax.plot(x_value, mean, c=color_mean, label=label)
+
+    @staticmethod
+    def plot_losses(file_path, all_experiment_scores, x_value, xlabel, ylabel, all_labels, title):
+        fig, ax = plt.subplots()
+
+        n = len(all_experiment_scores)
+
+        colors = Plotter.get_colors_from_cmap(n)
+
+        for i in range(n):
+            experiment_scores = list(all_experiment_scores[i].values())
+            mean_experiment_scores = np.average(experiment_scores, axis=0)
+            std_experiment_scores = np.std(experiment_scores, axis=0)
+            Plotter.plot_mean_and_CI(
+                ax=ax,
+                mean=mean_experiment_scores,
+                lb=mean_experiment_scores + std_experiment_scores,
+                ub=mean_experiment_scores - std_experiment_scores,
+                x_value=x_value,
+                color_mean=colors[i],
+                facecolor=colors[i],
+                label=all_labels[i]
+            )
+
+        plt.xlabel(xlabel)
+        plt.ylabel(ylabel)
+        plt.title(title)
+        plt.legend(loc='upper right')
+        fig.savefig(file_path, dpi=fig.dpi)
+        plt.close(fig)
+
+    @staticmethod
+    def get_colors_from_cmap(n_colors, colormap_name='nipy_spectral'):
+        return [plt.get_cmap(colormap_name)(1. * i/n_colors) for i in range(n_colors)]
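As an aside, the band plotting added in Plotter.plot_mean_and_CI comes down to matplotlib's fill_between plus plot. Below is a minimal, self-contained sketch of that same idea with synthetic data; the file name, labels, and values are illustrative and not taken from this change.

# Illustrative sketch only (not part of the diff): mean +/- std band as drawn by
# Plotter.plot_mean_and_CI, reproduced with synthetic data and hypothetical names.
import numpy as np
import matplotlib.pyplot as plt

x = np.arange(10)                                # e.g. extracted forest sizes
scores = np.random.rand(5, 10)                   # 5 seeds x 10 sizes (fake data)
mean, std = scores.mean(axis=0), scores.std(axis=0)

fig, ax = plt.subplots()
ax.fill_between(x, mean - std, mean + std, facecolor='C0', alpha=.5)  # shaded band
ax.plot(x, mean, c='C0', label='dev')            # mean curve on top
ax.legend(loc='upper right')
fig.savefig('example_losses.png', dpi=fig.dpi)
plt.close(fig)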
diff --git a/code/compute_results.py b/code/compute_results.py
index 16bbbe4e73609059f4609472b3e23da17ef7813a..21777e2d4611995f83cc25ba08ded2789ea17650 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -2,6 +2,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.model_factory import ModelFactory
+from bolsonaro.visualization.plotter import Plotter
 
 import argparse
 import pathlib
@@ -34,15 +35,43 @@ if __name__ == "__main__":
 
     for experiment_id in experiments_ids:
         experiment_id_path = args.models_dir + os.sep + experiment_id
+        pathlib.Path(args.results_dir + os.sep + experiment_id).mkdir(parents=True, exist_ok=True)
         experiment_seed_root_path = experiment_id_path + os.sep + 'seeds'
+
+        experiment_train_scores = dict()
+        experiment_dev_scores = dict()
+        experiment_test_scores = dict()
+        experiment_score_metrics = list()
+
         for seed in os.listdir(experiment_seed_root_path):
             experiment_seed_path = experiment_seed_root_path + os.sep + seed
             dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)
             dataset = DatasetLoader.load(dataset_parameters)
             extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size'
-            for extracted_forest_size in os.listdir(extracted_forest_size_root_path):
+
+            experiment_train_scores[seed] = list()
+            experiment_dev_scores[seed] = list()
+            experiment_test_scores[seed] = list()
+
+            extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
+            for extracted_forest_size in extracted_forest_sizes:
                 extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
                 model_raw_results = ModelRawResults.load(extracted_forest_size_path)
                 model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
-
-
+                experiment_train_scores[seed].append(model_raw_results.train_score)
+                experiment_dev_scores[seed].append(model_raw_results.dev_score)
+                experiment_test_scores[seed].append(model_raw_results.test_score)
+                experiment_score_metrics.append(model_raw_results.score_metric)
+
+        if len(set(experiment_score_metrics)) > 1:
+            raise ValueError("The metrics used to compute the dev score aren't the same every time")
+
+        Plotter.plot_losses(
+            file_path=args.results_dir + os.sep + experiment_id + os.sep + 'losses.png',
+            all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores],
+            x_value=extracted_forest_sizes,
+            xlabel='Number of trees extracted',
+            ylabel=experiment_score_metrics[0],
+            all_labels=['train', 'dev', 'test'],
+            title='Loss values of the trained model'
+        )
diff --git a/code/train.py b/code/train.py
index 5f854b68afedea8574001d7c26ef4deaee46f248..647ed19c706daa2db8faf6a5b6851fd81f2088bd 100644
--- a/code/train.py
+++ b/code/train.py
@@ -66,6 +66,8 @@ if __name__ == "__main__":
     experiment_id = resolve_experiment_id(args.models_dir)
     experiment_id_str = str(experiment_id)
 
+    logger.info('Experiment id: {}'.format(experiment_id_str))
+
     with tqdm(seeds) as seed_bar:
         for seed in seed_bar:
             seed_bar.set_description('seed={}'.format(seed))
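For context, the per-seed aggregation that plot_losses performs on the dictionaries built in compute_results.py is a plain axis-0 average over one score list per seed. Here is a small standalone sketch with made-up scores; the seed keys and numbers are hypothetical, not results from the repository.

# Illustrative sketch only (not from the repository): averaging per-seed score lists
# the way plot_losses does for the dicts collected in compute_results.py.
import numpy as np

experiment_test_scores = {                       # seed -> one score per forest size
    '1': [0.80, 0.84, 0.86],
    '2': [0.78, 0.85, 0.88],
}
scores = list(experiment_test_scores.values())   # shape: (n_seeds, n_forest_sizes)
mean = np.average(scores, axis=0)                # mean across seeds, per size
std = np.std(scores, axis=0)                     # std across seeds, per size
print(mean, std)                                 # mean -> [0.79 0.845 0.87]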