diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py
index fbb80a591f9c1f42ac6fe3d1982d43d108faa026..725f2c200f579bb0b3d0e6e2fb5f2465c57ca810 100644
--- a/code/bolsonaro/models/model_raw_results.py
+++ b/code/bolsonaro/models/model_raw_results.py
@@ -9,7 +9,8 @@ class ModelRawResults(object):
 
     def __init__(self, model_weights, training_time,
         datetime, train_score, dev_score, test_score,
         train_score_base, dev_score_base,
-        test_score_base, score_metric, base_score_metric):
+        test_score_base, score_metric, base_score_metric,
+        coherence=''):
         self._model_weights = model_weights
         self._training_time = training_time
@@ -22,6 +23,7 @@ class ModelRawResults(object):
         self._test_score_base = test_score_base
         self._score_metric = score_metric
         self._base_score_metric = base_score_metric
+        self._coherence = coherence
 
     @property
     def model_weights(self):
@@ -67,6 +69,10 @@ class ModelRawResults(object):
     def base_score_metric(self):
         return self._base_score_metric
 
+    @property
+    def coherence(self):
+        return self._coherence
+
     def save(self, models_dir):
         if not os.path.exists(models_dir):
             os.mkdir(models_dir)
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 3327bee92401c3a993383c2d8b83a0ef80c206ba..c2bd7673262bb5bfd18ae617b1ba0c62b1092506 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -141,6 +141,27 @@ class Trainer(object):
             result = self._base_regression_score_metric(y_true, y_pred)
         return result
 
+    def compute_preds_coherence(self, model, X):
+        from sklearn.preprocessing import normalize
+
+        if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor,
+                EnsembleSelectionForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
+            estimators = model.forest
+        elif type(model) in [RandomForestRegressor, RandomForestClassifier]:
+            estimators = model.estimators_
+        else:
+            raise ValueError('Unsupported model type: {}'.format(type(model)))
+
+        # One prediction vector per tree, l2-normalized so each row has unit norm
+        predictions = normalize(np.asarray([estimator.predict(X) for estimator in estimators]))
+
+        # Coherence: the largest absolute inner product between two distinct
+        # prediction vectors, i.e. the largest off-diagonal entry (in absolute
+        # value) of the Gram matrix of the normalized predictions
+        coherence = np.max(np.abs(predictions @ predictions.T - np.eye(len(predictions))))
+
+        return coherence
+
     def compute_results(self, model, models_dir):
         """
         :param model: Object with
@@ -173,7 +194,8 @@
             dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
             test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
             score_metric=self._score_metric_name,
-            base_score_metric=self._base_score_metric_name
+            base_score_metric=self._base_score_metric_name,
+            coherence=self.compute_preds_coherence(model, self._dataset.X_train)
         )
         results.save(models_dir)
         self._logger.info("Base performance on test: {}".format(results.test_score_base))
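For intuition, the coherence computed by compute_preds_coherence is the largest absolute inner product between the l2-normalized prediction vectors of two distinct trees. A minimal standalone sketch of the same computation, assuming only NumPy and scikit-learn (the toy dataset and forest below are illustrative and not part of this change):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import normalize

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
forest = RandomForestRegressor(n_estimators=20, random_state=0).fit(X, y)

# One l2-normalized prediction vector per tree
predictions = normalize(np.asarray([tree.predict(X) for tree in forest.estimators_]))

# Largest absolute off-diagonal entry of the Gram matrix: values near 1 mean
# two trees predict almost identically (up to scale), i.e. a redundant forest
coherence = np.max(np.abs(predictions @ predictions.T - np.eye(len(predictions))))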
diff --git a/code/compute_results.py b/code/compute_results.py
index d77779e82e295b5e76c0347551c20b8ef258a546..e05fff6bda2e2b0ae516698876cc04c8d32959c4 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -148,6 +148,34 @@ def extract_weights_across_seeds(models_dir, results_dir, experiment_id):
 
     return experiment_weights
 
+def extract_coherences_across_seeds(models_dir, results_dir, experiment_id):
+    experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
+    experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
+    experiment_coherences = dict()
+
+    # For each seed results stored in models/{experiment_id}/seeds
+    seeds = os.listdir(experiment_seed_root_path)
+    seeds.sort(key=int)
+    for seed in seeds:
+        experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
+        extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
+
+        # {{seed}:[]}
+        experiment_coherences[seed] = list()
+
+        # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
+        extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
+        extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if 'no_weights' not in nb_tree]
+        extracted_forest_sizes.sort(key=int)
+        for extracted_forest_size in extracted_forest_sizes:
+            # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
+            extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
+            # Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
+            model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+            # Save the coherence value
+            experiment_coherences[seed].append(model_raw_results.coherence)
+
+    return experiment_coherences
 
 if __name__ == "__main__":
     # get environment variables in .env
@@ -493,7 +521,7 @@
                 ylabel=base_with_params_experiment_score_metric,
                 title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
 
-    if args.plot_weight_density:
+    if args.plot_weight_density or args.plot_preds_coherence:
         root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage{args.stage}')
 
         if args.stage == 1:
@@ -526,8 +554,24 @@
                 omp_experiment_ids.append((label, current_experiment_id))
 
         for (experiment_label, experiment_id) in omp_experiment_ids:
-            logger.info(f'Computing weight density plot for experiment {experiment_label}...')
-            experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id)
-            Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))
+            if args.plot_weight_density:
+                logger.info(f'Computing weight density plot for experiment {experiment_label}...')
+                experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id)
+                Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))
+            if args.plot_preds_coherence:
+                logger.info(f'Computing preds coherence plot for experiment {experiment_label}...')
+                all_labels = ['random', 'omp']
+                random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
+                    with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
+                    extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
+                coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in range(2, 4)]
+                Plotter.plot_stage2_losses(
+                    file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}.png",
+                    all_experiment_scores=coherence_values,
+                    all_labels=all_labels,
+                    x_value=with_params_extracted_forest_sizes,
+                    xlabel='Number of trees extracted',
+                    ylabel='Coherence',
+                    title='Coherence values of {}'.format(args.dataset_name))
 
     logger.info('Done.')
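As a usage example, the coherence plot can then be produced alongside (or instead of) the weight density plot. A hypothetical invocation, assuming the argparse flags mirror the args.* attributes used above, that experiment ids 2 and 3 (hardcoded via range(2, 4)) hold the random and OMP runs, and an illustrative dataset name:

python code/compute_results.py --stage 2 --experiment_ids 1 2 3 \
    --dataset_name california_housing --models_dir models --results_dir results \
    --plot_preds_coherence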
diff --git a/code/train.py b/code/train.py
index 95498cdf03a894ca8c8cf91d6702acc6aef1a799..ad80795443b21c6715c5ebffe3111deb1b11cc8d 100644
--- a/code/train.py
+++ b/code/train.py
@@ -121,6 +121,7 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
 
     trainer.init(model, subsets_used=parameters['subsets_used'])
     trainer.train(model)
+    #trainer.compute_preds_coherence(model)
     trainer.compute_results(model, sub_models_dir)
     logger.info(f'Training done for seed {seed_str}')
     seed_job_pb.update(1)
@@ -169,6 +170,7 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
 
     trainer.init(model, subsets_used=parameters['subsets_used'])
     trainer.train(model, extracted_forest_size=extracted_forest_size)
+    #trainer.compute_preds_coherence(model)
     trainer.compute_results(model, sub_models_dir)
     """
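Note that the commented-out calls above are redundant now that compute_results computes the coherence itself, and that compute_preds_coherence takes the data matrix as a second argument, so re-enabling them would require passing e.g. the training set. Because the coherence is persisted in model_raw_results.pickle with the other raw results, it can also be read back without going through compute_results.py. A minimal sketch, with an illustrative path following the models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size} layout documented above:

from bolsonaro.models.model_raw_results import ModelRawResults

# Illustrative path; point this at an actual trained-model directory
results = ModelRawResults.load('models/2/seeds/1/extracted_forest_sizes/10')
print(results.coherence)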