diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py index fcb4220896e89e3a000f1058e34316dd9073a883..26e86daf3231eaa4abbd8578ff19a300911e9254 100644 --- a/code/bolsonaro/models/model_raw_results.py +++ b/code/bolsonaro/models/model_raw_results.py @@ -68,12 +68,10 @@ class ModelRawResults(object): return self._base_score_metric def save(self, models_dir): - if not os.path.exists(models_dir): - os.mkdir(models_dir) save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle', self.__dict__) @staticmethod - def load(models_dir): + def load(models_dir): return load_obj_from_pickle(models_dir + os.sep + 'model_raw_results.pickle', ModelRawResults) diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py index d0f726825e0b12055ff617e5f5af37e987e3a35a..7df9014e51126731503e58634511120475c20847 100644 --- a/code/bolsonaro/models/omp_forest.py +++ b/code/bolsonaro/models/omp_forest.py @@ -25,6 +25,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): return self._base_forest_estimator.score(X, y) def _base_estimator_predictions(self, X): + # We need to use predict_proba to get the probabilities of each class return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T @property @@ -124,24 +125,3 @@ class SingleOmpForest(OmpForest): forest_predictions /= self._forest_norms return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights) - - def predict_no_weights(self, X): - """ - Apply the SingleOmpForest to X without using the weights. - - Make all the base tree predictions - - :param X: a Forest - :return: a np.array of the predictions of the entire forest - """ - forest_predictions = self._base_estimator_predictions(X).T - - if self._models_parameters.normalize_D: - forest_predictions /= self._forest_norms - - weights = self._omp.coef_ - omp_trees_indices = np.nonzero(weights) - - select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0) - - return select_trees diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py index 36d12be6727c25fcc029c13b1a13490f24be1295..270f115df362351e2b038ed2226c617c0544dd4a 100644 --- a/code/bolsonaro/models/omp_forest_classifier.py +++ b/code/bolsonaro/models/omp_forest_classifier.py @@ -106,36 +106,6 @@ class OmpForestMulticlassClassifier(OmpForest): max_preds = np.argmax(preds, axis=1) return np.array(label_names)[max_preds] - def predict_no_weights(self, X): - """ - Apply the SingleOmpForest to X without using the weights. - - Make all the base tree predictions - - :param X: a Forest - :return: a np.array of the predictions of the entire forest - """ - - forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T - - if self._models_parameters.normalize_D: - forest_predictions /= self._forest_norms - - label_names = [] - preds = [] - num_class = 0 - for class_label, omp_class in self._dct_class_omp.items(): - weights = omp_class.coef_ - omp_trees_indices = np.nonzero(weights) - label_names.append(class_label) - atoms_binary = (forest_predictions[num_class].T - 0.5) * 2 # centré réduit de 0/1 à -1/1 - preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0)) - num_class += 1 - - preds = np.array(preds).T - max_preds = np.argmax(preds, axis=1) - return np.array(label_names)[max_preds] - def score(self, X, y, metric=DEFAULT_SCORE_METRIC): predictions = self.predict(X) diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index c543b853b28dd507002c0ce1bd0cca9048cb4202..389ab9de42d02602e4e6725b29f14534518cc816 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -95,18 +95,12 @@ class Trainer(object): ) self._end_time = time.time() - def __score_func(self, model, X, y_true, weights=True): + def __score_func(self, model, X, y_true): if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]: - if weights: - y_pred = model.predict(X) - else: - y_pred = model.predict_no_weights(X) + y_pred = model.predict(X) result = self._regression_score_metric(y_true, y_pred) elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]: - if weights: - y_pred = model.predict(X) - else: - y_pred = model.predict_no_weights(X) + y_pred = model.predict(X) if type(model) is OmpForestBinaryClassifier: y_pred = y_pred.round() result = self._classification_score_metric(y_true, y_pred) @@ -163,29 +157,3 @@ class Trainer(object): self._logger.info("Base performance on dev: {}".format(results.dev_score_base)) self._logger.info("Performance on dev: {}".format(results.dev_score)) - - if type(model) not in [RandomForestRegressor, RandomForestClassifier]: - results = ModelRawResults( - model_object='', - training_time=self._end_time - self._begin_time, - datetime=datetime.datetime.now(), - train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False), - dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False), - test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False), - train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train), - dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev), - test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test), - score_metric=self._score_metric_name, - base_score_metric=self._base_score_metric_name - ) - results.save(models_dir+'_no_weights') - self._logger.info("Base performance on test without weights: {}".format(results.test_score_base)) - self._logger.info("Performance on test: {}".format(results.test_score)) - - self._logger.info("Base performance on train without weights: {}".format(results.train_score_base)) - self._logger.info("Performance on train: {}".format(results.train_score)) - - self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base)) - self._logger.info("Performance on dev: {}".format(results.dev_score)) - - diff --git a/code/bolsonaro/visualization/plotter.py b/code/bolsonaro/visualization/plotter.py index 5a5f72ad9fade836dcfed3c2ef6f452653dcf3d1..7d2cde23d24df4fb3f41cf5413b3769fc8d9e959 100644 --- a/code/bolsonaro/visualization/plotter.py +++ b/code/bolsonaro/visualization/plotter.py @@ -109,16 +109,16 @@ class Plotter(object): fig, ax = plt.subplots() - nb_experiments = len(all_experiment_scores) + n = len(all_experiment_scores) """ Get as many different colors from the specified cmap (here nipy_spectral) as there are curve to plot. """ - colors = Plotter.get_colors_from_cmap(nb_experiments) + colors = Plotter.get_colors_from_cmap(n) - # For each curve to plot - for i in range(nb_experiments): + # For each curve to plot + for i in range(n): # Retreive the scores in a list for each seed experiment_scores = list(all_experiment_scores[i].values()) # Compute the mean and the std for the CI diff --git a/code/compute_results.py b/code/compute_results.py index f15a7ff80249c538f2a408b564965de125b21cc4..4fce32742b42a98c426e07f6536a09f3538872ed 100644 --- a/code/compute_results.py +++ b/code/compute_results.py @@ -17,7 +17,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id): extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' return len(os.listdir(extracted_forest_sizes_root_path)) -def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True): +def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id): experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id} experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds @@ -49,15 +49,11 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path) - extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ] extracted_forest_sizes.sort(key=int) all_extracted_forest_sizes.append(list(map(int, extracted_forest_sizes))) for extracted_forest_size in extracted_forest_sizes: # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size} - if weights: - extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size - else: - extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size + '_no_weights' + extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size # Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file model_raw_results = ModelRawResults.load(extracted_forest_size_path) # Save the scores @@ -364,11 +360,6 @@ if __name__ == "__main__": omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \ omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( args.models_dir, args.results_dir, args.experiment_ids[2]) - #omp_with_params_without_weights - logger.info('Loading omp_with_params experiment scores...') - omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \ - omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[2], weights=False) """# base_with_params logger.info('Loading base_with_params experiment scores 2...') @@ -393,9 +384,8 @@ if __name__ == "__main__": Plotter.plot_stage2_losses( file_path=output_path + os.sep + 'losses.png', - all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores, - omp_with_params_without_weights_test_scores], - all_labels=['base', 'random', 'omp', 'omp_without_weights'], + all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores], + all_labels=['base', 'random', 'omp'], x_value=with_params_extracted_forest_sizes, xlabel='Number of trees extracted', ylabel=experiments_score_metric, diff --git a/experiments/iris/stage1/none_with_params.json b/experiments/iris/stage1/none_with_params.json index c6915e3989c24dcee31b74c67415d86a50e50b0f..b26a467d9ad76e6643b39bc952f1a02e956004dc 100644 --- a/experiments/iris/stage1/none_with_params.json +++ b/experiments/iris/stage1/none_with_params.json @@ -13,11 +13,9 @@ "test_size": 0.2, "random_seed_number": 1, "seeds": [ - 1, - 2, - 3, - 4, - 5 + 58, + 43535, + 234234 ], "subsets_used": "train,dev", "normalize_weights": false, diff --git a/experiments/iris/stage1/omp_with_params.json b/experiments/iris/stage1/omp_with_params.json index 941788592683f9ffad87edbce1a3924cd7d14895..35cbb39d2a7d53f87401b9d2ddba05287beeeef9 100644 --- a/experiments/iris/stage1/omp_with_params.json +++ b/experiments/iris/stage1/omp_with_params.json @@ -13,11 +13,9 @@ "test_size": 0.2, "random_seed_number": 1, "seeds": [ - 1, - 2, - 3, - 4, - 5 + 58, + 43535, + 234234 ], "subsets_used": "train,dev", "normalize_weights": false, diff --git a/results/boston/stage4/losses.png b/results/boston/stage4/losses.png index 0762b7c1057045bb08a9d698e82446baf3558e22..c5d57ce0b386934e9bd2cadcce5b44f8fb8a40d4 100644 Binary files a/results/boston/stage4/losses.png and b/results/boston/stage4/losses.png differ diff --git a/results/iris/stage1/losses.png b/results/iris/stage1/losses.png index 2e8d2608b74f13894c5cc006e70d38ee031653a2..2a120da925eef72954d16ce98f3b1bb72cdb43e9 100644 Binary files a/results/iris/stage1/losses.png and b/results/iris/stage1/losses.png differ diff --git a/scripts/run_compute_results.sh b/scripts/run_compute_results.sh index d67571d78a9499b75a2c4558a517b01035025beb..f9f130e19c4d467e9d0416a051b8353f071b42dd 100644 --- a/scripts/run_compute_results.sh +++ b/scripts/run_compute_results.sh @@ -1,5 +1,7 @@ -seeds='1 2 3' -for dataset in boston iris diabetes digits linnerud wine breast_cancer olivetti_faces 20newsgroups_vectorized lfw_pairs california_housing diamonds +for dataset in diamonds california_housing boston iris diabetes digits linnerud wine breast_cancer olivetti_faces 20newsgroups_vectorized lfw_pairs do + python code/compute_results.py --stage=1 --experiment_ids 1 2 3 4 5 6 --dataset_name=$dataset --models_dir=models/$dataset/stage1 + python code/compute_results.py --stage=2 --experiment_ids 1 2 3 4 --dataset_name=$dataset --models_dir=models/$dataset/stage2 + python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=$dataset --models_dir=models/$dataset/stage3 python code/compute_results.py --stage=4 --experiment_ids 1 2 3 --dataset_name=$dataset --models_dir=models/$dataset/stage4 done diff --git a/scripts/run_stage5_experiments.sh b/scripts/run_stage5_experiments.sh deleted file mode 100644 index c36433ee72d0092bd3aa79d9a08a093dde78a696..0000000000000000000000000000000000000000 --- a/scripts/run_stage5_experiments.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -core_number=5 -walltime=1:00 -seeds='1 2 3' - -for dataset in diabetes #diamonds california_housing boston linnerud -do - oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev" - oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev" - oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev" - oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev" -done