From 59e65276c53c73d309730f8d59ea9a2ead5c9124 Mon Sep 17 00:00:00 2001
From: Charly Lamothe <charly.lamothe@univ-amu.fr>
Date: Fri, 28 Feb 2020 16:00:33 +0100
Subject: [PATCH] Finish adding the similarity method to the pipeline. Add the kmeans pruning method

---
 .../models/kmeans_forest_regressor.py         | 63 +++++++++++++++++++
 code/bolsonaro/models/model_factory.py        | 15 ++++-
 .../models/similarity_forest_regressor.py     |  4 +-
 code/bolsonaro/trainer.py                     |  9 ++-
 code/compute_results.py                       | 57 +++++++++++++----
 code/train.py                                 |  4 +-
 6 files changed, 130 insertions(+), 22 deletions(-)
 create mode 100644 code/bolsonaro/models/kmeans_forest_regressor.py

diff --git a/code/bolsonaro/models/kmeans_forest_regressor.py b/code/bolsonaro/models/kmeans_forest_regressor.py
new file mode 100644
index 0000000..181332d
--- /dev/null
+++ b/code/bolsonaro/models/kmeans_forest_regressor.py
@@ -0,0 +1,63 @@
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error
+from sklearn.base import BaseEstimator
+from sklearn.cluster import KMeans
+from abc import abstractmethod, ABCMeta
+import numpy as np
+from scipy.stats import mode
+
+
+class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
+    """
+    'On Extreme Pruning of Random Forest Ensembles for Real-Time Predictive Applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
+    """
+
+    def __init__(self, models_parameters):
+        self._models_parameters = models_parameters
+        self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
+                                                random_state=models_parameters.seed)
+        self._extracted_forest_size = self._models_parameters.extracted_forest_size
+
+    @property
+    def models_parameters(self):
+        return self._models_parameters
+
+    def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
+        self._regressor.fit(X_train, y_train)
+
+        predictions = list()
+        for tree in self._regressor.estimators_:
+            predictions.append(tree.predict(X_train))
+        predictions = np.array(predictions)
+
+        kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
+        labels = np.array(kmeans.labels_)
+
+        # For each cluster, select the tree with the lowest validation error
+        pruned_forest = list()
+        for c in range(self._extracted_forest_size):
+            index = np.where(labels == c)[0]
+            cluster = list()
+            for i in index:
+                y_val_pred = self._regressor.estimators_[i].predict(X_val)
+                tree_score = score_metric(y_val, y_val_pred)
+                cluster.append(tree_score)
+            best_tree_index = np.argmin(cluster)  # the score metric is an error: lower is better
+            pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])
+
+        self._regressor.estimators_ = pruned_forest
+
+    def predict(self, X):
+        return self._regressor.predict(X)
+
+    def score(self, X, y):
+        predictions = list()
+        for tree in self._regressor.estimators_:
+            predictions.append(tree.predict(X))
+        predictions = np.array(predictions)
+        mean_predictions = np.mean(predictions, axis=0)
+        score = mean_squared_error(y, mean_predictions)
+        return score
+
+    def predict_base_estimator(self, X):
+        return self._regressor.predict(X)
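Note for reviewers: fit() clusters the trees by their prediction vectors on the training set, then keeps the lowest-error tree of each cluster, measured on the validation set. A minimal standalone sketch of that idea, with synthetic data and a plain RandomForestRegressor rather than the class above (all sizes and names below are illustrative, not part of the patch):

    import numpy as np
    from sklearn.cluster import KMeans
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    # Synthetic data, only to exercise the pruning idea.
    X, y = make_regression(n_samples=200, n_features=8, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

    # One row per tree: its prediction vector on the training set.
    predictions = np.array([tree.predict(X_train) for tree in forest.estimators_])
    labels = KMeans(n_clusters=10, random_state=0).fit(predictions).labels_

    # Keep, for each cluster, the tree with the lowest validation MSE.
    pruned = []
    for c in range(10):
        members = np.where(labels == c)[0]
        errors = [mean_squared_error(y_val, forest.estimators_[i].predict(X_val)) for i in members]
        pruned.append(forest.estimators_[members[np.argmin(errors)]])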
diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index 74993cc..bbda6ca 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
 from bolsonaro.data.task import Task
 
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -22,9 +23,11 @@ class ModelFactory(object):
         elif model_parameters.extraction_strategy == 'random':
             return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                 random_state=model_parameters.seed)
-        else:
+        elif model_parameters.extraction_strategy == 'none':
             return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                 random_state=model_parameters.seed)
+        else:
+            raise ValueError('Invalid extraction strategy: {}'.format(model_parameters.extraction_strategy))
     elif task == Task.REGRESSION:
         if model_parameters.extraction_strategy == 'omp':
             return OmpForestRegressor(model_parameters)
@@ -33,15 +36,21 @@ class ModelFactory(object):
                 random_state=model_parameters.seed)
         elif model_parameters.extraction_strategy == 'similarity':
             return SimilarityForestRegressor(model_parameters)
-        else:
+        elif model_parameters.extraction_strategy == 'kmeans':
+            return KMeansForestRegressor(model_parameters)
+        elif model_parameters.extraction_strategy == 'none':
             return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
                 random_state=model_parameters.seed)
+        else:
+            raise ValueError('Invalid extraction strategy: {}'.format(model_parameters.extraction_strategy))
     elif task == Task.MULTICLASSIFICATION:
         if model_parameters.extraction_strategy == 'omp':
             return OmpForestMulticlassClassifier(model_parameters)
         elif model_parameters.extraction_strategy == 'random':
             return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                 random_state=model_parameters.seed)
-        else:
+        elif model_parameters.extraction_strategy == 'none':
             return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                 random_state=model_parameters.seed)
+        else:
+            raise ValueError('Invalid extraction strategy: {}'.format(model_parameters.extraction_strategy))
diff --git a/code/bolsonaro/models/similarity_forest_regressor.py b/code/bolsonaro/models/similarity_forest_regressor.py
index f8d9c3e..8d8b5a1 100644
--- a/code/bolsonaro/models/similarity_forest_regressor.py
+++ b/code/bolsonaro/models/similarity_forest_regressor.py
@@ -21,7 +21,6 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
         return self._models_parameters
 
     def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
-
         self._regressor.fit(X_train, y_train)
 
         y_val_pred = self._regressor.predict(X_val)
@@ -63,3 +62,6 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
         test_mean = np.mean(test_list, axis=0)
         score = mean_squared_error(test_mean, y)
         return score
+
+    def predict_base_estimator(self, X):
+        return self._regressor.predict(X)
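Note for reviewers: the factory now fails fast on unknown strategies instead of silently falling back to the full baseline forest. The regression dispatch, distilled to a standalone sketch (the factory's real entry point is not shown in these hunks; everything below, including the function name, is illustrative only):

    from sklearn.ensemble import RandomForestRegressor

    def build_regressor(strategy, n_estimators, extracted_forest_size, seed):
        # 'random' trains a forest of the target size directly; 'none' is the
        # full baseline; any other unknown value now raises a ValueError.
        if strategy == 'random':
            return RandomForestRegressor(n_estimators=extracted_forest_size, random_state=seed)
        elif strategy == 'none':
            return RandomForestRegressor(n_estimators=n_estimators, random_state=seed)
        elif strategy in ('omp', 'similarity', 'kmeans'):
            raise NotImplementedError('wrapper classes omitted from this sketch')
        else:
            raise ValueError('Invalid extraction strategy: {}'.format(strategy))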
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index ce233d5..7c436d2 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -2,6 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.data.task import Task
 from . import LOG_PATH
@@ -96,7 +97,7 @@ class Trainer(object):
         self._end_time = time.time()
 
     def __score_func(self, model, X, y_true):
-        if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
+        if type(model) in [OmpForestRegressor, RandomForestRegressor]:
             y_pred = model.predict(X)
             result = self._regression_score_metric(y_true, y_pred)
         elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
@@ -104,10 +105,12 @@ class Trainer(object):
             if type(model) is OmpForestBinaryClassifier:
                 y_pred = y_pred.round()
             result = self._classification_score_metric(y_true, y_pred)
+        elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor]:
+            result = model.score(X, y_true)
         return result
 
     def __score_func_base(self, model, X, y_true):
-        if type(model) == OmpForestRegressor:
+        if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_regression_score_metric(y_true, y_pred)
         elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
@@ -116,7 +119,7 @@ class Trainer(object):
         elif type(model) == RandomForestClassifier:
             y_pred = model.predict(X)
             result = self._base_classification_score_metric(y_true, y_pred)
-        elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
+        elif type(model) is RandomForestRegressor:
             y_pred = model.predict(X)
             result = self._base_regression_score_metric(y_true, y_pred)
         return result
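Note for reviewers: the two scoring paths now differ per model family. Unlike the usual sklearn convention where score() is higher-is-better, the score() of SimilarityForestRegressor and KMeansForestRegressor returns a mean squared error, so lower is better. Reusing forest, pruned, X_val and y_val from the sketch under the kmeans file above, the two quantities the trainer computes can be reproduced as (illustrative only):

    import numpy as np
    from sklearn.metrics import mean_squared_error

    # __score_func for the pruned-forest regressors: MSE of the mean of the
    # remaining trees' predictions (lower is better).
    per_tree = np.array([tree.predict(X_val) for tree in pruned])
    pruned_mse = mean_squared_error(y_val, per_tree.mean(axis=0))

    # __score_func_base: the regression metric on the full forest's prediction.
    base_mse = mean_squared_error(y_val, forest.predict(X_val))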
diff --git a/code/compute_results.py b/code/compute_results.py
index 473044d..408a76a 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -380,20 +380,51 @@ if __name__ == "__main__":
             xlabel='Number of trees extracted',
             ylabel=experiments_score_metric,
             title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
+    elif args.stage == 5:
+        # Retrieve the number of extracted forest sizes used, so that the base forest axis is as long as necessary
+        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+
+        # base_with_params
+        logger.info('Loading base_with_params experiment scores...')
+        base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
+            base_with_params_experiment_score_metric = \
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extracted_forest_sizes_number)
+        # random_with_params
+        logger.info('Loading random_with_params experiment scores...')
+        random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
+            with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+        # omp_with_params
+        logger.info('Loading omp_with_params experiment scores...')
+        omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
+            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+            args.models_dir, args.results_dir, args.experiment_ids[2])
+        # kmeans_with_params
+        logger.info('Loading kmeans_with_params experiment scores...')
+        kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
+            kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+            args.models_dir, args.results_dir, args.experiment_ids[3])
+
+        # Sanity check on the retrieved metrics
+        if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
+            == omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
+            raise ValueError('Score metrics of all experiments must be the same.')
+        experiments_score_metric = base_with_params_experiment_score_metric
+
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
+        pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
+
+        Plotter.plot_stage2_losses(
+            file_path=output_path + os.sep + 'losses.png',
+            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+                kmeans_with_params_test_scores],
+            all_labels=['base', 'random', 'omp', 'kmeans'],
+            x_value=with_params_extracted_forest_sizes,
+            xlabel='Number of trees extracted',
+            ylabel=experiments_score_metric,
+            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
     else:
         raise ValueError('This stage number is not supported yet, but it will be!')
 
     logger.info('Done.')
-
-    """
-    TODO:
-    For each dataset:
-        Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
-        Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
-        Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
-        Stage 4) A figure to finally compare the perf of our approach using the previous selected
-        parameters vs the baseline vs other papers using different extracted forest size
-        (percentage of the tree size found previously in best hyperparams search) on the abscissa.
-
-        IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
-    """
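Note for reviewers: the stage-5 branch reads the four experiments positionally from --experiment_ids. A reminder of the ordering it assumes (the id values below are placeholders, not real experiment numbers):

    # Positional mapping assumed by the stage-5 branch:
    experiment_ids = [
        10,  # [0] base_with_params   -> extract_scores_across_seeds_and_forest_size
        11,  # [1] random_with_params -> also provides extracted_forest_sizes_number
        12,  # [2] omp_with_params
        13,  # [3] kmeans_with_params
    ]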
diff --git a/code/train.py b/code/train.py
index e51514c..0ca2b47 100644
--- a/code/train.py
+++ b/code/train.py
@@ -163,7 +163,7 @@ if __name__ == "__main__":
     parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
     parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
     parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
-    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
+    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity or kmeans.')
     args = parser.parse_args()
 
     if args.experiment_configuration:
@@ -173,7 +173,7 @@ if __name__ == "__main__":
     else:
         parameters = args.__dict__
 
-    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
+    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']:
         raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
 
     pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
-- 
GitLab