From 59e65276c53c73d309730f8d59ea9a2ead5c9124 Mon Sep 17 00:00:00 2001
From: Charly Lamothe <charly.lamothe@univ-amu.fr>
Date: Fri, 28 Feb 2020 16:00:33 +0100
Subject: [PATCH] Finish adding similarity method to the pipeline. Add kmeans
 pruning method

---
 .../models/kmeans_forest_regressor.py         | 63 +++++++++++++++++++
 code/bolsonaro/models/model_factory.py        | 15 ++++-
 .../models/similarity_forest_regressor.py     |  4 +-
 code/bolsonaro/trainer.py                     |  9 ++-
 code/compute_results.py                       | 57 +++++++++++++----
 code/train.py                                 |  4 +-
 6 files changed, 130 insertions(+), 22 deletions(-)
 create mode 100644 code/bolsonaro/models/kmeans_forest_regressor.py

diff --git a/code/bolsonaro/models/kmeans_forest_regressor.py b/code/bolsonaro/models/kmeans_forest_regressor.py
new file mode 100644
index 0000000..181332d
--- /dev/null
+++ b/code/bolsonaro/models/kmeans_forest_regressor.py
@@ -0,0 +1,63 @@
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error
+from sklearn.base import BaseEstimator
+from sklearn.cluster import KMeans
+from abc import abstractmethod, ABCMeta
+import numpy as np
+from scipy.stats import mode
+
+
+class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
+    """
+    On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
+    """
+
+    def __init__(self, models_parameters):
+        self._models_parameters = models_parameters
+        self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
+            random_state=models_parameters.seed)
+        self._extracted_forest_size = self._models_parameters.extracted_forest_size
+
+    @property
+    def models_parameters(self):
+        return self._models_parameters
+
+    def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
+        self._regressor.fit(X_train, y_train)
+
+        predictions = list()
+        for tree in self._regressor.estimators_:
+            predictions.append(tree.predict(X_train))
+        predictions = np.array(predictions)
+
+        kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
+        labels = np.array(kmeans.labels_)
+
+        # for each cluster select the best tree on the validation set
+        pruned_forest = list()
+        for c in range(self._extracted_forest_size):
+            index = np.where(labels == c)[0]
+            cluster = list()
+            for i in index:
+                y_val_pred = self._regressor.estimators_[i].predict(X_val)
+                tree_pred = score_metric(y_val, y_val_pred)
+                cluster.append(tree_pred)
+            best_tree_index = np.argmax(cluster)
+            pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])
+        
+        self._regressor.estimators_ = pruned_forest
+
+    def predict(self, X):
+        return self._regressor.predict(X)
+
+    def score(self, X, y):
+        predictions = list()
+        for tree in self._regressor.estimators_:
+            predictions.append(tree.predict(X))
+        predictions = np.array(predictions)
+        mean_predictions = np.mean(predictions, axis=0)
+        score = mean_squared_error(mean_predictions, y)
+        return score
+
+    def predict_base_estimator(self, X):
+        return self._regressor.predict(X)
diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index 74993cc..bbda6ca 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
 from bolsonaro.data.task import Task
 
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -22,9 +23,11 @@ class ModelFactory(object):
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                     random_state=model_parameters.seed)
-            else:
+            elif model_parameters.extraction_strategy == 'none':
                 return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                     random_state=model_parameters.seed)
+            else:
+                raise ValueError('Invalid extraction strategy')
         elif task == Task.REGRESSION:
             if model_parameters.extraction_strategy == 'omp':
                 return OmpForestRegressor(model_parameters)
@@ -33,15 +36,21 @@ class ModelFactory(object):
                     random_state=model_parameters.seed)
             elif model_parameters.extraction_strategy == 'similarity':
                 return SimilarityForestRegressor(model_parameters)
-            else:
+            elif model_parameters.extraction_strategy == 'kmeans':
+                return KMeansForestRegressor(model_parameters)
+            elif model_parameters.extraction_strategy == 'none':
                 return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
                     random_state=model_parameters.seed)
+            else:
+                raise ValueError('Invalid extraction strategy')
         elif task == Task.MULTICLASSIFICATION:
             if model_parameters.extraction_strategy == 'omp':
                 return OmpForestMulticlassClassifier(model_parameters)
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                     random_state=model_parameters.seed)
-            else:
+            elif model_parameters.extraction_strategy == 'none':
                 return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                     random_state=model_parameters.seed)
+            else:
+                raise ValueError('Invalid extraction strategy')
diff --git a/code/bolsonaro/models/similarity_forest_regressor.py b/code/bolsonaro/models/similarity_forest_regressor.py
index f8d9c3e..8d8b5a1 100644
--- a/code/bolsonaro/models/similarity_forest_regressor.py
+++ b/code/bolsonaro/models/similarity_forest_regressor.py
@@ -21,7 +21,6 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
         return self._models_parameters
 
     def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
-
         self._regressor.fit(X_train, y_train)
 
         y_val_pred = self._regressor.predict(X_val)
@@ -63,3 +62,6 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
         test_mean = np.mean(test_list, axis=0)
         score = mean_squared_error(test_mean, y)
         return score
+
+    def predict_base_estimator(self, X):
+        return self._regressor.predict(X)
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index ce233d5..7c436d2 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -2,6 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.data.task import Task
 from . import LOG_PATH
@@ -96,7 +97,7 @@ class Trainer(object):
         self._end_time = time.time()
 
     def __score_func(self, model, X, y_true):
-        if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
+        if type(model) in [OmpForestRegressor, RandomForestRegressor]:
             y_pred = model.predict(X)
             result = self._regression_score_metric(y_true, y_pred)
         elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
@@ -104,10 +105,12 @@ class Trainer(object):
             if type(model) is OmpForestBinaryClassifier:
                 y_pred = y_pred.round()
             result = self._classification_score_metric(y_true, y_pred)
+        elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor]:
+            result = model.score(X, y_true)
         return result
 
     def __score_func_base(self, model, X, y_true):
-        if type(model) == OmpForestRegressor:
+        if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_regression_score_metric(y_true, y_pred)
         elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
@@ -116,7 +119,7 @@ class Trainer(object):
         elif type(model) == RandomForestClassifier:
             y_pred = model.predict(X)
             result = self._base_classification_score_metric(y_true, y_pred)
-        elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
+        elif type(model) is RandomForestRegressor:
             y_pred = model.predict(X)
             result = self._base_regression_score_metric(y_true, y_pred)
         return result
diff --git a/code/compute_results.py b/code/compute_results.py
index 473044d..408a76a 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -380,20 +380,51 @@ if __name__ == "__main__":
             xlabel='Number of trees extracted',
             ylabel=experiments_score_metric,
             title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
+    elif args.stage == 5:
+        # Retrieve the extracted forest sizes number used in order to have a base forest axis as long as necessary
+        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+
+        # base_with_params
+        logger.info('Loading base_with_params experiment scores...')
+        base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
+            base_with_params_experiment_score_metric = \
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extracted_forest_sizes_number)
+        # random_with_params
+        logger.info('Loading random_with_params experiment scores...')
+        random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
+            with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+        # omp_with_params
+        logger.info('Loading omp_with_params experiment scores...')
+        omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
+            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, args.experiment_ids[2])
+        # kmeans_with_params
+        logger.info('Loading kmeans_with_params experiment scores...')
+        kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
+            kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, args.experiment_ids[3])
+        
+        # Sanity check on the metrics retrieved
+        if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
+            == omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
+            raise ValueError('Score metrics of all experiments must be the same.')
+        experiments_score_metric = base_with_params_experiment_score_metric
+
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
+        pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
+
+        Plotter.plot_stage2_losses(
+            file_path=output_path + os.sep + 'losses.png',
+            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+                kmeans_with_params_test_scores],
+            all_labels=['base', 'random', 'omp', 'kmeans'],
+            x_value=with_params_extracted_forest_sizes,
+            xlabel='Number of trees extracted',
+            ylabel=experiments_score_metric,
+            title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
     else:
         raise ValueError('This stage number is not supported yet, but it will be!')
 
     logger.info('Done.')
-
-    """
-    TODO:
-    For each dataset:
-    Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
-    Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
-    Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
-    Stage 4) A figure to finally compare the perf of our approach using the previous selected
-        parameters vs the baseline vs other papers using different extracted forest size
-        (percentage of the tree size found previously in best hyperparams search) on the abscissa.
-
-    IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
-    """
diff --git a/code/train.py b/code/train.py
index e51514c..0ca2b47 100644
--- a/code/train.py
+++ b/code/train.py
@@ -163,7 +163,7 @@ if __name__ == "__main__":
     parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
     parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
     parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
-    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
+    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.')
     args = parser.parse_args()
 
     if args.experiment_configuration:
@@ -173,7 +173,7 @@ if __name__ == "__main__":
     else:
         parameters = args.__dict__
 
-    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
+    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']:
         raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
 
     pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
-- 
GitLab