Add Paolo's first implementation of this paper:...

Add Paolo's first implementation of this paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/

Add Paolo's first implementation of this paper:...
c80ddd61 · Charly Lamothe · 21ccc627 · c80ddd61 · c80ddd61 · c80ddd61
Commit c80ddd61 authored 5 years ago by Charly Lamothe
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
-from bolsonaro.data.task import Task
 from bolsonaro.models.model_parameters import ModelParameters
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.data.task import Task
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 import os
@@ -30,6 +31,8 @@ class ModelFactory(object):
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size,
                    random_state=model_parameters.seed)
+            elif model_parameters.extraction_strategy == 'similarity':
+                return SimilarityForestRegressor(model_parameters)
            else:
                return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
                    random_state=model_parameters.seed)

--- a/code/bolsonaro/models/similarity_forest_regressor.py
+++ b/code/bolsonaro/models/similarity_forest_regressor.py
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_squared_error
+from sklearn.base import BaseEstimator
+from abc import abstractmethod, ABCMeta
+import numpy as np
+class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
+    """
+    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
+    """
+    def __init__(self, models_parameters):
+        self._models_parameters = models_parameters
+        self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
+            random_state=models_parameters.seed)
+        self._extracted_forest_size = self._models_parameters.extracted_forest_size
+    @property
+    def models_parameters(self):
+        return self._models_parameters
+    def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
+        self._regressor.fit(X_train, y_train)
+        y_val_pred = self._regressor.predict(X_val)
+        forest_pred = score_metric(y_val, y_val_pred)
+        forest = self._regressor.estimators_
+        selected_trees = list()
+        tree_list = list(self._regressor.estimators_)
+        for _ in range(self._extracted_forest_size):
+            best_similarity = 100000
+            found_index = 0
+            for i in range(len(tree_list)):
+                lonely_tree = tree_list[i]
+                del tree_list[i]
+                val_list = list()
+                for tree in tree_list:
+                    val_pred = tree.predict(X_val)
+                    val_list.append(val_pred)
+                val_list = np.array(val_list)
+                val_mean = np.mean(val_list, axis=0)
+                val_score = score_metric(val_mean, y_val)
+                temp_similarity = abs(forest_pred - val_score)
+                if (temp_similarity < best_similarity):
+                    found_index = i
+                    best_similarity = temp_similarity
+                tree_list.insert(i, lonely_tree)
+            selected_trees.append(tree_list[found_index])
+            del tree_list[found_index]
+        pruned_forest = list(set(forest) - set(selected_trees))
+        self._regressor.estimators_ = pruned_forest
+    def score(self, X, y):
+        test_list = list()
+        for mod in self._regressor.estimators_:
+            test_pred = mod.predict(X)
+            test_list.append(test_pred)
+        test_list = np.array(test_list)
+        test_mean = np.mean(test_list, axis=0)
+        score = mean_squared_error(test_mean, y)
+        return score
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
 from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.data.task import Task
 from . import LOG_PATH
@@ -87,15 +88,15 @@ class Trainer(object):
            )
        else:
            model.fit(
-                X_forest=self._X_forest,
+                self._X_forest,
-                y_forest=self._y_forest,
+                self._y_forest,
-                X_omp=self._X_omp,
+                self._X_omp,
-                y_omp=self._y_omp
+                self._y_omp
            )
        self._end_time = time.time()
    def __score_func(self, model, X, y_true):
-        if type(model) in [OmpForestRegressor, RandomForestRegressor]:
+        if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
            y_pred = model.predict(X)
            result = self._regression_score_metric(y_true, y_pred)
        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
@@ -115,7 +116,7 @@ class Trainer(object):
        elif type(model) == RandomForestClassifier:
            y_pred = model.predict(X)
            result = self._base_classification_score_metric(y_true, y_pred)
-        elif type(model) == RandomForestRegressor:
+        elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
            y_pred = model.predict(X)
            result = self._base_regression_score_metric(y_true, y_pred)
        return result

--- a/code/train.py
+++ b/code/train.py
@@ -163,7 +163,7 @@ if __name__ == "__main__":
    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
    parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
-    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random or none.')
+    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
    args = parser.parse_args()
    if args.experiment_configuration:
@@ -173,7 +173,7 @@ if __name__ == "__main__":
    else:
        parameters = args.__dict__
-    if parameters['extraction_strategy'] not in ['omp', 'random', 'none']:
+    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)

--- a/results/california_housing/stage4/losses_2.png
+++ b/results/california_housing/stage4/losses_2.png
--- a/results/california_housing/stage4_backup/losses_2.png
+++ b/results/california_housing/stage4_backup/losses_2.png