From b62b7df7d6eed466d0e095511ccc8096b2512f8b Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Mon, 4 Nov 2019 16:42:11 +0100
Subject: [PATCH] support for normalize parameter + optimisation on train (wtf
 was that for loop)

---
 code/bolsonaro/models/model_parameters.py     | 10 ++++++++--
 code/bolsonaro/models/omp_forest_regressor.py | 20 +++++++++++++++++++
 code/train.py                                 |  3 ++-
 3 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index b1fec8c..2d8dba5 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -4,10 +4,11 @@ import os
 
 class ModelParameters(object):
 
-    def __init__(self, forest_size, extracted_forest_size, seed=None):
+    def __init__(self, forest_size, extracted_forest_size, normalize, seed=None):
         self._forest_size = forest_size
         self._extracted_forest_size = extracted_forest_size
         self._seed = seed
+        self._normalize = normalize
 
     @property
     def forest_size(self):
@@ -21,12 +22,17 @@ class ModelParameters(object):
     def seed(self):
         return self._seed
 
+    @property
+    def normalize(self):
+        return self._normalize
+
     def save(self, directory_path, experiment_id):
         with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file:
             json.dump({
                 'forest_size': self._forest_size,
                 'extracted_forest_size': self._extracted_forest_size,
-                'seed': self._seed
+                'seed': self._seed,
+                'normalize': self._normalize
             },
             output_file,
             indent=4)
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index 9de6c74..7813da9 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -3,12 +3,17 @@ from sklearn.linear_model import OrthogonalMatchingPursuit
 from sklearn.base import BaseEstimator
 import numpy as np
 
+from bolsonaro import LOG_PATH
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+
+
 class OmpForestRegressor(BaseEstimator):
 
     def __init__(self, models_parameters):
         self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
             random_state=models_parameters.seed)
         self._models_parameters = models_parameters
+        self._logger = LoggerFactory.create(LOG_PATH, __name__)
 
     def fit(self, X_train, y_train):
         self._forest = self._train_forest(X_train, y_train)
@@ -45,10 +50,25 @@ class OmpForestRegressor(BaseEstimator):
         :param y_train: (n_sample,) array
         :return:
         """
+        self._logger.debug("Forest make prediction on X_train")
         D = np.array([tree.predict(X_train) for tree in self._forest]).T
+
+        if self._models_parameters.normalize:
+            self._logger.debug("Compute norm of predicted vectors on X_train")
+            self._forest_norms = np.linalg.norm(D, axis=0)
+            D /= self._forest_norms
+
+
         omp = OrthogonalMatchingPursuit(
             n_nonzero_coefs=self._models_parameters.extracted_forest_size,
             fit_intercept=False, normalize=False)
+        self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
+                           .format(self._models_parameters.extracted_forest_size))
         omp.fit(D, y_train)
         weights = omp.coef_  # why not to use directly the omp estimator and bypass it using the coefs?
         return weights
+
+    def predict(self):
+        raise NotImplementedError("TODO: implement predict function")
+        # TODO: don't forget to handle the normalize parameter here.
+        # Should the norms computed on train be reused, or should new norms be computed at prediction time?
diff --git a/code/train.py b/code/train.py
index 74d90a5..9a50283 100644
--- a/code/train.py
+++ b/code/train.py
@@ -92,7 +92,8 @@ if __name__ == "__main__":
             model_parameters = ModelParameters(
                 forest_size=args.forest_size,
                 extracted_forest_size=extracted_forest_size,
-                seed=random_seed
+                seed=random_seed,
+                normalize=args.normalize
             )
             model_parameters.save(sub_models_dir, experiment_id)
 
-- 
GitLab