From d786531442fdbaba164853b87a4f8b6c700d6748 Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Mon, 4 Nov 2019 17:03:30 +0100
Subject: [PATCH] now the model can make predictions: todo: manage result
 recording

---
 code/bolsonaro/models/omp_forest_regressor.py | 50 ++++++++++++++++---
 code/bolsonaro/trainer.py                     |  1 +
 code/train.py                                 |  5 +-
 3 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index 7813da9..2f87892 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -34,6 +34,9 @@ class OmpForestRegressor(BaseEstimator):
     def models_parameters(self):
         return self._models_parameters
 
+    def score_regressor(self, X, y):
+        return self._regressor.score(X, y)
+
     def _train_forest(self, X_train, y_train):
         self._regressor.fit(X_train, y_train)
         forest = self._regressor.estimators_
@@ -51,24 +54,57 @@ class OmpForestRegressor(BaseEstimator):
         :return:
         """
         self._logger.debug("Forest make prediction on X_train")
-        D = np.array([tree.predict(X_train) for tree in self._forest]).T
+        D = self._forest_prediction(X_train)
 
         if self._models_parameters.normalize:
             self._logger.debug("Compute norm of predicted vectors on X_train")
             self._forest_norms = np.linalg.norm(D, axis=0)
             D /= self._forest_norms
 
-
         omp = OrthogonalMatchingPursuit(
             n_nonzero_coefs=self._models_parameters.extracted_forest_size,
             fit_intercept=False, normalize=False)
         self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
                            .format(self._models_parameters.extracted_forest_size))
         omp.fit(D, y_train)
-        weights = omp.coef_  # why not to use directly the omp estimator and bypass it using the coefs?
+        weights = omp.coef_
+        # question: why not use the omp estimator directly instead of bypassing it via the coefs?
         return weights
 
-    def predict(self):
-        raise NotImplementedError("TODO: implement predict function")
-        # todo don't forget to deal with the normalize parameter
-        # should the norm used on train or the new norms be used for normalization?
+    def _forest_prediction(self, X):
+        return np.array([tree.predict(X) for tree in self._forest]).T
+
+    def predict(self, X):
+        """
+        Apply the OMPForestRegressor to X.
+
+        :param X:
+        :return:
+        """
+        D = self._forest_prediction(X)
+
+        if self._models_parameters.normalize:
+            D /= self._forest_norms
+
+        predictions = D @ self.weights
+
+        return predictions
+
+
+    def score(self, X, y, metric="mse"):
+        """
+        Evaluate OMPForestRegressor on (`X`, `y`) using `metric`
+
+        :param X:
+        :param y:
+        :param metric:
+        :return:
+        """
+        predictions = self.predict(X)
+
+        if metric == "mse":
+            evaluation = np.mean(np.square(predictions - y))
+        else:
+            raise ValueError("Metric value {} is not known.".format(metric))
+
+        return evaluation
\ No newline at end of file
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 95fab4a..0e239e6 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -17,6 +17,7 @@ class Trainer(object):
         # why is this function named iterate?
         self._logger.info('Training model using train set...')
         begin_time = time.time()
+        # todo: OMP may be running with X_dev or y_dev
         model.fit(self._dataset.X_train, self._dataset.y_train)
         end_time = time.time()
 
diff --git a/code/train.py b/code/train.py
index 9a50283..7b589a3 100644
--- a/code/train.py
+++ b/code/train.py
@@ -20,7 +20,7 @@ if __name__ == "__main__":
     load_dotenv(find_dotenv())
 
     default_dataset_name = 'boston'
-    default_normalize = False
+    default_normalize = True
     default_forest_size = 100
     default_extracted_forest_size = 10
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
@@ -100,3 +100,6 @@ if __name__ == "__main__":
             model = ModelFactory.build(dataset.task, model_parameters)
 
             trainer.iterate(model, sub_models_dir)
+
+            print(model.score(dataset.X_test, dataset.y_test))
+            print(model.score_regressor(dataset.X_test, dataset.y_test))
\ No newline at end of file
-- 
GitLab