From d786531442fdbaba164853b87a4f8b6c700d6748 Mon Sep 17 00:00:00 2001 From: Luc Giffon <luc.giffon@lis-lab.fr> Date: Mon, 4 Nov 2019 17:03:30 +0100 Subject: [PATCH] now the model can make predictions: todo: manage result recording --- code/bolsonaro/models/omp_forest_regressor.py | 50 ++++++++++++++++--- code/bolsonaro/trainer.py | 1 + code/train.py | 5 +- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index 7813da9..2f87892 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -34,6 +34,9 @@ class OmpForestRegressor(BaseEstimator): def models_parameters(self): return self._models_parameters + def score_regressor(self, X, y): + return self._regressor.score(X, y) + def _train_forest(self, X_train, y_train): self._regressor.fit(X_train, y_train) forest = self._regressor.estimators_ @@ -51,24 +54,57 @@ class OmpForestRegressor(BaseEstimator): :return: """ self._logger.debug("Forest make prediction on X_train") - D = np.array([tree.predict(X_train) for tree in self._forest]).T + D = self._forest_prediction(X_train) if self._models_parameters.normalize: self._logger.debug("Compute norm of predicted vectors on X_train") self._forest_norms = np.linalg.norm(D, axis=0) D /= self._forest_norms - omp = OrthogonalMatchingPursuit( n_nonzero_coefs=self._models_parameters.extracted_forest_size, fit_intercept=False, normalize=False) self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees." .format(self._models_parameters.extracted_forest_size)) omp.fit(D, y_train) - weights = omp.coef_ # why not to use directly the omp estimator and bypass it using the coefs? + weights = omp.coef_ + # question: why not to use directly the omp estimator instead of bypassing it using the coefs? 
return weights - def predict(self): - raise NotImplementedError("TODO: implement predict function") - # todo don't forget to deal with the normalize parameter - # should the norm used on train or the new norms be used for normalization? + def _forest_prediction(self, X): + return np.array([tree.predict(X) for tree in self._forest]).T + + def predict(self, X): + """ + Apply the OMPForestRegressor to X. + + :param X: + :return: + """ + D = self._forest_prediction(X) + + if self._models_parameters.normalize: + D /= self._forest_norms + + predictions = D @ self.weights + + return predictions + + + def score(self, X, y, metric="mse"): + """ + Evaluate OMPForestRegressor on (`X`, `y`) using `metric` + + :param X: + :param y: + :param metric: + :return: + """ + predictions = self.predict(X) + + if metric == "mse": + evaluation = np.mean(np.square(predictions - y)) + else: + raise ValueError("Metric value {} is not known.") + + return evaluation \ No newline at end of file diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 95fab4a..0e239e6 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -17,6 +17,7 @@ class Trainer(object): # why is this function named iterate? 
self._logger.info('Training model using train set...') begin_time = time.time() + # todo: OMP may be running with X_dev or Y_dev model.fit(self._dataset.X_train, self._dataset.y_train) end_time = time.time() diff --git a/code/train.py b/code/train.py index 9a50283..7b589a3 100644 --- a/code/train.py +++ b/code/train.py @@ -20,7 +20,7 @@ if __name__ == "__main__": load_dotenv(find_dotenv()) default_dataset_name = 'boston' - default_normalize = False + default_normalize = True default_forest_size = 100 default_extracted_forest_size = 10 # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} @@ -100,3 +100,6 @@ if __name__ == "__main__": model = ModelFactory.build(dataset.task, model_parameters) trainer.iterate(model, sub_models_dir) + + print(model.score(dataset.X_test, dataset.y_test)) + print(model.score_regressor(dataset.X_test, dataset.y_test)) \ No newline at end of file -- GitLab