From b62b7df7d6eed466d0e095511ccc8096b2512f8b Mon Sep 17 00:00:00 2001 From: Luc Giffon <luc.giffon@lis-lab.fr> Date: Mon, 4 Nov 2019 16:42:11 +0100 Subject: [PATCH] support for normalize parameter + optimisation on train (wtf was that for loop) --- code/bolsonaro/models/model_parameters.py | 10 ++++++++-- code/bolsonaro/models/omp_forest_regressor.py | 20 +++++++++++++++++++ code/train.py | 3 ++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py index b1fec8c..2d8dba5 100644 --- a/code/bolsonaro/models/model_parameters.py +++ b/code/bolsonaro/models/model_parameters.py @@ -4,10 +4,11 @@ import os class ModelParameters(object): - def __init__(self, forest_size, extracted_forest_size, seed=None): + def __init__(self, forest_size, extracted_forest_size, normalize, seed=None): self._forest_size = forest_size self._extracted_forest_size = extracted_forest_size self._seed = seed + self._normalize = normalize @property def forest_size(self): @@ -21,12 +22,17 @@ class ModelParameters(object): def seed(self): return self._seed + @property + def normalize(self): + return self._normalize + def save(self, directory_path, experiment_id): with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file: json.dump({ 'forest_size': self._forest_size, 'extracted_forest_size': self._extracted_forest_size, - 'seed': self._seed + 'seed': self._seed, + 'normalize': self._normalize }, output_file, indent=4) diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index 9de6c74..7813da9 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -3,12 +3,17 @@ from sklearn.linear_model import OrthogonalMatchingPursuit from sklearn.base import BaseEstimator import numpy as np +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory + + class OmpForestRegressor(BaseEstimator): def __init__(self, models_parameters): self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size, random_state=models_parameters.seed) self._models_parameters = models_parameters + self._logger = LoggerFactory.create(LOG_PATH, __name__) def fit(self, X_train, y_train): self._forest = self._train_forest(X_train, y_train) @@ -45,10 +50,25 @@ class OmpForestRegressor(BaseEstimator): :param y_train: (n_sample,) array :return: """ + self._logger.debug("Forest make prediction on X_train") D = np.array([tree.predict(X_train) for tree in self._forest]).T + + if self._models_parameters.normalize: + self._logger.debug("Compute norm of predicted vectors on X_train") + self._forest_norms = np.linalg.norm(D, axis=0) + D /= self._forest_norms + + omp = OrthogonalMatchingPursuit( n_nonzero_coefs=self._models_parameters.extracted_forest_size, fit_intercept=False, normalize=False) + self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees." .format(self._models_parameters.extracted_forest_size)) omp.fit(D, y_train) weights = omp.coef_ # why not to use directly the omp estimator and bypass it using the coefs? return weights + + def predict(self): raise NotImplementedError("TODO: implement predict function") # todo don't forget to deal with the normalize parameter # should the norm used on train or the new norms be used for normalization? diff --git a/code/train.py b/code/train.py index 74d90a5..9a50283 100644 --- a/code/train.py +++ b/code/train.py @@ -92,7 +92,8 @@ if __name__ == "__main__": model_parameters = ModelParameters( forest_size=args.forest_size, extracted_forest_size=extracted_forest_size, - seed=random_seed + seed=random_seed, + normalize=args.normalize ) model_parameters.save(sub_models_dir, experiment_id) -- GitLab