From 690cf820b4d557f66999cfe96cea0835cf88b100 Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Mon, 4 Nov 2019 16:14:36 +0100
Subject: [PATCH] Add a few comments/docstrings + update gitignore with the
 models directory + update README for .env variables

---
 .gitignore                                    |  2 ++
 README.md                                     | 11 +++++++++++
 code/bolsonaro/models/omp_forest_regressor.py | 16 +++++++++++++---
 code/bolsonaro/trainer.py                     |  1 +
 code/bolsonaro/utils.py                       |  8 ++++++++
 code/train.py                                 | 10 +++++++++-
 6 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9cc2e27..be95874 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+models/*
+
 */.kile/*
 *.kilepr
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index 8ffb3de..8d46ebd 100644
--- a/README.md
+++ b/README.md
@@ -49,5 +49,16 @@ Project Organization
 Instal project
 --------------
 
+First install the project dependencies:
+
     pip install -r requirements.txt
 
+Then create a `.env` file by copying the `.env.example` file:
+
+    cp .env.example .env
+
+Then set the project directory in the `.env` file:
+
+    project_dir = "path/to/your/project/directory"
+
+This directory will be used to store the model parameters.
\ No newline at end of file
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index be60cae..9de6c74 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -1,7 +1,7 @@
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.linear_model import OrthogonalMatchingPursuit
 from sklearn.base import BaseEstimator
-
+import numpy as np
 
 class OmpForestRegressor(BaseEstimator):
 
@@ -35,10 +35,20 @@ class OmpForestRegressor(BaseEstimator):
         return forest
 
     def _extract_subforest(self, X_train, y_train):
-        D = [[tree.predict([elem])[0] for tree in self._forest] for elem in X_train]
+        """
+        Given an already fitted forest, apply OMP to get the weight of each tree.
+
+        X_train is used to query every tree of the forest; y_train is the target
+        used by OMP to compute the weights.
+
+        :param X_train: (n_samples, n_features) array
+        :param y_train: (n_samples,) array
+        :return: (n_trees,) array of tree weights
+        """
+        D = np.array([tree.predict(X_train) for tree in self._forest]).T
         omp = OrthogonalMatchingPursuit(
             n_nonzero_coefs=self._models_parameters.extracted_forest_size,
             fit_intercept=False, normalize=False)
         omp.fit(D, y_train)
-        weights = omp.coef_
+        weights = omp.coef_  # why not use the omp estimator directly instead of extracting its coefficients?
         return weights
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 7c1436b..95fab4a 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -14,6 +14,7 @@ class Trainer(object):
         self._logger = LoggerFactory.create(LOG_PATH, __name__)
 
     def iterate(self, model, models_dir):
+        # why is this function named iterate?
         self._logger.info('Training model using train set...')
         begin_time = time.time()
         model.fit(self._dataset.X_train, self._dataset.y_train)
diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 2affd37..96fbb06 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -2,6 +2,14 @@
 import os
 
 def resolve_experiment_id(models_dir):
+    """
+    Return the ID of the next experiment.
+
+    The ID is an integer equal to n+1, where n is the current number of
+    directories in `models_dir`.
+    :param models_dir: directory in which the experiment folders are stored
+    :return: the next experiment ID (int)
+    """
     ids = [x for x in os.listdir(models_dir) if os.path.isdir(models_dir + os.sep + x)]
 
     if len(ids) > 0:
diff --git a/code/train.py b/code/train.py
index 0e6896c..74d90a5 100644
--- a/code/train.py
+++ b/code/train.py
@@ -1,3 +1,5 @@
+import os
+
 from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_factory import ModelFactory
@@ -5,6 +7,7 @@ from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.trainer import Trainer
 from bolsonaro.utils import resolve_experiment_id
 
+from dotenv import find_dotenv, load_dotenv
 import argparse
 import pathlib
 import random
@@ -13,11 +16,15 @@
 import errno
 
 if __name__ == "__main__":
+    # load the environment variables defined in the .env file
+    load_dotenv(find_dotenv())
+
     default_dataset_name = 'boston'
     default_normalize = False
     default_forest_size = 100
     default_extracted_forest_size = 10
-    default_models_dir = 'models'
+    # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
+    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
     default_dev_size = 0.2
     default_test_size = 0.2
     default_use_random_seed = True
@@ -43,6 +50,7 @@ if __name__ == "__main__":
         if type(args.extracted_forest_size) == list \
         else [args.extracted_forest_size]
 
+    # TODO: the seeds shouldn't be randomly generated but fixed in a range instead; runs must be reproducible: the exact same arguments should return the exact same results.
     random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \
         for i in range(args.random_seed_number)] \
         if args.use_random_seed else None
-- 
GitLab
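
For context, a minimal sketch (not part of the patch) of how the `project_dir` variable described in the README reaches `train.py`: `load_dotenv(find_dotenv())` loads the variables defined in `.env` into `os.environ`, and the default models directory is then built from it.

    # sketch only: assumes python-dotenv is installed and that .env defines project_dir
    import os
    from dotenv import find_dotenv, load_dotenv

    load_dotenv(find_dotenv())  # find .env by walking up from the current directory, then load it
    default_models_dir = os.environ["project_dir"] + os.sep + "models"
    print(default_models_dir)  # e.g. path/to/your/project/directory/models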
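
The `_extract_subforest` docstring can likewise be illustrated with a self-contained sketch (again not part of the patch; the forest, the data, and `extracted_forest_size` below are made up for the example). Each tree's predictions on `X_train` form one column of `D`, and OMP then fits `y_train` with a sparse weight vector over those columns, so only `extracted_forest_size` trees get a non-zero weight. The patch additionally passes `normalize=False`, which newer scikit-learn versions deprecate, so it is left out here.

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import OrthogonalMatchingPursuit

    X_train, y_train = make_regression(n_samples=200, n_features=10, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

    # D[i, j] = prediction of tree j on sample i
    D = np.array([tree.predict(X_train) for tree in forest.estimators_]).T

    extracted_forest_size = 10  # number of trees kept by OMP
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=extracted_forest_size, fit_intercept=False)
    omp.fit(D, y_train)
    weights = omp.coef_  # shape (n_estimators,), non-zero only for the selected trees

Regarding the question left in `omp_forest_regressor.py`: keeping the fitted `omp` object instead of only `weights` would let the pruned forest predict with `omp.predict(D_new)`, which, since `fit_intercept=False`, is the same as `D_new @ weights`.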
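
Finally, one possible answer to the TODO about reproducible seeds, as a sketch only (`generate_seeds` is a hypothetical helper, not something the patch adds): draw the seeds from a generator with a fixed seed so that the same arguments always produce the same list.

    import random

    def generate_seeds(random_seed_number, begin=1, end=2000):
        # hypothetical helper: seeding the generator makes the drawn seeds reproducible;
        # the simplest alternative is just list(range(random_seed_number))
        rng = random.Random(0)
        return [rng.randint(begin, end) for _ in range(random_seed_number)]

    print(generate_seeds(3))  # prints the same three seeds on every run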