Commit 690cf820 authored by Luc Giffon's avatar Luc Giffon
Browse files

Add a few comments/docstrings + update gitignore with models files + update readme for .env variables

parent a826e7cc
models/*
*/.kile/*
*.kilepr
# Byte-compiled / optimized / DLL files
......
......@@ -49,5 +49,16 @@ Project Organization
Install project
--------------
First install the project package:
pip install -r requirements.txt
Then create a file `.env` by copying the file `.env.example`:
cp .env.example .env
Then you must set the project directory in the `.env` file :
project_dir = "path/to/your/project/directory"
This directory will be used for storing the model parameters.
\ No newline at end of file
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator
import numpy as np
class OmpForestRegressor(BaseEstimator):
......@@ -35,10 +35,20 @@ class OmpForestRegressor(BaseEstimator):
return forest
def _extract_subforest(self, X_train, y_train):
    """
    Given an already-fitted forest, apply OMP to compute a weight for each tree.

    X_train is used to interrogate every tree in the forest, building the
    prediction matrix D; y_train is the target that OMP approximates as a
    sparse linear combination of the tree predictions.

    :param X_train: (n_samples, n_features) array
    :param y_train: (n_samples,) array
    :return: (n_trees,) array of OMP coefficients, one weight per tree
    """
    # D[i, j] = prediction of tree j on sample i -> shape (n_samples, n_trees).
    # Vectorized: one predict() call per tree instead of one per (tree, sample).
    D = np.array([tree.predict(X_train) for tree in self._forest]).T
    omp = OrthogonalMatchingPursuit(
        n_nonzero_coefs=self._models_parameters.extracted_forest_size,
        fit_intercept=False, normalize=False)
    omp.fit(D, y_train)
    # NOTE(review): consider returning the fitted omp estimator directly
    # instead of only its coefficients.
    weights = omp.coef_
    return weights
......@@ -14,6 +14,7 @@ class Trainer(object):
self._logger = LoggerFactory.create(LOG_PATH, __name__)
def iterate(self, model, models_dir):
# why is this function named iterate?
self._logger.info('Training model using train set...')
begin_time = time.time()
model.fit(self._dataset.X_train, self._dataset.y_train)
......
......@@ -2,6 +2,14 @@ import os
def resolve_experiment_id(models_dir):
"""
Return the ID of the next experiment.
The ID is an int equal to n+1 where n is the current number of directories in `models_dir`.
:param models_dir:
:return:
"""
ids = [x for x in os.listdir(models_dir)
if os.path.isdir(models_dir + os.sep + x)]
if len(ids) > 0:
......
from dotenv import load_dotenv
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
......@@ -5,6 +7,7 @@ from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id
from dotenv import find_dotenv, load_dotenv
import argparse
import pathlib
import random
......@@ -13,11 +16,15 @@ import errno
if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv())
default_dataset_name = 'boston'
default_normalize = False
default_forest_size = 100
default_extracted_forest_size = 10
default_models_dir = 'models'
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
default_models_dir = os.environ["project_dir"] + os.sep + 'models'
default_dev_size = 0.2
default_test_size = 0.2
default_use_random_seed = True
......@@ -43,6 +50,7 @@ if __name__ == "__main__":
if type(args.extracted_forest_size) == list \
else [args.extracted_forest_size]
# todo the seeds shouldn't be randomly generated but fixed in range instead. We want it to be reproducible: exact same arguments should return exact same results.
random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(args.random_seed_number)] \
if args.use_random_seed else None
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment