From 690cf820b4d557f66999cfe96cea0835cf88b100 Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Mon, 4 Nov 2019 16:14:36 +0100
Subject: [PATCH] Add a few comments/docstrings + update gitignore with the
 models directory + update README for .env variables

---
 .gitignore                                    |  2 ++
 README.md                                     | 11 +++++++++++
 code/bolsonaro/models/omp_forest_regressor.py | 16 +++++++++++++---
 code/bolsonaro/trainer.py                     |  1 +
 code/bolsonaro/utils.py                       |  8 ++++++++
 code/train.py                                 | 10 +++++++++-
 6 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 9cc2e27..be95874 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+models/*
+
 */.kile/*
 *.kilepr
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index 8ffb3de..8d46ebd 100644
--- a/README.md
+++ b/README.md
@@ -49,5 +49,16 @@ Project Organization
 Instal project
 --------------
 
+First install the project dependencies:
+
     pip install -r requirements.txt
 
+Then create a `.env` file by copying the `.env.example` file:
+
+    cp .env.example .env
+
+Then set the project directory in the `.env` file:
+
+    project_dir = "path/to/your/project/directory"
+
+This directory will be used to store the model parameters.
\ No newline at end of file
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index be60cae..9de6c74 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -1,7 +1,7 @@
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.linear_model import OrthogonalMatchingPursuit
 from sklearn.base import BaseEstimator
-
+import numpy as np
 
 class OmpForestRegressor(BaseEstimator):
 
@@ -35,10 +35,20 @@ class OmpForestRegressor(BaseEstimator):
         return forest
 
     def _extract_subforest(self, X_train, y_train):
-        D = [[tree.predict([elem])[0] for tree in self._forest] for elem in X_train]
+        """
+        Given an already fitted forest, apply OMP to get the weight of each tree.
+
+        X_train is used to query every tree of the forest; y_train is the target
+        used by OMP to compute the weights.
+
+        :param X_train: (n_samples, n_features) array
+        :param y_train: (n_samples,) array
+        :return: (n_trees,) array of tree weights
+        """
+        D = np.array([tree.predict(X_train) for tree in self._forest]).T
         omp = OrthogonalMatchingPursuit(
             n_nonzero_coefs=self._models_parameters.extracted_forest_size,
             fit_intercept=False, normalize=False)
         omp.fit(D, y_train)
-        weights = omp.coef_
+        weights = omp.coef_  # why not use the omp estimator directly instead of extracting its coefficients?
         return weights
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 7c1436b..95fab4a 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -14,6 +14,7 @@ class Trainer(object):
         self._logger = LoggerFactory.create(LOG_PATH, __name__)
 
     def iterate(self, model, models_dir):
+        # why is this function named iterate?
         self._logger.info('Training model using train set...')
         begin_time = time.time()
         model.fit(self._dataset.X_train, self._dataset.y_train)
diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 2affd37..96fbb06 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -2,6 +2,14 @@
 import os
 
 def resolve_experiment_id(models_dir):
+    """
+    Return the ID of the next experiment.
+
+    The ID is an integer equal to n+1, where n is the current number of
+    directories in `models_dir`.
+    :param models_dir: directory in which the experiment folders are stored
+    :return: the next experiment ID (int)
+    """
     ids = [x for x in os.listdir(models_dir) if os.path.isdir(models_dir + os.sep + x)]
 
     if len(ids) > 0:
diff --git a/code/train.py b/code/train.py
index 0e6896c..74d90a5 100644
--- a/code/train.py
+++ b/code/train.py
@@ -1,3 +1,5 @@
+import os
+
 from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_factory import ModelFactory
@@ -5,6 +7,7 @@ from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.trainer import Trainer
 from bolsonaro.utils import resolve_experiment_id
 
+from dotenv import find_dotenv, load_dotenv
 import argparse
 import pathlib
 import random
@@ -13,11 +16,15 @@
 import errno
 
 if __name__ == "__main__":
+    # load the environment variables defined in the .env file
+    load_dotenv(find_dotenv())
+
     default_dataset_name = 'boston'
     default_normalize = False
     default_forest_size = 100
     default_extracted_forest_size = 10
-    default_models_dir = 'models'
+    # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
+    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
     default_dev_size = 0.2
     default_test_size = 0.2
     default_use_random_seed = True
@@ -43,6 +50,7 @@ if __name__ == "__main__":
         if type(args.extracted_forest_size) == list \
         else [args.extracted_forest_size]
 
+    # TODO: the seeds shouldn't be randomly generated but fixed in a range instead; runs must be reproducible: the exact same arguments should return the exact same results.
     random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \
         for i in range(args.random_seed_number)] \
         if args.use_random_seed else None
-- 
GitLab
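
For context, a minimal sketch (not part of the patch) of how the `project_dir` variable described in the README reaches `train.py`: `load_dotenv(find_dotenv())` loads the variables defined in `.env` into `os.environ`, and the default models directory is then built from it.

    # sketch only: assumes python-dotenv is installed and that .env defines project_dir
    import os
    from dotenv import find_dotenv, load_dotenv

    load_dotenv(find_dotenv())  # find .env by walking up from the current directory, then load it
    default_models_dir = os.environ["project_dir"] + os.sep + "models"
    print(default_models_dir)  # e.g. path/to/your/project/directory/models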
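
The `_extract_subforest` docstring can likewise be illustrated with a self-contained sketch (again not part of the patch; the forest, the data, and `extracted_forest_size` below are made up for the example). Each tree's predictions on `X_train` form one column of `D`, and OMP then fits `y_train` with a sparse weight vector over those columns, so only `extracted_forest_size` trees get a non-zero weight. The patch additionally passes `normalize=False`, which newer scikit-learn versions deprecate, so it is left out here.

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import OrthogonalMatchingPursuit

    X_train, y_train = make_regression(n_samples=200, n_features=10, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_train, y_train)

    # D[i, j] = prediction of tree j on sample i
    D = np.array([tree.predict(X_train) for tree in forest.estimators_]).T

    extracted_forest_size = 10  # number of trees kept by OMP
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=extracted_forest_size, fit_intercept=False)
    omp.fit(D, y_train)
    weights = omp.coef_  # shape (n_estimators,), non-zero only for the selected trees

Regarding the question left in `omp_forest_regressor.py`: keeping the fitted `omp` object instead of only `weights` would let the pruned forest predict with `omp.predict(D_new)`, which, since `fit_intercept=False`, is the same as `D_new @ weights`.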
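
Finally, one possible answer to the TODO about reproducible seeds, as a sketch only (`generate_seeds` is a hypothetical helper, not something the patch adds): draw the seeds from a generator with a fixed seed so that the same arguments always produce the same list.

    import random

    def generate_seeds(random_seed_number, begin=1, end=2000):
        # hypothetical helper: seeding the generator makes the drawn seeds reproducible;
        # the simplest alternative is just list(range(random_seed_number))
        rng = random.Random(0)
        return [rng.randint(begin, end) for _ in range(random_seed_number)]

    print(generate_seeds(3))  # prints the same three seeds on every run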