diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000000000000000000000000000000000000..9ca543b382b4889be1d93ed2065ef234b789153c
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,12 @@
+# Environment variables go here, can be read by `python-dotenv` package:
+#
+#   `src/script.py`
+#   ----------------------------------------------------------------
+#    import dotenv
+#
+#    project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
+#    dotenv_path = os.path.join(project_dir, '.env')
+#    dotenv.load_dotenv(dotenv_path)
+#   ----------------------------------------------------------------
+
+project_dir = "."
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 9cc2e27f5cc3bd830b9038f8562b9edc51eb3f19..be95874710f5ee55a4e0104c6f78ff8a21abb705 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
+models/*
+
 */.kile/*
 *.kilepr
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index 8ffb3de812f21466cd7bfe0aaf8c92518529963c..8d46ebd2a36736d601c1dc94ee3f19c5118555fd 100644
--- a/README.md
+++ b/README.md
@@ -49,5 +49,16 @@ Project Organization
 Instal project
 --------------
 
+First install the project package:
+
 	pip install -r requirements.txt
+
+Then create a file `.env` by copying the file `.env.example`:
 	
+	cp .env.example .env
+	
+Then you must set the project directory in the `.env` file:
+ 
+	project_dir = "path/to/your/project/directory"	
+
+This directory will be used for storing the model parameters.
\ No newline at end of file
diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index 6ad4b1f769d35b67b9ebcec6dae6b03ed68607e7..4ab7e3faf434959ba9da0d32b3c3801b26775ec3 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -71,7 +71,7 @@ class DatasetLoader(object):
             test_size=dataset_parameters.dev_size,
             random_state=dataset_parameters.random_state)
 
-        # TODO
+        # TODO?
         if dataset_parameters.normalize:
             pass
 
diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py
index 556c96059189397cad7a96f035316aec55bd944a..f19dc903a74488f006cc3f54107a7cd3cdc2f81c 100644
--- a/code/bolsonaro/data/dataset_parameters.py
+++ b/code/bolsonaro/data/dataset_parameters.py
@@ -4,12 +4,13 @@ import os
 
 class DatasetParameters(object):
 
-    def __init__(self, name, test_size, dev_size, random_state, normalize):
+    def __init__(self, name, test_size, dev_size, random_state, normalize, train_on_subset):
         self._name = name
         self._test_size = test_size
         self._dev_size = dev_size
         self._random_state = random_state
         self._normalize = normalize
+        self._train_on_subset = train_on_subset
 
     @property
     def name(self):
@@ -31,6 +32,10 @@ class DatasetParameters(object):
     def normalize(self):
         return self._normalize
 
+    @property
+    def train_on_subset(self):
+        return self._train_on_subset
+
     def save(self, directory_path, experiment_id):
         with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file:
             json.dump({
@@ -38,7 +43,8 @@ class DatasetParameters(object):
                 'test_size': self._test_size,
                 'dev_size': self._dev_size,
                 'random_state': self._random_state,
-                'normalize': self._normalize
+                'normalize': self._normalize,
+                'train_on_subset': self._train_on_subset
             },
             output_file,
             indent=4)
diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index b1fec8c6516aff4dbf8e2a0c521387e001ae242b..2d8dba5f78b29f6db093f497a63bee5cf634822c 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -4,10 +4,11 @@ import os
 
 class ModelParameters(object):
 
-    def __init__(self, forest_size, extracted_forest_size, seed=None):
+    def __init__(self, forest_size, extracted_forest_size, normalize, seed=None):
         self._forest_size = forest_size
         self._extracted_forest_size = extracted_forest_size
         self._seed = seed
+        self._normalize = normalize
 
     @property
     def forest_size(self):
@@ -21,12 +22,17 @@ class ModelParameters(object):
     def seed(self):
         return self._seed
 
+    @property
+    def normalize(self):
+        return self._normalize
+
     def save(self, directory_path, experiment_id):
         with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file:
             json.dump({
                 'forest_size': self._forest_size,
                 'extracted_forest_size': self._extracted_forest_size,
-                'seed': self._seed
+                'seed': self._seed,
+                'normalize': self._normalize
             },
             output_file,
             indent=4)
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index be60cae324c4542eed468765cbb1e8e6b8c86f5f..8fe7dc9bbe4061f259bdc74417b38a607ac6db46 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -1,6 +1,10 @@
+from bolsonaro import LOG_PATH
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.linear_model import OrthogonalMatchingPursuit
 from sklearn.base import BaseEstimator
+import numpy as np
 
 
 class OmpForestRegressor(BaseEstimator):
@@ -9,6 +13,7 @@ class OmpForestRegressor(BaseEstimator):
         self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
             random_state=models_parameters.seed)
         self._models_parameters = models_parameters
+        self._logger = LoggerFactory.create(LOG_PATH, __name__)
 
     def fit(self, X_train, y_train):
         self._forest = self._train_forest(X_train, y_train)
@@ -29,16 +34,78 @@ class OmpForestRegressor(BaseEstimator):
     def models_parameters(self):
         return self._models_parameters
 
+    def score_regressor(self, X, y):
+        return self._regressor.score(X, y)
+
     def _train_forest(self, X_train, y_train):
         self._regressor.fit(X_train, y_train)
         forest = self._regressor.estimators_
         return forest
     
     def _extract_subforest(self, X_train, y_train):
-        D = [[tree.predict([elem])[0] for tree in self._forest] for elem in X_train]
+        """
+        Given an already estimated regressor: apply OMP to get the weight of each tree.
+
+        The X_train data is used for interrogation of every tree in the forest. The y_train data
+        is used for finding the weights in OMP.
+
+        :param X_train: (n_sample, n_features) array
+        :param y_train: (n_sample,) array
+        :return:
+        """
+        self._logger.debug("Forest make prediction on X_train")
+        D = self._forest_prediction(X_train)
+
+        if self._models_parameters.normalize:
+            # question: maybe consider other kinds of normalization
+            self._logger.debug("Compute norm of predicted vectors on X_train")
+            self._forest_norms = np.linalg.norm(D, axis=0)
+            D /= self._forest_norms
+
         omp = OrthogonalMatchingPursuit(
             n_nonzero_coefs=self._models_parameters.extracted_forest_size,
             fit_intercept=False, normalize=False)
+        self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees."
+                           .format(self._models_parameters.extracted_forest_size))
         omp.fit(D, y_train)
         weights = omp.coef_
+        # question: why not use the omp estimator directly instead of bypassing it via its coefs?
         return weights
+
+    def _forest_prediction(self, X):
+        return np.array([tree.predict(X) for tree in self._forest]).T
+
+    def predict(self, X):
+        """
+        Apply the OMPForestRegressor to X.
+
+        :param X:
+        :return:
+        """
+        D = self._forest_prediction(X)
+
+        if self._models_parameters.normalize:
+            D /= self._forest_norms
+
+        predictions = D @ self.weights
+
+        return predictions
+
+
+    def score(self, X, y, metric="mse"):
+        """
+        Evaluate OMPForestRegressor on (`X`, `y`) using `metric`
+
+        :param X:
+        :param y:
+        :param metric:
+        :return:
+        """
+        predictions = self.predict(X)
+
+        if metric == "mse":
+            evaluation = np.mean(np.square(predictions - y))
+        else:
+            raise ValueError("Metric value {} is not known.".format(metric))
+
+        return evaluation
\ No newline at end of file
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 7c1436b87b48e7069500ba85f1e60104ac619c4c..3370ce7b0e9211ffaa92305b4ddcadfc41ea5cb1 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -13,12 +13,21 @@ class Trainer(object):
         self._dataset = dataset
         self._logger = LoggerFactory.create(LOG_PATH, __name__)
 
-    def iterate(self, model, models_dir):
+    def train(self, model, models_dir):
         self._logger.info('Training model using train set...')
         begin_time = time.time()
-        model.fit(self._dataset.X_train, self._dataset.y_train)
+        if self._dataset.dataset_parameters.train_on_subset == 'train':
+            X, y = self._dataset.X_train, self._dataset.y_train
+        elif self._dataset.dataset_parameters.train_on_subset == 'dev':
+            X, y = self._dataset.X_dev, self._dataset.y_dev
+        else:
+            raise ValueError("Unsupported train_on_subset value '{}'".format(self._dataset.dataset_parameters.train_on_subset))
+        model.fit(X, y)
         end_time = time.time()
 
+        self._dump_raw_results(models_dir, model, end_time, begin_time)
+
+    def _dump_raw_results(self, models_dir, model, end_time, begin_time):
         output_file_path = models_dir + os.sep + 'model.pickle'
         self._logger.info('Saving trained model to {}'.format(output_file_path))
         with open(output_file_path, 'wb') as output_file:
diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 2affd37e8e20e96406b1d2fc585b3e042d7a1d2d..96fbb06217b2771c1cf906f7ee5d89097296bcb1 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -2,6 +2,14 @@ import os
 
 
 def resolve_experiment_id(models_dir):
+    """
+    Return the ID of the next experiment.
+
+    The ID is an int equal to n+1, where n is the current number of
+    directories in `models_dir`.
+    :param models_dir:
+    :return:
+    """
     ids = [x for x in os.listdir(models_dir) 
         if os.path.isdir(models_dir + os.sep + x)]
     if len(ids) > 0:
diff --git a/code/train.py b/code/train.py
index 0e6896c522ad2ca26d848a634f740d8d2afb008d..1283e90a3d05cf9bb5ee0a3b4777e41a09dcfa1c 100644
--- a/code/train.py
+++ b/code/train.py
@@ -4,7 +4,10 @@ from bolsonaro.models.model_factory import ModelFactory
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.trainer import Trainer
 from bolsonaro.utils import resolve_experiment_id
+from bolsonaro import LOG_PATH
+from bolsonaro.error_handling.logger_factory import LoggerFactory
 
+from dotenv import find_dotenv, load_dotenv
 import argparse
 import pathlib
 import random
@@ -13,17 +16,21 @@ import errno
 
 
 if __name__ == "__main__":
+    # get environment variables in .env
+    load_dotenv(find_dotenv('.env'))
+
     default_dataset_name = 'boston'
-    default_normalize = False
+    default_normalize = True
     default_forest_size = 100
     default_extracted_forest_size = 10
-    default_models_dir = 'models'
+    # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
+    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
     default_dev_size = 0.2
     default_test_size = 0.2
-    default_use_random_seed = True
     default_random_seed_number = 1
     begin_random_seed_range = 1
     end_random_seed_range = 2000
+    default_train_on_subset = 'train'
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
@@ -31,29 +38,34 @@ if __name__ == "__main__":
     parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
     parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
-    parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio')
-    parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio')
-    parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Random seed used for the data split')
-    parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used')
+    parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio.')
+    parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio.')
+    parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used.')
+    parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly.')
+    parser.add_argument('--train_on_subset', nargs='?', type=str, default=default_train_on_subset, help='Specify which subset the model will be trained on (either train or dev).')
     args = parser.parse_args()
 
     pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)
 
+    logger = LoggerFactory.create(LOG_PATH, __name__)
+
     args.extracted_forest_size = args.extracted_forest_size \
         if type(args.extracted_forest_size) == list \
         else [args.extracted_forest_size]
 
-    random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \
-        for i in range(args.random_seed_number)] \
-        if args.use_random_seed else None
+    if args.seeds != None and args.random_seed_number > 1:
+        logger.warn('seeds and random_seed_number parameters are both specified. Seeds will be used.')    
+    seeds = args.seeds if args.seeds is not None \
+        else [random.randint(begin_random_seed_range, end_random_seed_range) \
+        for i in range(args.random_seed_number)]
 
     experiment_id = resolve_experiment_id(args.models_dir)
     experiment_id_str = str(experiment_id)
 
-    for random_seed in random_seeds:
-        random_seed_str = str(random_seed)
+    for seed in seeds:
+        seed_str = str(seed)
         models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
-            os.sep + random_seed_str
+            os.sep + seed_str
         try:
             os.makedirs(models_dir)
         except OSError as e:
@@ -64,8 +76,9 @@ if __name__ == "__main__":
             name=args.dataset_name,
             test_size=args.test_size,
             dev_size=args.dev_size,
-            random_state=random_seed,
-            normalize=args.normalize
+            random_state=seed,
+            normalize=args.normalize,
+            train_on_subset=args.train_on_subset
         )
         dataset_parameters.save(models_dir, experiment_id_str)
 
@@ -84,10 +97,14 @@ if __name__ == "__main__":
             model_parameters = ModelParameters(
                 forest_size=args.forest_size,
                 extracted_forest_size=extracted_forest_size,
-                seed=random_seed
+                seed=seed,
+                normalize=args.normalize
             )
             model_parameters.save(sub_models_dir, experiment_id)
 
             model = ModelFactory.build(dataset.task, model_parameters)
 
-            trainer.iterate(model, sub_models_dir)
+            trainer.train(model, sub_models_dir)
+
+            logger.info('Error on test set: {}'.format(model.score(dataset.X_test, dataset.y_test)))
+            logger.info('Accuracy on test set: {}'.format(model.score_regressor(dataset.X_test, dataset.y_test)))