Skip to content
Snippets Groups Projects
Commit f3513289 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Merge branch 'luc_manage_normalization' into 'wip_clean_scripts'

Luc manage normalization

See merge request !2
parents 96e26620 3c6dc3e5
No related branches found
No related tags found
2 merge requests!3clean scripts,!2Luc manage normalization
# Environment variables go here, can be read by `python-dotenv` package:
#
# `src/script.py`
# ----------------------------------------------------------------
# import dotenv
#
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
# dotenv_path = os.path.join(project_dir, '.env')
# dotenv.load_dotenv(dotenv_path)
# ----------------------------------------------------------------
project_dir = "."
\ No newline at end of file
models/*
*/.kile/* */.kile/*
*.kilepr *.kilepr
# Byte-compiled / optimized / DLL files # Byte-compiled / optimized / DLL files
......
...@@ -49,5 +49,16 @@ Project Organization ...@@ -49,5 +49,16 @@ Project Organization
Install project
-------------- --------------
First install the project package:
pip install -r requirements.txt pip install -r requirements.txt
Then create a file `.env` by copying the file `.env.example`:
cp .env.example .env
Then you must set the project directory in the `.env` file:
project_dir = "path/to/your/project/directory"
This directory will be used for storing the model parameters.
\ No newline at end of file
...@@ -71,7 +71,7 @@ class DatasetLoader(object): ...@@ -71,7 +71,7 @@ class DatasetLoader(object):
test_size=dataset_parameters.dev_size, test_size=dataset_parameters.dev_size,
random_state=dataset_parameters.random_state) random_state=dataset_parameters.random_state)
# TODO # TODO?
if dataset_parameters.normalize: if dataset_parameters.normalize:
pass pass
......
...@@ -4,12 +4,13 @@ import os ...@@ -4,12 +4,13 @@ import os
class DatasetParameters(object): class DatasetParameters(object):
def __init__(self, name, test_size, dev_size, random_state, normalize): def __init__(self, name, test_size, dev_size, random_state, normalize, train_on_subset):
self._name = name self._name = name
self._test_size = test_size self._test_size = test_size
self._dev_size = dev_size self._dev_size = dev_size
self._random_state = random_state self._random_state = random_state
self._normalize = normalize self._normalize = normalize
self._train_on_subset = train_on_subset
@property @property
def name(self): def name(self):
...@@ -31,6 +32,10 @@ class DatasetParameters(object): ...@@ -31,6 +32,10 @@ class DatasetParameters(object):
def normalize(self): def normalize(self):
return self._normalize return self._normalize
@property
def train_on_subset(self):
return self._train_on_subset
def save(self, directory_path, experiment_id): def save(self, directory_path, experiment_id):
with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file: with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file:
json.dump({ json.dump({
...@@ -38,7 +43,8 @@ class DatasetParameters(object): ...@@ -38,7 +43,8 @@ class DatasetParameters(object):
'test_size': self._test_size, 'test_size': self._test_size,
'dev_size': self._dev_size, 'dev_size': self._dev_size,
'random_state': self._random_state, 'random_state': self._random_state,
'normalize': self._normalize 'normalize': self._normalize,
'train_on_subset': self._train_on_subset
}, },
output_file, output_file,
indent=4) indent=4)
...@@ -4,10 +4,11 @@ import os ...@@ -4,10 +4,11 @@ import os
class ModelParameters(object): class ModelParameters(object):
def __init__(self, forest_size, extracted_forest_size, seed=None): def __init__(self, forest_size, extracted_forest_size, normalize, seed=None):
self._forest_size = forest_size self._forest_size = forest_size
self._extracted_forest_size = extracted_forest_size self._extracted_forest_size = extracted_forest_size
self._seed = seed self._seed = seed
self._normalize = normalize
@property @property
def forest_size(self): def forest_size(self):
...@@ -21,12 +22,17 @@ class ModelParameters(object): ...@@ -21,12 +22,17 @@ class ModelParameters(object):
def seed(self): def seed(self):
return self._seed return self._seed
@property
def normalize(self):
return self._normalize
def save(self, directory_path, experiment_id): def save(self, directory_path, experiment_id):
with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file: with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file:
json.dump({ json.dump({
'forest_size': self._forest_size, 'forest_size': self._forest_size,
'extracted_forest_size': self._extracted_forest_size, 'extracted_forest_size': self._extracted_forest_size,
'seed': self._seed 'seed': self._seed,
'normalize': self._normalize
}, },
output_file, output_file,
indent=4) indent=4)
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
import numpy as np
class OmpForestRegressor(BaseEstimator): class OmpForestRegressor(BaseEstimator):
...@@ -9,6 +13,7 @@ class OmpForestRegressor(BaseEstimator): ...@@ -9,6 +13,7 @@ class OmpForestRegressor(BaseEstimator):
self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size, self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
random_state=models_parameters.seed) random_state=models_parameters.seed)
self._models_parameters = models_parameters self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__)
def fit(self, X_train, y_train): def fit(self, X_train, y_train):
self._forest = self._train_forest(X_train, y_train) self._forest = self._train_forest(X_train, y_train)
...@@ -29,16 +34,78 @@ class OmpForestRegressor(BaseEstimator): ...@@ -29,16 +34,78 @@ class OmpForestRegressor(BaseEstimator):
def models_parameters(self): def models_parameters(self):
return self._models_parameters return self._models_parameters
def score_regressor(self, X, y):
    """Score the full (non-extracted) random forest on (X, y).

    Delegates to the underlying RandomForestRegressor's `score`
    (coefficient of determination R^2 in scikit-learn).

    :param X: (n_samples, n_features) array
    :param y: (n_samples,) array of targets
    :return: the regressor's score as a float
    """
    return self._regressor.score(X, y)
def _train_forest(self, X_train, y_train): def _train_forest(self, X_train, y_train):
self._regressor.fit(X_train, y_train) self._regressor.fit(X_train, y_train)
forest = self._regressor.estimators_ forest = self._regressor.estimators_
return forest return forest
def _extract_subforest(self, X_train, y_train):
    """
    Given an already estimated regressor: apply OMP to get the weight of each tree.

    The X_train data is used for interrogation of every tree in the forest. The y_train
    data is used for finding the weights in OMP.

    :param X_train: (n_sample, n_features) array
    :param y_train: (n_sample,) array
    :return: (n_trees,) array of tree weights found by OMP
    """
    self._logger.debug("Forest make prediction on X_train")
    # D: (n_samples, n_trees) matrix of per-tree predictions (the OMP dictionary)
    D = self._forest_prediction(X_train)

    if self._models_parameters.normalize:
        # question: maybe consider other kinds of normalization
        # Norms are kept on self so predict() can apply the same scaling.
        self._logger.debug("Compute norm of predicted vectors on X_train")
        self._forest_norms = np.linalg.norm(D, axis=0)
        D /= self._forest_norms

    omp = OrthogonalMatchingPursuit(
        n_nonzero_coefs=self._models_parameters.extracted_forest_size,
        fit_intercept=False, normalize=False)
    self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
                       .format(self._models_parameters.extracted_forest_size))
    omp.fit(D, y_train)
    weights = omp.coef_
    # question: why not to use directly the omp estimator instead of bypassing it using the coefs?
    return weights
def _forest_prediction(self, X):
    """Predict X with every tree of the fitted forest.

    :param X: (n_samples, n_features) array
    :return: (n_samples, n_trees) array — one column of predictions per tree
    """
    return np.array([tree.predict(X) for tree in self._forest]).T
def predict(self, X):
    """
    Apply the OMPForestRegressor to X.

    Combines the per-tree predictions with the OMP weights found at fit time.
    When normalization is enabled, reuses the norms computed on the training
    data (`self._forest_norms`), so `fit` must have run with the same setting.

    :param X: (n_samples, n_features) array
    :return: (n_samples,) array of predictions
    """
    D = self._forest_prediction(X)
    if self._models_parameters.normalize:
        D /= self._forest_norms
    predictions = D @ self.weights
    return predictions
def score(self, X, y, metric="mse"):
    """
    Evaluate OMPForestRegressor on (`X`, `y`) using `metric`.

    :param X: (n_samples, n_features) array
    :param y: (n_samples,) array of targets
    :param metric: evaluation metric name; only "mse" is supported
    :return: the metric value as a float
    :raises ValueError: if `metric` is not a known metric name
    """
    predictions = self.predict(X)
    if metric == "mse":
        evaluation = np.mean(np.square(predictions - y))
    else:
        # Bug fix: the original raised ValueError("Metric value {} is not known.")
        # without .format(metric), so the message never showed the bad value.
        raise ValueError("Metric value {} is not known.".format(metric))
    return evaluation
\ No newline at end of file
...@@ -13,12 +13,21 @@ class Trainer(object): ...@@ -13,12 +13,21 @@ class Trainer(object):
self._dataset = dataset self._dataset = dataset
self._logger = LoggerFactory.create(LOG_PATH, __name__) self._logger = LoggerFactory.create(LOG_PATH, __name__)
def train(self, model, models_dir):
    """Fit `model` on the configured subset and dump it to `models_dir`.

    The subset is chosen by the dataset's `train_on_subset` parameter
    ('train' or 'dev'); any other value raises ValueError.

    :param model: estimator exposing `fit(X, y)`
    :param models_dir: directory where the trained model is pickled
    :raises ValueError: if `train_on_subset` is neither 'train' nor 'dev'
    """
    self._logger.info('Training model using train set...')
    begin_time = time.time()
    if self._dataset.dataset_parameters.train_on_subset == 'train':
        X, y = self._dataset.X_train, self._dataset.y_train
    elif self._dataset.dataset_parameters.train_on_subset == 'dev':
        X, y = self._dataset.X_dev, self._dataset.y_dev
    else:
        raise ValueError("Unsupported train_on_subset value '{}'".format(self._dataset.dataset_parameters.train_on_subset))
    model.fit(X, y)
    end_time = time.time()
    self._dump_raw_results(models_dir, model, end_time, begin_time)
def _dump_raw_results(self, models_dir, model, end_time, begin_time):
output_file_path = models_dir + os.sep + 'model.pickle' output_file_path = models_dir + os.sep + 'model.pickle'
self._logger.info('Saving trained model to {}'.format(output_file_path)) self._logger.info('Saving trained model to {}'.format(output_file_path))
with open(output_file_path, 'wb') as output_file: with open(output_file_path, 'wb') as output_file:
......
...@@ -2,6 +2,14 @@ import os ...@@ -2,6 +2,14 @@ import os
def resolve_experiment_id(models_dir): def resolve_experiment_id(models_dir):
"""
Return the ID of the next experiment.
The ID is an int equal to n+1 where n is the current number of directory in `models_dir
`
:param models_dir:
:return:
"""
ids = [x for x in os.listdir(models_dir) ids = [x for x in os.listdir(models_dir)
if os.path.isdir(models_dir + os.sep + x)] if os.path.isdir(models_dir + os.sep + x)]
if len(ids) > 0: if len(ids) > 0:
......
...@@ -4,7 +4,10 @@ from bolsonaro.models.model_factory import ModelFactory ...@@ -4,7 +4,10 @@ from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.trainer import Trainer from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id from bolsonaro.utils import resolve_experiment_id
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from dotenv import find_dotenv, load_dotenv
import argparse import argparse
import pathlib import pathlib
import random import random
...@@ -13,17 +16,21 @@ import errno ...@@ -13,17 +16,21 @@ import errno
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv('.env.example'))
default_dataset_name = 'boston' default_dataset_name = 'boston'
default_normalize = False default_normalize = True
default_forest_size = 100 default_forest_size = 100
default_extracted_forest_size = 10 default_extracted_forest_size = 10
default_models_dir = 'models' # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
default_models_dir = os.environ["project_dir"] + os.sep + 'models'
default_dev_size = 0.2 default_dev_size = 0.2
default_test_size = 0.2 default_test_size = 0.2
default_use_random_seed = True
default_random_seed_number = 1 default_random_seed_number = 1
begin_random_seed_range = 1 begin_random_seed_range = 1
end_random_seed_range = 2000 end_random_seed_range = 2000
default_train_on_subset = 'train'
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
...@@ -31,29 +38,34 @@ if __name__ == "__main__": ...@@ -31,29 +38,34 @@ if __name__ == "__main__":
parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.') parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.') parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.') parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio') parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio.')
parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio') parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio.')
parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Random seed used for the data split') parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--train_on_subset', nargs='?', type=str, default=default_train_on_subset, help='Specify on witch subset the model will be trained (either train or dev).')
args = parser.parse_args() args = parser.parse_args()
pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)
logger = LoggerFactory.create(LOG_PATH, __name__)
args.extracted_forest_size = args.extracted_forest_size \ args.extracted_forest_size = args.extracted_forest_size \
if type(args.extracted_forest_size) == list \ if type(args.extracted_forest_size) == list \
else [args.extracted_forest_size] else [args.extracted_forest_size]
random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \ if args.seeds != None and args.random_seed_number > 1:
for i in range(args.random_seed_number)] \ logger.warn('seeds and random_seed_number parameters are both specified. Seeds will be used.')
if args.use_random_seed else None seeds = args.seeds if args.seeds is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(args.random_seed_number)]
experiment_id = resolve_experiment_id(args.models_dir) experiment_id = resolve_experiment_id(args.models_dir)
experiment_id_str = str(experiment_id) experiment_id_str = str(experiment_id)
for random_seed in random_seeds: for seed in seeds:
random_seed_str = str(random_seed) seed_str = str(seed)
models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \ models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
os.sep + random_seed_str os.sep + seed_str
try: try:
os.makedirs(models_dir) os.makedirs(models_dir)
except OSError as e: except OSError as e:
...@@ -64,8 +76,9 @@ if __name__ == "__main__": ...@@ -64,8 +76,9 @@ if __name__ == "__main__":
name=args.dataset_name, name=args.dataset_name,
test_size=args.test_size, test_size=args.test_size,
dev_size=args.dev_size, dev_size=args.dev_size,
random_state=random_seed, random_state=seed,
normalize=args.normalize normalize=args.normalize,
train_on_subset=args.train_on_subset
) )
dataset_parameters.save(models_dir, experiment_id_str) dataset_parameters.save(models_dir, experiment_id_str)
...@@ -84,10 +97,14 @@ if __name__ == "__main__": ...@@ -84,10 +97,14 @@ if __name__ == "__main__":
model_parameters = ModelParameters( model_parameters = ModelParameters(
forest_size=args.forest_size, forest_size=args.forest_size,
extracted_forest_size=extracted_forest_size, extracted_forest_size=extracted_forest_size,
seed=random_seed seed=seed,
normalize=args.normalize
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters) model = ModelFactory.build(dataset.task, model_parameters)
trainer.iterate(model, sub_models_dir) trainer.train(model, sub_models_dir)
logger.info('Error on test set: {}'.format(model.score(dataset.X_test, dataset.y_test)))
logger.info('Accuracy on test set: {}'.format(model.score_regressor(dataset.X_test, dataset.y_test)))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment