Commit 8b8eb9a5 authored by Charly LAMOTHE

- In compute_results, add load_dotenv, load the raw results, and update experiment_ids so that a list of experiment ids can be specified. The default behavior is to load all experiment ids;
- Fix the normalization option in train.py. By default it normalizes D; it does not when the wo_normalization option is specified;
- Fix logger.warn to logger.warning in train.py;
- Replace the result dumping in trainer.py with a dedicated class that saves and loads the trained model and its training metadata: model_raw_results.py;
- Rename the overly long DatasetLoader.load_from_name to DatasetLoader.load;
- Add loading functions to dataset_parameters and model_parameters;
- Set the console logging level to INFO so that only the most important logs reach the console;
- Add a load function to model_factory;
- In omp_forest_regressor, move private functions to the bottom of the file.

TODO: compute the plot from the loaded raw results in the compute_results file.
parent f3513289
Merge request !3: clean scripts
bolsonaro/data/dataset_loader.py
@@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split
 class DatasetLoader(object):
     @staticmethod
-    def load_from_name(dataset_parameters):
+    def load(dataset_parameters):
         name = dataset_parameters.name
         if name == 'boston':
             dataset_loading_func = load_boston
...
bolsonaro/data/dataset_parameters.py
@@ -48,3 +48,16 @@ class DatasetParameters(object):
             },
             output_file,
             indent=4)
+
+    @staticmethod
+    def load(directory_path, experiment_id):
+        with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'r') as input_file:
+            parameters = json.load(input_file)
+        return DatasetParameters(
+            name=parameters['name'],
+            test_size=parameters['test_size'],
+            dev_size=parameters['dev_size'],
+            random_state=parameters['random_state'],
+            normalize=parameters['normalize'],
+            train_on_subset=parameters['train_on_subset']
+        )
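
The new load is the mirror image of save; a minimal round-trip sketch (the parameter values are hypothetical, and the file is written to the current directory so the sketch runs as-is):

    from bolsonaro.data.dataset_parameters import DatasetParameters

    params = DatasetParameters(name='boston', test_size=0.2, dev_size=0.2,
                               random_state=42, normalize=True, train_on_subset='train')
    params.save('.', '1')                       # writes ./dataset_parameters_1.json
    restored = DatasetParameters.load('.', '1')
    assert restored.name == 'boston'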
bolsonaro/error_handling/logger_factory.py
@@ -50,7 +50,7 @@ class LoggerFactory(object):

         # Create console handler
         ch = logging.StreamHandler()
-        ch.setLevel(logging.DEBUG)
+        ch.setLevel(logging.INFO)

         # Create formatter
         formatter = logging.Formatter('%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(levelname)s - %(message)s')
...
bolsonaro/models/model_factory.py
 from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.data.task import Task
+from bolsonaro.models.model_parameters import ModelParameters
+
+import os
+import pickle

 class ModelFactory(object):
@@ -14,3 +18,11 @@ class ModelFactory(object):
         else:
             raise ValueError("Unsupported task '{}'".format(task))
         return model_func(model_parameters)
+
+    @staticmethod
+    def load(task, directory_path, experiment_id, model_raw_results):
+        model_parameters = ModelParameters.load(directory_path, experiment_id)
+        model = ModelFactory.build(task, model_parameters)
+        model.set_forest(model_raw_results.forest)
+        model.set_weights(model_raw_results.weights)
+        return model
bolsonaro/models/model_parameters.py
@@ -36,3 +36,14 @@ class ModelParameters(object):
             },
             output_file,
             indent=4)
+
+    @staticmethod
+    def load(directory_path, experiment_id):
+        with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'r') as input_file:
+            parameters = json.load(input_file)
+        return ModelParameters(
+            forest_size=parameters['forest_size'],
+            extracted_forest_size=parameters['extracted_forest_size'],
+            seed=parameters['seed'],
+            normalize=parameters['normalize']
+        )
bolsonaro/models/model_raw_results.py (new file)
+import pickle
+import os
+import datetime
+
+
+class ModelRawResults(object):
+
+    def __init__(self, forest, weights, training_time,
+                 datetime, train_score, dev_score, test_score,
+                 score_metric, train_score_regressor, dev_score_regressor,
+                 test_score_regressor):
+        self._forest = forest
+        self._weights = weights
+        self._training_time = training_time
+        self._datetime = datetime
+        self._train_score = train_score
+        self._dev_score = dev_score
+        self._test_score = test_score
+        self._score_metric = score_metric
+        self._train_score_regressor = train_score_regressor
+        self._dev_score_regressor = dev_score_regressor
+        self._test_score_regressor = test_score_regressor
+
+    @property
+    def forest(self):
+        return self._forest
+
+    @property
+    def weights(self):
+        return self._weights
+
+    @property
+    def training_time(self):
+        return self._training_time
+
+    @property
+    def datetime(self):
+        return self._datetime
+
+    @property
+    def train_score(self):
+        return self._train_score
+
+    @property
+    def dev_score(self):
+        return self._dev_score
+
+    @property
+    def test_score(self):
+        return self._test_score
+
+    @property
+    def score_metric(self):
+        return self._score_metric
+
+    @property
+    def train_score_regressor(self):
+        return self._train_score_regressor
+
+    @property
+    def dev_score_regressor(self):
+        return self._dev_score_regressor
+
+    @property
+    def test_score_regressor(self):
+        return self._test_score_regressor
+
+    @staticmethod
+    def save(models_dir, model, end_time, begin_time, dataset, logger):
+        output_file_path = models_dir + os.sep + 'model_raw_results.pickle'
+        logger.debug('Saving trained model and raw results to {}'.format(output_file_path))
+        with open(output_file_path, 'wb') as output_file:
+            pickle.dump({
+                'forest': model.forest,
+                'weights': model.weights,
+                'training_time': end_time - begin_time,
+                'datetime': datetime.datetime.now(),
+                'train_score': model.score(dataset.X_train, dataset.y_train),
+                'dev_score': model.score(dataset.X_dev, dataset.y_dev),
+                'test_score': model.score(dataset.X_test, dataset.y_test),
+                'score_metric': model.default_score_metric,
+                'train_score_regressor': model.score_regressor(dataset.X_train, dataset.y_train),
+                'dev_score_regressor': model.score_regressor(dataset.X_dev, dataset.y_dev),
+                'test_score_regressor': model.score_regressor(dataset.X_test, dataset.y_test)
+            }, output_file)
+
+    @staticmethod
+    def load(models_dir):
+        model_file_path = models_dir + os.sep + 'model_raw_results.pickle'
+        with open(model_file_path, 'rb') as input_file:
+            model_data = pickle.load(input_file)
+        return ModelRawResults(
+            forest=model_data['forest'],
+            weights=model_data['weights'],
+            training_time=model_data['training_time'],
+            datetime=model_data['datetime'],
+            train_score=model_data['train_score'],
+            dev_score=model_data['dev_score'],
+            test_score=model_data['test_score'],
+            score_metric=model_data['score_metric'],
+            train_score_regressor=model_data['train_score_regressor'],
+            dev_score_regressor=model_data['dev_score_regressor'],
+            test_score_regressor=model_data['test_score_regressor']
+        )
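
These two loaders plug into the new ModelFactory.load; a minimal sketch of restoring a trained model from disk (the path is hypothetical, and Task.REGRESSION assumes the Task enum exposes such a member):

    from bolsonaro.models.model_raw_results import ModelRawResults
    from bolsonaro.models.model_factory import ModelFactory
    from bolsonaro.data.task import Task

    path = 'models/1/seeds/42/extracted_forest_size/10'   # hypothetical layout
    raw_results = ModelRawResults.load(path)               # unpickles model_raw_results.pickle
    model = ModelFactory.load(Task.REGRESSION, path, '1', raw_results)
    print(raw_results.score_metric, raw_results.test_score)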
bolsonaro/models/omp_forest_regressor.py
@@ -9,72 +9,41 @@ import numpy as np

 class OmpForestRegressor(BaseEstimator):

+    default_score_metric = 'mse'
+
     def __init__(self, models_parameters):
         self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
                                                 random_state=models_parameters.seed)
         self._models_parameters = models_parameters
         self._logger = LoggerFactory.create(LOG_PATH, __name__)

-    def fit(self, X_train, y_train):
-        self._forest = self._train_forest(X_train, y_train)
-        self._weights = self._extract_subforest(X_train, y_train)
-        return self
-
     @property
     def forest(self):
         return self._forest

+    def set_forest(self, forest):
+        self._forest = forest
+        self._regressor.estimators_ = forest
+
     @property
     def weights(self):
         return self._weights

+    def set_weights(self, weights):
+        self._weights = weights
+
     @property
     def models_parameters(self):
         return self._models_parameters

+    def fit(self, X, y):
+        self._forest = self._train_forest(X, y)
+        self._weights = self._extract_subforest(X, y)
+        return self
+
     def score_regressor(self, X, y):
         return self._regressor.score(X, y)

-    def _train_forest(self, X_train, y_train):
-        self._regressor.fit(X_train, y_train)
-        forest = self._regressor.estimators_
-        return forest
-
-    def _extract_subforest(self, X_train, y_train):
-        """
-        Given an already estimated regressor: apply OMP to get the weight of each tree.
-        The X_train data is used for interrogation of every tree in the forest. The y_train
-        data is used for finding the weights in OMP.
-        :param X_train: (n_sample, n_features) array
-        :param y_train: (n_sample,) array
-        :return:
-        """
-        self._logger.debug("Forest makes prediction on X_train")
-        D = self._forest_prediction(X_train)
-        if self._models_parameters.normalize:
-            # question: maybe consider other kinds of normalization
-            self._logger.debug("Compute norm of predicted vectors on X_train")
-            self._forest_norms = np.linalg.norm(D, axis=0)
-            D /= self._forest_norms
-        omp = OrthogonalMatchingPursuit(
-            n_nonzero_coefs=self._models_parameters.extracted_forest_size,
-            fit_intercept=False, normalize=False)
-        self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees."
-                           .format(self._models_parameters.extracted_forest_size))
-        omp.fit(D, y_train)
-        weights = omp.coef_
-        # question: why not use the omp estimator directly instead of going through the coefs?
-        return weights
-
-    def _forest_prediction(self, X):
-        return np.array([tree.predict(X) for tree in self._forest]).T
-
     def predict(self, X):
         """
         Apply the OMPForestRegressor to X.
@@ -91,8 +60,7 @@ class OmpForestRegressor(BaseEstimator):
         return predictions

-
-    def score(self, X, y, metric="mse"):
+    def score(self, X, y, metric=default_score_metric):
         """
         Evaluate OMPForestRegressor on (`X`, `y`) using `metric`
@@ -103,9 +71,47 @@ class OmpForestRegressor(BaseEstimator):
         """
         predictions = self.predict(X)

-        if metric == "mse":
+        if metric == 'mse':
             evaluation = np.mean(np.square(predictions - y))
         else:
-            raise ValueError("Metric value {} is not known.")
+            raise ValueError("Unsupported metric '{}'.".format(metric))

-        return evaluation
\ No newline at end of file
+        return evaluation
+
+    def _train_forest(self, X, y):
+        self._regressor.fit(X, y)
+        forest = self._regressor.estimators_
+        return forest
+
+    def _extract_subforest(self, X, y):
+        """
+        Given an already estimated regressor: apply OMP to get the weight of each tree.
+        The X data is used for interrogation of every tree in the forest. The y data
+        is used for finding the weights in OMP.
+        :param X: (n_sample, n_features) array
+        :param y: (n_sample,) array
+        :return:
+        """
+        self._logger.debug("Forest makes prediction on X")
+        D = self._forest_prediction(X)
+        if self._models_parameters.normalize:
+            # question: maybe consider other kinds of normalization
+            self._logger.debug("Compute norm of predicted vectors on X")
+            self._forest_norms = np.linalg.norm(D, axis=0)
+            D /= self._forest_norms
+        omp = OrthogonalMatchingPursuit(
+            n_nonzero_coefs=self._models_parameters.extracted_forest_size,
+            fit_intercept=False, normalize=False)
+        self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees."
+                           .format(self._models_parameters.extracted_forest_size))
+        omp.fit(D, y)
+        weights = omp.coef_
+        # question: why not use the omp estimator directly instead of going through the coefs?
+        return weights
+
+    def _forest_prediction(self, X):
+        return np.array([tree.predict(X) for tree in self._forest]).T
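
To make the OMP step concrete outside the class, a self-contained sketch of the same idea on synthetic data (normalize=False on OrthogonalMatchingPursuit matches the scikit-learn version this repository targets; that argument was removed in scikit-learn 1.2):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import OrthogonalMatchingPursuit

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

    # D has one column per tree: that tree's predictions on X.
    D = np.array([tree.predict(X) for tree in forest.estimators_]).T
    D /= np.linalg.norm(D, axis=0)  # L2-normalize each column, as in _extract_subforest

    # OMP selects the 10 trees whose weighted combination best reproduces y.
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10, fit_intercept=False, normalize=False)
    omp.fit(D, y)
    print('selected trees:', np.flatnonzero(omp.coef_))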
trainer.py
+from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from . import LOG_PATH

-import pickle
-import os
 import time
-import datetime

 class Trainer(object):
@@ -14,26 +12,17 @@ class Trainer(object):
         self._logger = LoggerFactory.create(LOG_PATH, __name__)

     def train(self, model, models_dir):
-        self._logger.info('Training model using train set...')
+        self._logger.debug('Training model using train set...')
         begin_time = time.time()

-        if self._dataset.dataset_parameters.train_on_subset == 'train':
+        train_on_subset = self._dataset.dataset_parameters.train_on_subset
+        if train_on_subset == 'train':
             X, y = self._dataset.X_train, self._dataset.y_train
-        elif self._dataset.dataset_parameters.train_on_subset == 'dev':
+        elif train_on_subset == 'dev':
             X, y = self._dataset.X_dev, self._dataset.y_dev
         else:
-            raise ValueError("Unsupported train_on_subset value '{}'".format(self._dataset.dataset_parameters.train_on_subset))
+            raise ValueError("Unsupported train_on_subset value '{}'".format(train_on_subset))
+        self._logger.debug('Fitting on {} subset'.format(train_on_subset))

         model.fit(X, y)
         end_time = time.time()
-        self._dump_raw_results(models_dir, model, end_time, begin_time)
-
-    def _dump_raw_results(self, models_dir, model, end_time, begin_time):
-        output_file_path = models_dir + os.sep + 'model.pickle'
-        self._logger.info('Saving trained model to {}'.format(output_file_path))
-        with open(output_file_path, 'wb') as output_file:
-            pickle.dump({
-                'forest': model.forest,
-                'weights': model.weights,
-                'training_time': end_time - begin_time,
-                'datetime': datetime.datetime.now()
-            }, output_file)
+        ModelRawResults.save(models_dir, model, end_time, begin_time, self._dataset, self._logger)
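
End to end, persistence now flows through ModelRawResults.save; a minimal sketch of the call sequence under the new API (parameter values and the module path of Trainer are assumptions, and the target directory is assumed to exist):

    from bolsonaro.data.dataset_parameters import DatasetParameters
    from bolsonaro.data.dataset_loader import DatasetLoader
    from bolsonaro.models.model_parameters import ModelParameters
    from bolsonaro.models.model_factory import ModelFactory
    from bolsonaro.trainer import Trainer  # assumed module path

    dataset_parameters = DatasetParameters(name='boston', test_size=0.2, dev_size=0.2,
                                           random_state=42, normalize=True, train_on_subset='train')
    dataset = DatasetLoader.load(dataset_parameters)
    model_parameters = ModelParameters(forest_size=100, extracted_forest_size=10,
                                       seed=42, normalize=True)
    model = ModelFactory.build(dataset.task, model_parameters)
    trainer = Trainer(dataset)
    trainer.train(model, 'models/1/seeds/42/extracted_forest_size/10')  # writes model_raw_results.pickle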
compute_results.py
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.models.model_raw_results import ModelRawResults
+from bolsonaro.models.model_factory import ModelFactory
+
 import argparse
 import pathlib
+from dotenv import find_dotenv, load_dotenv
+import os

 if __name__ == "__main__":
-    default_results_dir = 'results'
-    default_models_dir = 'models'
-    default_experiment_id = -1
+    # get environment variables in .env
+    load_dotenv(find_dotenv('.env.example'))
+
+    default_results_dir = os.environ["project_dir"] + os.sep + 'results'
+    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
+    default_experiment_ids = None

     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
-    parser.add_argument('--experiment_id', nargs='?', type=int, default=default_experiment_id, help='Compute the results of a single experiment id')
+    parser.add_argument('--experiment_ids', nargs='+', type=int, default=default_experiment_ids, help='Compute the results of the specified experiment id(s)')
     args = parser.parse_args()

     pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)

-    if args.experiment_id == -1:
-        pass
-    else:
-        pass
+    experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \
+        if args.experiment_ids is not None \
+        else os.listdir(args.models_dir)
+
+    if experiments_ids is None or len(experiments_ids) == 0:
+        raise ValueError("No experiment id was found or specified.")
+
+    for experiment_id in experiments_ids:
+        experiment_id_path = args.models_dir + os.sep + experiment_id
+        experiment_seed_root_path = experiment_id_path + os.sep + 'seeds'
+        for seed in os.listdir(experiment_seed_root_path):
+            experiment_seed_path = experiment_seed_root_path + os.sep + seed
+            dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)
+            dataset = DatasetLoader.load(dataset_parameters)
+            extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size'
+            for extracted_forest_size in os.listdir(extracted_forest_size_root_path):
+                extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
+                model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+                model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
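
A possible shape for the remaining TODO, purely illustrative and not part of this commit: inside the seed loop above, the loaded raw results could be aggregated per extracted forest size before plotting, e.g.

    # Hypothetical aggregation for the future plot: test score per extracted forest size.
    scores = {}
    for extracted_forest_size in os.listdir(extracted_forest_size_root_path):
        extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
        scores[int(extracted_forest_size)] = ModelRawResults.load(extracted_forest_size_path).test_score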
train.py
@@ -21,6 +21,7 @@ if __name__ == "__main__":
     default_dataset_name = 'boston'
     default_normalize = True
+    default_wo_normalization = False
     default_forest_size = 100
     default_extracted_forest_size = 10
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
@@ -34,7 +35,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
-    parser.add_argument('--normalize', action='store_true', default=default_normalize, help='Normalize the data by doing the L2 division of the pred vectors.')
+    parser.add_argument('--wo_normalization', action='store_true', default=default_wo_normalization, help='Do not normalize the data by the L2 norm of the prediction vectors.')
     parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
     parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
@@ -47,22 +48,27 @@ if __name__ == "__main__":
     pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)

-    logger = LoggerFactory.create(LOG_PATH, __name__)
+    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))

     args.extracted_forest_size = args.extracted_forest_size \
         if type(args.extracted_forest_size) == list \
         else [args.extracted_forest_size]

     if args.seeds != None and args.random_seed_number > 1:
-        logger.warn('seeds and random_seed_number parameters are both specified. Seeds will be used.')
+        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')

     seeds = args.seeds if args.seeds is not None \
         else [random.randint(begin_random_seed_range, end_random_seed_range) \
         for i in range(args.random_seed_number)]

+    normalize = default_normalize and args.wo_normalization is False
+    logger.debug('normalize={}'.format(normalize))
+
     experiment_id = resolve_experiment_id(args.models_dir)
     experiment_id_str = str(experiment_id)

     for seed in seeds:
+        logger.debug('Seed={}'.format(seed))
         seed_str = str(seed)
         models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
             os.sep + seed_str
@@ -77,16 +83,17 @@ if __name__ == "__main__":
             test_size=args.test_size,
             dev_size=args.dev_size,
             random_state=seed,
-            normalize=args.normalize,
+            normalize=normalize,
             train_on_subset=args.train_on_subset
         )
         dataset_parameters.save(models_dir, experiment_id_str)
-        dataset = DatasetLoader.load_from_name(dataset_parameters)
+        dataset = DatasetLoader.load(dataset_parameters)

         trainer = Trainer(dataset)

         for extracted_forest_size in args.extracted_forest_size:
+            logger.debug('extracted_forest_size={}'.format(extracted_forest_size))
             sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
             try:
                 os.makedirs(sub_models_dir)
@@ -98,7 +105,7 @@ if __name__ == "__main__":
                 forest_size=args.forest_size,
                 extracted_forest_size=extracted_forest_size,
                 seed=seed,
-                normalize=args.normalize
+                normalize=normalize
             )
             model_parameters.save(sub_models_dir, experiment_id)
...