Commit 8b8eb9a5 authored by Charly LAMOTHE

- In compute_results, add .env loading, load the raw results, and update experiment_ids so that a list of experiment ids can be specified. The default behavior is to load all experiment ids (see the sketch after the TODO below);
- Fix the normalization option in train.py. By default D is normalized; normalization is skipped when the wo_normalization option is specified;
- Replace deprecated logger.warn with logger.warning in train.py;
- Replace the result dumping in trainer.py with a dedicated class, model_raw_results.py, that saves and loads the trained model and training metadata;
- Rename the overly long DatasetLoader.load_from_name to DatasetLoader.load;
- Add load functions to dataset_parameters and model_parameters;
- Set the console logging level to INFO so that only the most important messages reach the console;
- Add a load function to model_factory;
- In omp_forest_regressor, move private functions to the bottom of the file.

TODO: compute the plots from the loaded raw results in the compute_results file.
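
For orientation, a minimal sketch (not part of the commit; the experiment id, seed and extracted forest size values are illustrative) of how the new load functions chain together, mirroring what compute_results now does:

    import os
    from bolsonaro.data.dataset_parameters import DatasetParameters
    from bolsonaro.data.dataset_loader import DatasetLoader
    from bolsonaro.models.model_raw_results import ModelRawResults
    from bolsonaro.models.model_factory import ModelFactory

    experiment_id = '1'  # illustrative id; train.py creates one directory per experiment
    seed_path = os.path.join('models', experiment_id, 'seeds', '42')

    # dataset_parameters_{experiment_id}.json was written by train.py next to the models
    dataset_parameters = DatasetParameters.load(seed_path, experiment_id)
    dataset = DatasetLoader.load(dataset_parameters)

    size_path = os.path.join(seed_path, 'extracted_forest_size', '10')
    model_raw_results = ModelRawResults.load(size_path)  # pickled forest, weights, scores
    model = ModelFactory.load(dataset.task, size_path, experiment_id, model_raw_results)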
parent f3513289
1 merge request: !3 clean scripts
@@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split
class DatasetLoader(object):
    @staticmethod
-    def load_from_name(dataset_parameters):
+    def load(dataset_parameters):
        name = dataset_parameters.name
        if name == 'boston':
            dataset_loading_func = load_boston
@@ -48,3 +48,16 @@ class DatasetParameters(object):
            },
            output_file,
            indent=4)

    @staticmethod
    def load(directory_path, experiment_id):
        with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'r') as input_file:
            parameters = json.load(input_file)
        return DatasetParameters(
            name=parameters['name'],
            test_size=parameters['test_size'],
            dev_size=parameters['dev_size'],
            random_state=parameters['random_state'],
            normalize=parameters['normalize'],
            train_on_subset=parameters['train_on_subset']
        )
@@ -50,7 +50,7 @@ class LoggerFactory(object):
        # Create console handler
        ch = logging.StreamHandler()
-        ch.setLevel(logging.DEBUG)
+        ch.setLevel(logging.INFO)

        # Create formatter
        formatter = logging.Formatter('%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(levelname)s - %(message)s')
from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters
import os
import pickle
class ModelFactory(object):
@@ -14,3 +18,11 @@ class ModelFactory(object):
        else:
            raise ValueError("Unsupported task '{}'".format(task))

        return model_func(model_parameters)

    @staticmethod
    def load(task, directory_path, experiment_id, model_raw_results):
        model_parameters = ModelParameters.load(directory_path, experiment_id)
        model = ModelFactory.build(task, model_parameters)
        model.set_forest(model_raw_results.forest)
        model.set_weights(model_raw_results.weights)
        return model
@@ -36,3 +36,14 @@ class ModelParameters(object):
            },
            output_file,
            indent=4)

    @staticmethod
    def load(directory_path, experiment_id):
        with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'r') as input_file:
            parameters = json.load(input_file)
        return ModelParameters(
            forest_size=parameters['forest_size'],
            extracted_forest_size=parameters['extracted_forest_size'],
            seed=parameters['seed'],
            normalize=parameters['normalize']
        )
import pickle
import os
import datetime


class ModelRawResults(object):

    def __init__(self, forest, weights, training_time,
                 datetime, train_score, dev_score, test_score,
                 score_metric, train_score_regressor, dev_score_regressor,
                 test_score_regressor):
        self._forest = forest
        self._weights = weights
        self._training_time = training_time
        self._datetime = datetime
        self._train_score = train_score
        self._dev_score = dev_score
        self._test_score = test_score
        self._score_metric = score_metric
        self._train_score_regressor = train_score_regressor
        self._dev_score_regressor = dev_score_regressor
        self._test_score_regressor = test_score_regressor

    @property
    def forest(self):
        return self._forest

    @property
    def weights(self):
        return self._weights

    @property
    def training_time(self):
        return self._training_time

    @property
    def datetime(self):
        return self._datetime

    @property
    def train_score(self):
        return self._train_score

    @property
    def dev_score(self):
        return self._dev_score

    @property
    def test_score(self):
        return self._test_score

    @property
    def score_metric(self):
        return self._score_metric

    @property
    def train_score_regressor(self):
        return self._train_score_regressor

    @property
    def dev_score_regressor(self):
        return self._dev_score_regressor

    @property
    def test_score_regressor(self):
        return self._test_score_regressor

    @staticmethod
    def save(models_dir, model, end_time, begin_time, dataset, logger):
        output_file_path = models_dir + os.sep + 'model_raw_results.pickle'
        logger.debug('Saving trained model and raw results to {}'.format(output_file_path))
        with open(output_file_path, 'wb') as output_file:
            pickle.dump({
                'forest': model.forest,
                'weights': model.weights,
                'training_time': end_time - begin_time,
                'datetime': datetime.datetime.now(),
                'train_score': model.score(dataset.X_train, dataset.y_train),
                'dev_score': model.score(dataset.X_dev, dataset.y_dev),
                'test_score': model.score(dataset.X_test, dataset.y_test),
                'score_metric': model.default_score_metric,
                'train_score_regressor': model.score_regressor(dataset.X_train, dataset.y_train),
                'dev_score_regressor': model.score_regressor(dataset.X_dev, dataset.y_dev),
                'test_score_regressor': model.score_regressor(dataset.X_test, dataset.y_test)
            }, output_file)

    @staticmethod
    def load(models_dir):
        model_file_path = models_dir + os.sep + 'model_raw_results.pickle'
        with open(model_file_path, 'rb') as input_file:
            model_data = pickle.load(input_file)
        return ModelRawResults(
            forest=model_data['forest'],
            weights=model_data['weights'],
            training_time=model_data['training_time'],
            datetime=model_data['datetime'],
            train_score=model_data['train_score'],
            dev_score=model_data['dev_score'],
            test_score=model_data['test_score'],
            score_metric=model_data['score_metric'],
            train_score_regressor=model_data['train_score_regressor'],
            dev_score_regressor=model_data['dev_score_regressor'],
            test_score_regressor=model_data['test_score_regressor']
        )
@@ -9,72 +9,41 @@ import numpy as np
class OmpForestRegressor(BaseEstimator):

    default_score_metric = 'mse'

    def __init__(self, models_parameters):
        self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
                                                 random_state=models_parameters.seed)
        self._models_parameters = models_parameters
        self._logger = LoggerFactory.create(LOG_PATH, __name__)
-    def fit(self, X_train, y_train):
-        self._forest = self._train_forest(X_train, y_train)
-        self._weights = self._extract_subforest(X_train, y_train)
-        return self

    @property
    def forest(self):
        return self._forest

    def set_forest(self, forest):
        self._forest = forest
        self._regressor.estimators_ = forest

    @property
    def weights(self):
        return self._weights

    def set_weights(self, weights):
        self._weights = weights

    @property
    def models_parameters(self):
        return self._models_parameters

+    def fit(self, X, y):
+        self._forest = self._train_forest(X, y)
+        self._weights = self._extract_subforest(X, y)
+        return self

    def score_regressor(self, X, y):
        return self._regressor.score(X, y)
-    def _train_forest(self, X_train, y_train):
-        self._regressor.fit(X_train, y_train)
-        forest = self._regressor.estimators_
-        return forest
-
-    def _extract_subforest(self, X_train, y_train):
-        """
-        Given an already estimated regressor: apply OMP to get the weight of each tree.
-        The X_train data is used for interrogation of every tree in the forest. The y_train data
-        is used for finding the weights in OMP.
-        :param X_train: (n_sample, n_features) array
-        :param y_train: (n_sample,) array
-        :return:
-        """
-        self._logger.debug("Forest make prediction on X_train")
-        D = self._forest_prediction(X_train)
-        if self._models_parameters.normalize:
-            # question: maybe consider other kinds of normalization
-            self._logger.debug("Compute norm of predicted vectors on X_train")
-            self._forest_norms = np.linalg.norm(D, axis=0)
-            D /= self._forest_norms
-        omp = OrthogonalMatchingPursuit(
-            n_nonzero_coefs=self._models_parameters.extracted_forest_size,
-            fit_intercept=False, normalize=False)
-        self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
-            .format(self._models_parameters.extracted_forest_size))
-        omp.fit(D, y_train)
-        weights = omp.coef_
-        # question: why not to use directly the omp estimator instead of bypassing it using the coefs?
-        return weights
-
-    def _forest_prediction(self, X):
-        return np.array([tree.predict(X) for tree in self._forest]).T
    def predict(self, X):
        """
        Apply the OMPForestRegressor to X.
@@ -91,8 +60,7 @@ class OmpForestRegressor(BaseEstimator):
        return predictions

-    def score(self, X, y, metric="mse"):
+    def score(self, X, y, metric=default_score_metric):
        """
        Evaluate OMPForestRegressor on (`X`, `y`) using `metric`
@@ -103,9 +71,47 @@
        """
        predictions = self.predict(X)

-        if metric == "mse":
+        if metric == 'mse':
            evaluation = np.mean(np.square(predictions - y))
        else:
-            raise ValueError("Metric value {} is not known.")
+            raise ValueError("Unsupported metric '{}'.".format(metric))

        return evaluation
    def _train_forest(self, X, y):
        self._regressor.fit(X, y)
        forest = self._regressor.estimators_
        return forest

    def _extract_subforest(self, X, y):
        """
        Given an already estimated regressor, apply OMP to get the weight of each tree.

        The X data is used for interrogation of every tree in the forest. The y data
        is used for finding the weights in OMP.

        :param X: (n_sample, n_features) array
        :param y: (n_sample,) array
        :return:
        """
        self._logger.debug("Forest makes predictions on X")
        D = self._forest_prediction(X)

        if self._models_parameters.normalize:
            # question: maybe consider other kinds of normalization
            self._logger.debug("Compute norms of predicted vectors on X")
            self._forest_norms = np.linalg.norm(D, axis=0)
            D /= self._forest_norms

        omp = OrthogonalMatchingPursuit(
            n_nonzero_coefs=self._models_parameters.extracted_forest_size,
            fit_intercept=False, normalize=False)
        self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees."
                           .format(self._models_parameters.extracted_forest_size))
        omp.fit(D, y)
        weights = omp.coef_
        # question: why not use the omp estimator directly instead of bypassing it through the coefficients?

        return weights

    def _forest_prediction(self, X):
        return np.array([tree.predict(X) for tree in self._forest]).T
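
For reference, a self-contained sketch (illustration only, not part of the commit) of the weight-extraction step that _extract_subforest implements above, run on synthetic data. The repository code also passes normalize=False to OrthogonalMatchingPursuit; that keyword was removed in recent scikit-learn versions, so it is omitted here:

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import OrthogonalMatchingPursuit

    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

    # D[:, i] holds tree i's predictions on X; shape (n_samples, n_trees)
    D = np.array([tree.predict(X) for tree in forest.estimators_]).T
    D /= np.linalg.norm(D, axis=0)  # per-tree L2 normalization (the 'normalize' option)

    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10, fit_intercept=False)
    omp.fit(D, y)
    weights = omp.coef_  # non-zero entries mark the extracted trees
    print((weights != 0).sum())  # at most 10 trees are selected
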
from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.error_handling.logger_factory import LoggerFactory
from . import LOG_PATH
import pickle
import os
import time
import datetime
class Trainer(object):
@@ -14,26 +12,17 @@ class Trainer(object):
        self._logger = LoggerFactory.create(LOG_PATH, __name__)

    def train(self, model, models_dir):
-        self._logger.info('Training model using train set...')
+        self._logger.debug('Training model using train set...')
        begin_time = time.time()

-        if self._dataset.dataset_parameters.train_on_subset == 'train':
+        train_on_subset = self._dataset.dataset_parameters.train_on_subset
+        if train_on_subset == 'train':
            X, y = self._dataset.X_train, self._dataset.y_train
-        elif self._dataset.dataset_parameters.train_on_subset == 'dev':
+        elif train_on_subset == 'dev':
            X, y = self._dataset.X_dev, self._dataset.y_dev
        else:
-            raise ValueError("Unsupported train_on_subset value '{}'".format(self._dataset.dataset_parameters.train_on_subset))
+            raise ValueError("Unsupported train_on_subset value '{}'".format(train_on_subset))
+        self._logger.debug('Fitting on {} subset'.format(train_on_subset))

        model.fit(X, y)
        end_time = time.time()
-        self._dump_raw_results(models_dir, model, end_time, begin_time)
-
-    def _dump_raw_results(self, models_dir, model, end_time, begin_time):
-        output_file_path = models_dir + os.sep + 'model.pickle'
-        self._logger.info('Saving trained model to {}'.format(output_file_path))
-        with open(output_file_path, 'wb') as output_file:
-            pickle.dump({
-                'forest': model.forest,
-                'weights': model.weights,
-                'training_time': end_time - begin_time,
-                'datetime': datetime.datetime.now()
-            }, output_file)
+        ModelRawResults.save(models_dir, model, end_time, begin_time, self._dataset, self._logger)
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.model_factory import ModelFactory
import argparse
import pathlib
from dotenv import find_dotenv, load_dotenv
import os
if __name__ == "__main__":
-    default_results_dir = 'results'
-    default_models_dir = 'models'
-    default_experiment_id = -1
+    # get environment variables in .env
+    load_dotenv(find_dotenv('.env.example'))
+    default_results_dir = os.environ["project_dir"] + os.sep + 'results'
+    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
+    default_experiment_ids = None

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
-    parser.add_argument('--experiment_id', nargs='?', type=int, default=default_experiment_id, help='Compute the results of a single experiment id')
+    parser.add_argument('--experiment_ids', nargs='+', type=int, default=default_experiment_ids, help='Compute the results of the specified experiment id(s)')
    args = parser.parse_args()

    pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)

-    if args.experiment_id == -1:
-        pass
-    else:
-        pass
+    experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \
+        if args.experiment_ids is not None \
+        else os.listdir(args.models_dir)
+
+    if experiments_ids is None or len(experiments_ids) == 0:
+        raise ValueError("No experiment id was found or specified.")
+
+    for experiment_id in experiments_ids:
+        experiment_id_path = args.models_dir + os.sep + experiment_id
+        experiment_seed_root_path = experiment_id_path + os.sep + 'seeds'
+        for seed in os.listdir(experiment_seed_root_path):
+            experiment_seed_path = experiment_seed_root_path + os.sep + seed
+            dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)
+            dataset = DatasetLoader.load(dataset_parameters)
+            extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size'
+            for extracted_forest_size in os.listdir(extracted_forest_size_root_path):
+                extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
+                model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+                model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
@@ -21,6 +21,7 @@ if __name__ == "__main__":
    default_dataset_name = 'boston'
    default_normalize = True
+    default_wo_normalization = False
    default_forest_size = 100
    default_extracted_forest_size = 10
    # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
@@ -34,7 +35,7 @@
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize', action='store_true', default=default_normalize, help='Normalize the data by doing the L2 division of the pred vectors.')
    parser.add_argument('--wo_normalization', action='store_true', default=default_wo_normalization, help='Without normalizing the data by doing the L2 division of the pred vectors.')
    parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
@@ -47,22 +48,27 @@
    pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)

-    logger = LoggerFactory.create(LOG_PATH, __name__)
+    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))

    args.extracted_forest_size = args.extracted_forest_size \
        if type(args.extracted_forest_size) == list \
        else [args.extracted_forest_size]

    if args.seeds != None and args.random_seed_number > 1:
-        logger.warn('seeds and random_seed_number parameters are both specified. Seeds will be used.')
+        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')

    seeds = args.seeds if args.seeds is not None \
        else [random.randint(begin_random_seed_range, end_random_seed_range) \
        for i in range(args.random_seed_number)]

    normalize = default_normalize and args.wo_normalization is False
    logger.debug('normalize={}'.format(normalize))

    experiment_id = resolve_experiment_id(args.models_dir)
    experiment_id_str = str(experiment_id)

    for seed in seeds:
        logger.debug('Seed={}'.format(seed))
        seed_str = str(seed)
        models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
            os.sep + seed_str
@@ -77,16 +83,17 @@
            test_size=args.test_size,
            dev_size=args.dev_size,
            random_state=seed,
-            normalize=args.normalize,
+            normalize=normalize,
            train_on_subset=args.train_on_subset
        )
        dataset_parameters.save(models_dir, experiment_id_str)

-        dataset = DatasetLoader.load_from_name(dataset_parameters)
+        dataset = DatasetLoader.load(dataset_parameters)
        trainer = Trainer(dataset)

        for extracted_forest_size in args.extracted_forest_size:
            logger.debug('extracted_forest_size={}'.format(extracted_forest_size))
            sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
            try:
                os.makedirs(sub_models_dir)
@@ -98,7 +105,7 @@
                forest_size=args.forest_size,
                extracted_forest_size=extracted_forest_size,
                seed=seed,
-                normalize=args.normalize
+                normalize=normalize
            )
            model_parameters.save(sub_models_dir, experiment_id)
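
Assuming train.py is invoked directly (the script path is an assumption; only flags visible in this diff are shown), a run with normalization disabled might look like:

    python train.py --dataset_name boston --forest_size 100 --extracted_forest_size 5 10 20 --wo_normalization

Omitting --wo_normalization keeps the default behavior of L2-normalizing D before applying OMP.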