From 3c6dc3e5dd91fa627313deda1e6f2ec9256a8af0 Mon Sep 17 00:00:00 2001 From: Charly LAMOTHE <lamothe.c@intlocal.univ-amu.fr> Date: Mon, 4 Nov 2019 21:35:58 +0100 Subject: [PATCH] - Add train_on_subset option to specify on which subset the model will be trained (either train or dev); - find_dotenv() only working by specifying the example env on my machine? - Add the seeds option to specify the seed(s) to use, and remove the use_random_seed, because it's obv if random_seed_number is used; - Use a logger in train.py instead of prints. --- code/bolsonaro/data/dataset_loader.py | 2 +- code/bolsonaro/data/dataset_parameters.py | 10 +++- code/bolsonaro/models/omp_forest_regressor.py | 6 +-- code/bolsonaro/trainer.py | 15 ++++-- code/train.py | 47 ++++++++++--------- 5 files changed, 49 insertions(+), 31 deletions(-) diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py index 6ad4b1f..4ab7e3f 100644 --- a/code/bolsonaro/data/dataset_loader.py +++ b/code/bolsonaro/data/dataset_loader.py @@ -71,7 +71,7 @@ class DatasetLoader(object): test_size=dataset_parameters.dev_size, random_state=dataset_parameters.random_state) - # TODO + # TODO? 
if dataset_parameters.normalize: pass diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py index 556c960..f19dc90 100644 --- a/code/bolsonaro/data/dataset_parameters.py +++ b/code/bolsonaro/data/dataset_parameters.py @@ -4,12 +4,13 @@ import os class DatasetParameters(object): - def __init__(self, name, test_size, dev_size, random_state, normalize): + def __init__(self, name, test_size, dev_size, random_state, normalize, train_on_subset): self._name = name self._test_size = test_size self._dev_size = dev_size self._random_state = random_state self._normalize = normalize + self._train_on_subset = train_on_subset @property def name(self): @@ -31,6 +32,10 @@ class DatasetParameters(object): def normalize(self): return self._normalize + @property + def train_on_subset(self): + return self._train_on_subset + def save(self, directory_path, experiment_id): with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file: json.dump({ @@ -38,7 +43,8 @@ class DatasetParameters(object): 'test_size': self._test_size, 'dev_size': self._dev_size, 'random_state': self._random_state, - 'normalize': self._normalize + 'normalize': self._normalize, + 'train_on_subset': self._train_on_subset }, output_file, indent=4) diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index a095e14..8fe7dc9 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -1,11 +1,11 @@ +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory + from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import OrthogonalMatchingPursuit from sklearn.base import BaseEstimator import numpy as np -from bolsonaro import LOG_PATH -from bolsonaro.error_handling.logger_factory import LoggerFactory - class OmpForestRegressor(BaseEstimator): diff --git 
a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 0e239e6..3370ce7 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -13,14 +13,21 @@ class Trainer(object): self._dataset = dataset self._logger = LoggerFactory.create(LOG_PATH, __name__) - def iterate(self, model, models_dir): - # why is this function named iterate? + def train(self, model, models_dir): self._logger.info('Training model using train set...') begin_time = time.time() - # todo: OMP may be running with X_dev ou Y_dev - model.fit(self._dataset.X_train, self._dataset.y_train) + if self._dataset.dataset_parameters.train_on_subset == 'train': + X, y = self._dataset.X_train, self._dataset.y_train + elif self._dataset.dataset_parameters.train_on_subset == 'dev': + X, y = self._dataset.X_dev, self._dataset.y_dev + else: + raise ValueError("Unsupported train_on_subset value '{}'".format(self._dataset.dataset_parameters.train_on_subset)) + model.fit(X, y) end_time = time.time() + self._dump_raw_results(models_dir, model, end_time, begin_time) + + def _dump_raw_results(self, models_dir, model, end_time, begin_time): output_file_path = models_dir + os.sep + 'model.pickle' self._logger.info('Saving trained model to {}'.format(output_file_path)) with open(output_file_path, 'wb') as output_file: diff --git a/code/train.py b/code/train.py index 7b589a3..1283e90 100644 --- a/code/train.py +++ b/code/train.py @@ -1,11 +1,11 @@ -from dotenv import load_dotenv - from bolsonaro.data.dataset_parameters import DatasetParameters from bolsonaro.data.dataset_loader import DatasetLoader from bolsonaro.models.model_factory import ModelFactory from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.trainer import Trainer from bolsonaro.utils import resolve_experiment_id +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory from dotenv import find_dotenv, load_dotenv import argparse @@ -17,7 +17,7 @@ import errno if __name__ == 
"__main__": # get environment variables in .env - load_dotenv(find_dotenv()) + load_dotenv(find_dotenv('.env.example')) default_dataset_name = 'boston' default_normalize = True @@ -27,10 +27,10 @@ if __name__ == "__main__": default_models_dir = os.environ["project_dir"] + os.sep + 'models' default_dev_size = 0.2 default_test_size = 0.2 - default_use_random_seed = True default_random_seed_number = 1 begin_random_seed_range = 1 end_random_seed_range = 2000 + default_train_on_subset = 'train' parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') @@ -38,30 +38,34 @@ if __name__ == "__main__": parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.') parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.') parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.') - parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio') - parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio') - parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Random seed used for the data split') - parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used') + parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset 
ratio.') + parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio.') + parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used.') + parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly.') + parser.add_argument('--train_on_subset', nargs='?', type=str, default=default_train_on_subset, help='Specify on which subset the model will be trained (either train or dev).') args = parser.parse_args() pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True) + logger = LoggerFactory.create(LOG_PATH, __name__) + args.extracted_forest_size = args.extracted_forest_size \ if type(args.extracted_forest_size) == list \ else [args.extracted_forest_size] - # todo the seeds shouldn't be randomly generated but fixed in range instead. We want it to be reproducible: exact same arguments should return exact same results. - random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \ for i in range(args.random_seed_number)] \ if args.use_random_seed else None + if args.seeds != None and args.random_seed_number > 1: + logger.warn('seeds and random_seed_number parameters are both specified. 
Seeds will be used.') + seeds = args.seeds if args.seeds is not None \ + else [random.randint(begin_random_seed_range, end_random_seed_range) \ + for i in range(args.random_seed_number)] experiment_id = resolve_experiment_id(args.models_dir) experiment_id_str = str(experiment_id) - for random_seed in random_seeds: - random_seed_str = str(random_seed) + for seed in seeds: + seed_str = str(seed) models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \ - os.sep + random_seed_str + os.sep + seed_str try: os.makedirs(models_dir) except OSError as e: @@ -72,8 +76,9 @@ if __name__ == "__main__": name=args.dataset_name, test_size=args.test_size, dev_size=args.dev_size, - random_state=random_seed, - normalize=args.normalize + random_state=seed, + normalize=args.normalize, + train_on_subset=args.train_on_subset ) dataset_parameters.save(models_dir, experiment_id_str) @@ -92,14 +97,14 @@ if __name__ == "__main__": model_parameters = ModelParameters( forest_size=args.forest_size, extracted_forest_size=extracted_forest_size, - seed=random_seed, + seed=seed, normalize=args.normalize ) model_parameters.save(sub_models_dir, experiment_id) model = ModelFactory.build(dataset.task, model_parameters) - trainer.iterate(model, sub_models_dir) + trainer.train(model, sub_models_dir) - print(model.score(dataset.X_test, dataset.y_test)) - print(model.score_regressor(dataset.X_test, dataset.y_test)) \ No newline at end of file + logger.info('Error on test set: {}'.format(model.score(dataset.X_test, dataset.y_test))) + logger.info('Accuracy on test set: {}'.format(model.score_regressor(dataset.X_test, dataset.y_test))) -- GitLab