From cb0030d803676d6d5757026c3b4843a709bbdf60 Mon Sep 17 00:00:00 2001 From: Charly LAMOTHE <lamothe.c@intlocal.univ-amu.fr> Date: Sat, 9 Nov 2019 16:54:06 +0100 Subject: [PATCH] - Compute each computation of a given seed in a dedicated job; - Use as much CPU as possible when training a random forest regressor. --- code/bolsonaro/models/omp_forest_regressor.py | 2 +- code/train.py | 96 ++++++++++--------- requirements.txt | 1 - 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index b9abfa5..65193e1 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -13,7 +13,7 @@ class OmpForestRegressor(BaseEstimator): def __init__(self, models_parameters): self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size, - random_state=models_parameters.seed) + random_state=models_parameters.seed, n_jobs=-1) self._models_parameters = models_parameters self._logger = LoggerFactory.create(LOG_PATH, __name__) diff --git a/code/train.py b/code/train.py index eca2594..2d3264d 100644 --- a/code/train.py +++ b/code/train.py @@ -12,9 +12,53 @@ import argparse import pathlib import random import os -from tqdm import tqdm +from concurrent import futures +import threading +def process_job(seed, args, experiment_id): + logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format( + seed, threading.get_ident())) + logger.info('seed={}'.format(seed)) + seed_str = str(seed) + experiment_id_str = str(experiment_id) + models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \ + os.sep + seed_str + pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True) + + dataset_parameters = DatasetParameters( + name=args.dataset_name, + test_size=args.test_size, + dev_size=args.dev_size, + random_state=seed, + dataset_normalizer=args.dataset_normalizer + ) + 
dataset_parameters.save(models_dir, experiment_id_str) + + dataset = DatasetLoader.load(dataset_parameters) + + trainer = Trainer(dataset) + + for extracted_forest_size in args.extracted_forest_size: + logger.info('extracted_forest_size={}'.format(extracted_forest_size)) + sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) + pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) + + model_parameters = ModelParameters( + forest_size=args.forest_size, + extracted_forest_size=extracted_forest_size, + normalize_D=args.normalize_D, + subsets_used=args.subsets_used, + normalize_weights=args.normalize_weights, + seed=seed + ) + model_parameters.save(sub_models_dir, experiment_id) + + model = ModelFactory.build(dataset.task, model_parameters) + + trainer.train(model, sub_models_dir) + logger.info('Training done') + if __name__ == "__main__": # get environment variables in .env load_dotenv(find_dotenv('.env.example')) @@ -30,7 +74,6 @@ if __name__ == "__main__": DEFAULT_TEST_SIZE = 0.2 DEFAULT_RANDOM_SEED_NUMBER = 1 DEFAULT_SUBSETS_USED = 'train,dev' - DEFAULT_DISABLE_PROGRESS = False DEFAULT_normalize_weights = False begin_random_seed_range = 1 @@ -48,7 +91,6 @@ if __name__ == "__main__": parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. 
train,train+dev: forest on train+dev and OMP on dev.') - parser.add_argument('--disable_progress', action='store_true', default=DEFAULT_DISABLE_PROGRESS, help='Disable the progress bars.') parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_normalize_weights, help='Divide the predictions by the weights sum.') args = parser.parse_args() @@ -68,47 +110,9 @@ if __name__ == "__main__": for i in range(args.random_seed_number)] experiment_id = resolve_experiment_id(args.models_dir) - experiment_id_str = str(experiment_id) - logger.info('Experiment id: {}'.format(experiment_id_str)) - - with tqdm(seeds, disable=args.disable_progress) as seed_bar: - for seed in seed_bar: - seed_bar.set_description('seed={}'.format(seed)) - seed_str = str(seed) - models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \ - os.sep + seed_str - pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True) - - dataset_parameters = DatasetParameters( - name=args.dataset_name, - test_size=args.test_size, - dev_size=args.dev_size, - random_state=seed, - dataset_normalizer=args.dataset_normalizer - ) - dataset_parameters.save(models_dir, experiment_id_str) - - dataset = DatasetLoader.load(dataset_parameters) - - trainer = Trainer(dataset) - - with tqdm(args.extracted_forest_size, disable=args.disable_progress) as extracted_forest_size_bar: - for extracted_forest_size in extracted_forest_size_bar: - extracted_forest_size_bar.set_description('extracted_forest_size={}'.format(extracted_forest_size)) - sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) - pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) - - model_parameters = ModelParameters( - forest_size=args.forest_size, - extracted_forest_size=extracted_forest_size, - normalize_D=args.normalize_D, - subsets_used=args.subsets_used, - normalize_weights=args.normalize_weights, - seed=seed - ) - model_parameters.save(sub_models_dir, 
experiment_id) - - model = ModelFactory.build(dataset.task, model_parameters) - - trainer.train(model, sub_models_dir) + logger.info('Experiment id: {}'.format(experiment_id)) + + with futures.ProcessPoolExecutor(len(seeds)) as executor: + list(f.result() for f in futures.as_completed(executor.submit(process_job, seed, + args, experiment_id) for seed in seeds)) diff --git a/requirements.txt b/requirements.txt index e203595..92bb6f0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,5 @@ flake8 python-dotenv>=0.5.1 scikit-learn python-dotenv -tqdm matplotlib pandas \ No newline at end of file -- GitLab