diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index a93e6090e253dc9bdb3aacfc53e1c99a1f9ef120..ea51ad320918bb714930f11864712d004803e650 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -20,13 +20,3 @@ class ModelFactory(object):
         else:
             raise ValueError("Unsupported task '{}'".format(task))
         return model_func(model_parameters)
-
-    @staticmethod
-    def load(task, directory_path, experiment_id, model_raw_results):
-        raise NotImplementedError
-        model_parameters = ModelParameters.load(directory_path, experiment_id)
-        model = ModelFactory.build(task, model_parameters)
-        # todo faire ce qu'il faut ici pour rétablir correctement le modèle
-        model.set_forest(model_raw_results.model_object.forest)
-        model.set_weights(model_raw_results.model_object.weights)
-        return model
diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 10ea76921ffacdd814044fc8179eb83717429330..9dff06ac65a726642cc5efe2e6ed8b8f78f40b29 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -2,6 +2,8 @@ import os
 import json
 import pickle
 from copy import deepcopy
+import contextlib
+import joblib
 
 
 def resolve_experiment_id(models_dir):
@@ -76,3 +78,27 @@ def change_binary_func_load(base_load_function):
         y = binarize_class_data(y, possible_classes[-1])
         return X, y
     return func_load
+
+
+@contextlib.contextmanager
+def tqdm_joblib(tqdm_object):
+    """Patch joblib so that every completed batch advances ``tqdm_object``.
+
+    The progress bar is yielded for the duration of the ``with`` block. On
+    exit the original joblib callback is restored and the bar is closed.
+    """
+    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
+        # Subclass joblib's callback instead of replacing it wholesale, so its
+        # own completion bookkeeping and dispatching still run unchanged.
+        def __call__(self, *args, **kwargs):
+            # A batch may hold several tasks: advance by its size, not by one.
+            tqdm_object.update(n=self.batch_size)
+            return super().__call__(*args, **kwargs)
+
+    old_batch_callback = joblib.parallel.BatchCompletionCallBack
+    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
+    try:
+        yield tqdm_object
+    finally:
+        joblib.parallel.BatchCompletionCallBack = old_batch_callback
+        tqdm_object.close()
diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py
index a96ef68259d91929d97878fc402760941c6562d5..548a1a4e82c720b711859abd57b0ddcf295e5835 100644
--- a/code/compute_hyperparameters.py
+++ b/code/compute_hyperparameters.py
@@ -4,7 +4,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.task import Task
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
-from bolsonaro.utils import save_obj_to_json
+from bolsonaro.utils import save_obj_to_json, tqdm_joblib
 
 import argparse
 import os
@@ -12,7 +12,8 @@ import pathlib
 import pickle
 import random
 from dotenv import find_dotenv, load_dotenv
-from concurrent import futures
+from joblib import Parallel, delayed
+from tqdm import tqdm
 import threading
 import numpy as np
 import math
@@ -54,7 +55,7 @@ def process_job(dataset_name, seed, param_space, args):
 
     bayesian_searcher = HyperparameterSearcher()
     opt = bayesian_searcher.search(dataset, param_space, args.n_iter,
-        args.cv, seed, scorer, args.verbose)
+        args.cv, seed, scorer)
 
     return {
         '_scorer': scorer,
@@ -66,10 +67,9 @@ def process_job(dataset_name, seed, param_space, args):
 
 def run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args):
     # Run one hyperparameter search job per seed
-    with futures.ProcessPoolExecutor(len(seeds)) as executor:
-        opt_results = list(f.result() for f in futures.as_completed(
-            executor.submit(process_job, dataset_name, seed, param_space, args
-            ) for seed in seeds))
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)):
+        opt_results = Parallel(n_jobs=-1)(delayed(process_job)(
+            dataset_name, seed, param_space, args) for seed in seeds)
     return opt_results
 
 def compute_best_params_over_seeds(seeds, dataset_name, param_space, args):
@@ -143,7 +143,7 @@ if __name__ == "__main__":
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
     parser.add_argument('--use_variable_seed_number', action='store_true', default=DEFAULT_USE_VARIABLE_SEED_NUMBER, help='Compute the amount of random seeds depending on the dataset.')
     parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the dataset used by the estimator.')
-    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print information during the bayesian search.')
+    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
     args = parser.parse_args()
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
diff --git a/code/train.py b/code/train.py
index 507c7ddf69df819caa746dfa0ec106f183625c16..38f2887e16de5a0967500a304034b5934dcd4a5d 100644
--- a/code/train.py
+++ b/code/train.py
@@ -3,7 +3,7 @@ from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_factory import ModelFactory
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.trainer import Trainer
-from bolsonaro.utils import resolve_experiment_id
+from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
 from bolsonaro import LOG_PATH
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 
@@ -13,9 +13,10 @@ import json
 import pathlib
 import random
 import os
-from concurrent import futures
+from joblib import Parallel, delayed
 import threading
 import json
+from tqdm import tqdm
 
 
 def process_job(seed, parameters, experiment_id, hyperparameters):
@@ -74,11 +75,13 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
         trainer.compute_results(model, sub_models_dir)
     logger.info('Training done')
 
+
 if __name__ == "__main__":
     load_dotenv(find_dotenv('.env'))
     DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
     DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
+    DEFAULT_VERBOSE = False
     begin_random_seed_range = 1
     end_random_seed_range = 2000
 
@@ -98,6 +101,7 @@ if __name__ == "__main__":
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
     parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
     parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
+    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
     args = parser.parse_args()
 
     if args.experiment_configuration:
@@ -153,7 +157,7 @@ if __name__ == "__main__":
             indent=4
         )
 
-    # Train as much job as there are seeds
-    with futures.ProcessPoolExecutor(len(seeds)) as executor:
-        list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
-            parameters, experiment_id, hyperparameters) for seed in seeds))
+    # Run one training job per seed
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)):
+        Parallel(n_jobs=-1)(delayed(process_job)(seed,
+            parameters, experiment_id, hyperparameters) for seed in seeds)
diff --git a/experiments/boston/stage1/params.json b/experiments/boston/stage1/params.json
index 6a5a1e9a05d8f081af6abe38fa0aadfff1e736b8..f2f3abbe37d05ef6123ce819ebee4dfac2b254a7 100644
--- a/experiments/boston/stage1/params.json
+++ b/experiments/boston/stage1/params.json
@@ -1,12 +1,28 @@
 {
     "scorer": "neg_mean_squared_error",
-    "best_score_train": -11.238253315624897,
-    "best_score_test":
-7.312532120669678, + "best_score_train": -13.33228274304088, + "best_score_test": -13.650326577972058, "best_parameters": { - "max_depth": 20, "max_features": "auto", - "min_samples_leaf": 1, - "n_estimators": 1000 + "min_samples_leaf": "1", + "max_depth": "20", + "n_estimators": "1000" }, - "random_seed": 289 + "random_seed": [ + 1812, + 1844, + 1376, + 383, + 310, + 1620, + 54, + 1502, + 324, + 1536, + 1202, + 1069, + 645, + 1706, + 423 + ] } \ No newline at end of file diff --git a/experiments/breast_cancer/stage1/params.json b/experiments/breast_cancer/stage1/params.json index 6b1b22834e0d69a477fa7b4f06b1169b9c3e3016..43739c2db86556421d35a2064aa554b532a5b413 100644 --- a/experiments/breast_cancer/stage1/params.json +++ b/experiments/breast_cancer/stage1/params.json @@ -1,11 +1,28 @@ { "scorer": "accuracy", - "best_score_train": 0.96, - "best_score_test": 0.956140350877193, + "best_score_train": 0.9562271062271059, + "best_score_test": 0.9514619883040936, "best_parameters": { - "max_depth": 20, - "max_features": "sqrt", - "min_samples_leaf": 1, - "n_estimators": 1000 - } + "max_depth": "20", + "min_samples_leaf": "1", + "n_estimators": "1000", + "max_features": "log2" + }, + "random_seed": [ + 1505, + 5, + 484, + 284, + 289, + 1014, + 1752, + 497, + 1350, + 781, + 408, + 256, + 1494, + 1940, + 842 + ] } \ No newline at end of file diff --git a/experiments/diabetes/stage1/params.json b/experiments/diabetes/stage1/params.json index 6c7fbb12b41968d9b8367161a2bb607ad954a65b..472e7382583edd3ce381470a7ba9902aff443f5d 100644 --- a/experiments/diabetes/stage1/params.json +++ b/experiments/diabetes/stage1/params.json @@ -1,12 +1,28 @@ { "scorer": "neg_mean_squared_error", - "best_score_train": -3380.975223665973, - "best_score_test": -2604.589761961369, + "best_score_train": -3565.203897624773, + "best_score_test": -3305.635542701523, "best_parameters": { - "max_depth": 17, "max_features": "auto", - "min_samples_leaf": 10, - "n_estimators": 804 + "min_samples_leaf": 
"1", + "max_depth": "15", + "n_estimators": "108" }, - "random_seed": 1679 + "random_seed": [ + 661, + 1004, + 469, + 1399, + 32, + 992, + 312, + 895, + 170, + 913, + 347, + 787, + 1596, + 752, + 1093 + ] } \ No newline at end of file diff --git a/experiments/digits/stage1/params.json b/experiments/digits/stage1/params.json index e5662c5afcdfc4ede1a29ef778393d8f8cf95156..2cf1c4b1f5cc91bda737fe2f68c81d37d2682f2a 100644 --- a/experiments/digits/stage1/params.json +++ b/experiments/digits/stage1/params.json @@ -1,12 +1,18 @@ { "scorer": "accuracy", - "best_score_train": 0.9767932489451476, - "best_score_test": 0.9861111111111112, + "best_score_train": 0.9667536988685814, + "best_score_test": 0.9738888888888889, "best_parameters": { - "max_depth": 16, "max_features": "sqrt", - "min_samples_leaf": 1, - "n_estimators": 1000 + "min_samples_leaf": "1", + "n_estimators": "1000", + "max_depth": "20" }, - "random_seed": 1679 + "random_seed": [ + 1, + 103, + 519, + 213, + 953 + ] } \ No newline at end of file diff --git a/experiments/iris/stage1/params.json b/experiments/iris/stage1/params.json index fd852cace9852ee492649374e915b639fe785b28..a91c658dfdf58b7338807194dbbe1f01b70aa431 100644 --- a/experiments/iris/stage1/params.json +++ b/experiments/iris/stage1/params.json @@ -1,12 +1,28 @@ { "scorer": "accuracy", - "best_score_train": 0.9576271186440678, - "best_score_test": 1.0, + "best_score_train": 0.9541666666666668, + "best_score_test": 0.9155555555555556, "best_parameters": { - "max_depth": 20, - "max_features": "log2", - "min_samples_leaf": 1, - "n_estimators": 1000 + "max_features": "sqrt", + "min_samples_leaf": "1", + "max_depth": "1", + "n_estimators": "1000" }, - "random_seed": 883 + "random_seed": [ + 771, + 577, + 1262, + 261, + 1942, + 121, + 1710, + 633, + 1852, + 821, + 423, + 574, + 1452, + 68, + 624 + ] } \ No newline at end of file diff --git a/experiments/linnerud/stage1/params.json b/experiments/linnerud/stage1/params.json index 
7db121c4ccb6c0add73e7e554349efbe17410dbe..a1573d389eb93336ada91ee31ac66a7de166cd33 100644 --- a/experiments/linnerud/stage1/params.json +++ b/experiments/linnerud/stage1/params.json @@ -1,12 +1,28 @@ { "scorer": "neg_mean_squared_error", - "best_score_train": -268.00052987557854, - "best_score_test": -206.18071759259263, + "best_score_train": -223.81438159498393, + "best_score_test": -262.4415311793658, "best_parameters": { - "max_depth": 3, + "max_depth": "1", + "min_samples_leaf": "1", "max_features": "sqrt", - "min_samples_leaf": 232, - "n_estimators": 16 + "n_estimators": "1000" }, - "random_seed": 1679 + "random_seed": [ + 1109, + 509, + 686, + 1657, + 922, + 502, + 1414, + 1259, + 1256, + 1923, + 1813, + 1854, + 136, + 1129, + 777 + ] } \ No newline at end of file diff --git a/experiments/wine/stage1/params.json b/experiments/wine/stage1/params.json index 25950f99a1e9ea38247c4c4b76628aad87442511..99795a6caf3b2c0639df4d0bc3306f0906193309 100644 --- a/experiments/wine/stage1/params.json +++ b/experiments/wine/stage1/params.json @@ -1,12 +1,28 @@ { "scorer": "accuracy", - "best_score_train": 0.9857142857142858, - "best_score_test": 0.9722222222222222, + "best_score_train": 0.9846607669616517, + "best_score_test": 0.9796296296296295, "best_parameters": { - "max_depth": 20, - "max_features": "log2", - "min_samples_leaf": 1, - "n_estimators": 1000 + "max_depth": "20", + "min_samples_leaf": "1", + "n_estimators": "1000", + "max_features": "log2" }, - "random_seed": 1679 + "random_seed": [ + 1431, + 826, + 1913, + 168, + 871, + 1691, + 1482, + 1273, + 255, + 805, + 1671, + 448, + 1217, + 1213, + 1160 + ] } \ No newline at end of file