Commit 11017545 authored by Charly Lamothe

- Replace the futures concurrency with joblib Parallel (and add an optional tqdm progress bar);

- Add new best params for 7 datasets.
parent 31d12659
1 merge request: !9 Resolve "Experiment pipeline"
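At a glance, the change replaces concurrent.futures process pools with joblib's Parallel/delayed API. A minimal before/after sketch of that pattern, using a hypothetical process_job(seed) as a stand-in for the real per-seed work:

from concurrent import futures
from joblib import Parallel, delayed

def process_job(seed):
    # hypothetical stand-in for the per-seed search/training work
    return seed * seed

if __name__ == '__main__':
    seeds = [1, 2, 3, 4]

    # Before: one future per seed, results gathered in completion order.
    with futures.ProcessPoolExecutor(len(seeds)) as executor:
        fs = [executor.submit(process_job, seed) for seed in seeds]
        results_before = [f.result() for f in futures.as_completed(fs)]

    # After: joblib dispatches the same jobs across all cores (n_jobs=-1)
    # and returns the results in input order.
    results_after = Parallel(n_jobs=-1)(delayed(process_job)(seed) for seed in seeds)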
@@ -20,13 +20,3 @@ class ModelFactory(object):
         else:
             raise ValueError("Unsupported task '{}'".format(task))
         return model_func(model_parameters)
-
-    @staticmethod
-    def load(task, directory_path, experiment_id, model_raw_results):
-        raise NotImplementedError
-        model_parameters = ModelParameters.load(directory_path, experiment_id)
-        model = ModelFactory.build(task, model_parameters)
-        # todo: do what is needed here to properly restore the model
-        model.set_forest(model_raw_results.model_object.forest)
-        model.set_weights(model_raw_results.model_object.weights)
-        return model
@@ -2,6 +2,8 @@ import os
 import json
 import pickle
 from copy import deepcopy
+import contextlib
+import joblib


 def resolve_experiment_id(models_dir):
@@ -76,3 +78,25 @@ def change_binary_func_load(base_load_function):
         y = binarize_class_data(y, possible_classes[-1])
         return X, y
     return func_load
+
+@contextlib.contextmanager
+def tqdm_joblib(tqdm_object):
+    """Context manager to patch joblib to report into tqdm progress bar given as argument"""
+    class TqdmBatchCompletionCallback:
+        def __init__(self, time, index, parallel):
+            self.index = index
+            self.parallel = parallel
+
+        def __call__(self, index):
+            tqdm_object.update()
+            if self.parallel._original_iterator is not None:
+                self.parallel.dispatch_next()
+
+    old_batch_callback = joblib.parallel.BatchCompletionCallBack
+    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
+    try:
+        yield tqdm_object
+    finally:
+        joblib.parallel.BatchCompletionCallBack = old_batch_callback
+        tqdm_object.close()
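For reference, the context manager added above is the widely shared tqdm/joblib patching recipe: it temporarily replaces joblib.parallel.BatchCompletionCallBack so every completed batch ticks the bar, then restores the original callback and closes the bar on exit. It relies on joblib internals of this era and may break with later joblib releases. A minimal usage sketch:

from joblib import Parallel, delayed
from tqdm import tqdm
from bolsonaro.utils import tqdm_joblib

def square(x):
    # any picklable function works here
    return x * x

# The bar advances once per completed batch and is closed when the
# context exits; total should match the number of submitted jobs.
with tqdm_joblib(tqdm(total=100)) as progress_bar:
    results = Parallel(n_jobs=-1)(delayed(square)(i) for i in range(100))

In the scripts changed below, the bar is gated by the --verbose flag through tqdm's disable= parameter.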
@@ -4,7 +4,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.task import Task
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
-from bolsonaro.utils import save_obj_to_json
+from bolsonaro.utils import save_obj_to_json, tqdm_joblib

 import argparse
 import os
@@ -12,7 +12,8 @@ import pathlib
 import pickle
 import random
 from dotenv import find_dotenv, load_dotenv
-from concurrent import futures
+from joblib import Parallel, delayed
+from tqdm import tqdm
 import threading
 import numpy as np
 import math
@@ -54,7 +55,7 @@ def process_job(dataset_name, seed, param_space, args):
     bayesian_searcher = HyperparameterSearcher()
     opt = bayesian_searcher.search(dataset, param_space, args.n_iter,
-        args.cv, seed, scorer, args.verbose)
+        args.cv, seed, scorer)

     return {
         '_scorer': scorer,
@@ -66,10 +67,9 @@ def process_job(dataset_name, seed, param_space, args):
 def run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args):
     # Run one hyperparameter search job per seed
-    with futures.ProcessPoolExecutor(len(seeds)) as executor:
-        opt_results = list(f.result() for f in futures.as_completed(
-            executor.submit(process_job, dataset_name, seed, param_space, args
-        ) for seed in seeds))
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
+        opt_results = Parallel(n_jobs=-1)(delayed(process_job)(
+            dataset_name, seeds[i], param_space, args) for i in range(len(seeds)))
     return opt_results

 def compute_best_params_over_seeds(seeds, dataset_name, param_space, args):
@@ -143,7 +143,7 @@ if __name__ == "__main__":
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
     parser.add_argument('--use_variable_seed_number', action='store_true', default=DEFAULT_USE_VARIABLE_SEED_NUMBER, help='Compute the amount of random seeds depending on the dataset.')
     parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the dataset used by the estimator.')
-    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print information during the bayesian search.')
+    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
     args = parser.parse_args()

     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
@@ -3,7 +3,7 @@ from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_factory import ModelFactory
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.trainer import Trainer
-from bolsonaro.utils import resolve_experiment_id
+from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
 from bolsonaro import LOG_PATH
 from bolsonaro.error_handling.logger_factory import LoggerFactory
@@ -13,9 +13,10 @@ import json
 import pathlib
 import random
 import os
-from concurrent import futures
+from joblib import Parallel, delayed
 import threading
 import json
+from tqdm import tqdm

 def process_job(seed, parameters, experiment_id, hyperparameters):
@@ -74,11 +75,13 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
     trainer.compute_results(model, sub_models_dir)
     logger.info('Training done')

 if __name__ == "__main__":
     load_dotenv(find_dotenv('.env'))
     DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
     DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
+    DEFAULT_VERBOSE = False
     begin_random_seed_range = 1
     end_random_seed_range = 2000
@@ -98,6 +101,7 @@ if __name__ == "__main__":
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
     parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
     parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
+    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
     args = parser.parse_args()

     if args.experiment_configuration:
@@ -153,7 +157,7 @@ if __name__ == "__main__":
         indent=4
     )

-    # Train as much job as there are seeds
-    with futures.ProcessPoolExecutor(len(seeds)) as executor:
-        list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
-            parameters, experiment_id, hyperparameters) for seed in seeds))
+    # Run as much job as there are seeds
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
+        Parallel(n_jobs=-1)(delayed(process_job)(seeds[i],
+            parameters, experiment_id, hyperparameters) for i in range(len(seeds)))
 {
     "scorer": "neg_mean_squared_error",
-    "best_score_train": -11.238253315624897,
-    "best_score_test": -7.312532120669678,
+    "best_score_train": -13.33228274304088,
+    "best_score_test": -13.650326577972058,
     "best_parameters": {
-        "max_depth": 20,
         "max_features": "auto",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "min_samples_leaf": "1",
+        "max_depth": "20",
+        "n_estimators": "1000"
     },
-    "random_seed": 289
+    "random_seed": [1812, 1844, 1376, 383, 310, 1620, 54, 1502, 324, 1536, 1202, 1069, 645, 1706, 423]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.96,
-    "best_score_test": 0.956140350877193,
+    "best_score_train": 0.9562271062271059,
+    "best_score_test": 0.9514619883040936,
     "best_parameters": {
-        "max_depth": 20,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
-    }
+        "max_depth": "20",
+        "min_samples_leaf": "1",
+        "n_estimators": "1000",
+        "max_features": "log2"
+    },
+    "random_seed": [1505, 5, 484, 284, 289, 1014, 1752, 497, 1350, 781, 408, 256, 1494, 1940, 842]
 }
\ No newline at end of file
 {
     "scorer": "neg_mean_squared_error",
-    "best_score_train": -3380.975223665973,
-    "best_score_test": -2604.589761961369,
+    "best_score_train": -3565.203897624773,
+    "best_score_test": -3305.635542701523,
     "best_parameters": {
-        "max_depth": 17,
         "max_features": "auto",
-        "min_samples_leaf": 10,
-        "n_estimators": 804
+        "min_samples_leaf": "1",
+        "max_depth": "15",
+        "n_estimators": "108"
     },
-    "random_seed": 1679
+    "random_seed": [661, 1004, 469, 1399, 32, 992, 312, 895, 170, 913, 347, 787, 1596, 752, 1093]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.9767932489451476,
-    "best_score_test": 0.9861111111111112,
+    "best_score_train": 0.9667536988685814,
+    "best_score_test": 0.9738888888888889,
     "best_parameters": {
-        "max_depth": 16,
         "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "min_samples_leaf": "1",
+        "n_estimators": "1000",
+        "max_depth": "20"
     },
-    "random_seed": 1679
+    "random_seed": [1, 103, 519, 213, 953]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.9576271186440678,
-    "best_score_test": 1.0,
+    "best_score_train": 0.9541666666666668,
+    "best_score_test": 0.9155555555555556,
     "best_parameters": {
-        "max_depth": 20,
-        "max_features": "log2",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "max_features": "sqrt",
+        "min_samples_leaf": "1",
+        "max_depth": "1",
+        "n_estimators": "1000"
     },
-    "random_seed": 883
+    "random_seed": [771, 577, 1262, 261, 1942, 121, 1710, 633, 1852, 821, 423, 574, 1452, 68, 624]
 }
\ No newline at end of file
 {
     "scorer": "neg_mean_squared_error",
-    "best_score_train": -268.00052987557854,
-    "best_score_test": -206.18071759259263,
+    "best_score_train": -223.81438159498393,
+    "best_score_test": -262.4415311793658,
     "best_parameters": {
-        "max_depth": 3,
+        "max_depth": "1",
+        "min_samples_leaf": "1",
         "max_features": "sqrt",
-        "min_samples_leaf": 232,
-        "n_estimators": 16
+        "n_estimators": "1000"
     },
-    "random_seed": 1679
+    "random_seed": [1109, 509, 686, 1657, 922, 502, 1414, 1259, 1256, 1923, 1813, 1854, 136, 1129, 777]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.9857142857142858,
-    "best_score_test": 0.9722222222222222,
+    "best_score_train": 0.9846607669616517,
+    "best_score_test": 0.9796296296296295,
     "best_parameters": {
-        "max_depth": 20,
-        "max_features": "log2",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "max_depth": "20",
+        "min_samples_leaf": "1",
+        "n_estimators": "1000",
+        "max_features": "log2"
     },
-    "random_seed": 1679
+    "random_seed": [1431, 826, 1913, 168, 871, 1691, 1482, 1273, 255, 805, 1671, 448, 1217, 1213, 1160]
 }
\ No newline at end of file
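One detail worth flagging in the regenerated files: the numeric hyperparameters are now serialized as JSON strings ("n_estimators": "1000" instead of 1000), and "random_seed" became a list of seeds rather than a single value, so anything that feeds best_parameters back into scikit-learn will need to coerce the types. A hedged sketch of such a loader, with a hypothetical file path:

import json

def load_best_params(path):
    """Load a best-params file, coercing string-encoded integers back to int."""
    with open(path) as f:
        content = json.load(f)
    content['best_parameters'] = {
        key: int(value) if isinstance(value, str) and value.isdigit() else value
        for key, value in content['best_parameters'].items()
    }
    return content

params = load_best_params('best_params.json')  # hypothetical path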