Commit 11017545 authored by Charly Lamothe

- Replace the concurrent.futures-based concurrency with joblib Parallel (and add an optional tqdm progress bar);

- Add new best params for 7 datasets.
parent 31d12659
1 merge request: !9 Resolve "Experiment pipeline"
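For context before the diff itself: the core of the change swaps a concurrent.futures process pool for joblib.Parallel. A minimal before/after sketch of the two patterns (the work function is a hypothetical stand-in for the real per-seed jobs); note that futures.as_completed yields results in completion order, whereas Parallel returns them in input order:

from concurrent import futures
from joblib import Parallel, delayed

def work(seed):
    # Hypothetical stand-in for the real per-seed job
    return seed * 2

seeds = [1, 2, 3]

# Before: one worker process per seed, results gathered as they complete
with futures.ProcessPoolExecutor(len(seeds)) as executor:
    results = list(f.result() for f in futures.as_completed(
        executor.submit(work, seed) for seed in seeds))

# After: joblib manages the pool; n_jobs=-1 uses all available cores
results = Parallel(n_jobs=-1)(delayed(work)(seed) for seed in seeds)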
@@ -20,13 +20,3 @@ class ModelFactory(object):
         else:
             raise ValueError("Unsupported task '{}'".format(task))
         return model_func(model_parameters)
-
-    @staticmethod
-    def load(task, directory_path, experiment_id, model_raw_results):
-        raise NotImplementedError
-        model_parameters = ModelParameters.load(directory_path, experiment_id)
-        model = ModelFactory.build(task, model_parameters)
-        # TODO: do what is needed here to properly restore the model
-        model.set_forest(model_raw_results.model_object.forest)
-        model.set_weights(model_raw_results.model_object.weights)
-        return model
@@ -2,6 +2,8 @@ import os
 import json
 import pickle
 from copy import deepcopy
+import contextlib
+import joblib
 
 
 def resolve_experiment_id(models_dir):
@@ -76,3 +78,25 @@ def change_binary_func_load(base_load_function):
         y = binarize_class_data(y, possible_classes[-1])
         return X, y
     return func_load
+
+@contextlib.contextmanager
+def tqdm_joblib(tqdm_object):
+    """Context manager to patch joblib to report into the tqdm progress bar given as argument"""
+    class TqdmBatchCompletionCallback:
+        def __init__(self, time, index, parallel):
+            self.index = index
+            self.parallel = parallel
+
+        def __call__(self, index):
+            tqdm_object.update()
+            if self.parallel._original_iterator is not None:
+                self.parallel.dispatch_next()
+
+    old_batch_callback = joblib.parallel.BatchCompletionCallBack
+    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
+    try:
+        yield tqdm_object
+    finally:
+        joblib.parallel.BatchCompletionCallBack = old_batch_callback
+        tqdm_object.close()
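A minimal usage sketch of the context manager added above (the square function is hypothetical, and tqdm_joblib is assumed importable as defined here): wrapping the Parallel call in tqdm_joblib makes each completed batch advance the bar, and the finally block restores joblib's original batch-completion callback and closes the bar on exit.

from joblib import Parallel, delayed
from tqdm import tqdm

def square(x):
    # Hypothetical worker function
    return x * x

# The bar ticks once per completed batch and is closed when the block exits
with tqdm_joblib(tqdm(total=10)) as progress_bar:
    results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(10))
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]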
@@ -4,7 +4,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.task import Task
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
-from bolsonaro.utils import save_obj_to_json
+from bolsonaro.utils import save_obj_to_json, tqdm_joblib
 
 import argparse
 import os
@@ -12,7 +12,8 @@ import pathlib
 import pickle
 import random
 from dotenv import find_dotenv, load_dotenv
-from concurrent import futures
+from joblib import Parallel, delayed
+from tqdm import tqdm
 import threading
 import numpy as np
 import math
@@ -54,7 +55,7 @@ def process_job(dataset_name, seed, param_space, args):
     bayesian_searcher = HyperparameterSearcher()
     opt = bayesian_searcher.search(dataset, param_space, args.n_iter,
-        args.cv, seed, scorer, args.verbose)
+        args.cv, seed, scorer)
 
     return {
         '_scorer': scorer,
@@ -66,10 +67,9 @@ def process_job(dataset_name, seed, param_space, args):
 
 def run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args):
     # Run one hyperparameter search job per seed
-    with futures.ProcessPoolExecutor(len(seeds)) as executor:
-        opt_results = list(f.result() for f in futures.as_completed(
-            executor.submit(process_job, dataset_name, seed, param_space, args
-            ) for seed in seeds))
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
+        opt_results = Parallel(n_jobs=-1)(delayed(process_job)(
+            dataset_name, seeds[i], param_space, args) for i in range(len(seeds)))
     return opt_results
 
 def compute_best_params_over_seeds(seeds, dataset_name, param_space, args):
@@ -143,7 +143,7 @@ if __name__ == "__main__":
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly')
     parser.add_argument('--use_variable_seed_number', action='store_true', default=DEFAULT_USE_VARIABLE_SEED_NUMBER, help='Compute the number of random seeds depending on the dataset.')
     parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the datasets used by the estimator.')
-    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print information during the bayesian search.')
+    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print a tqdm progress bar.')
     args = parser.parse_args()
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
@@ -3,7 +3,7 @@ from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_factory import ModelFactory
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.trainer import Trainer
-from bolsonaro.utils import resolve_experiment_id
+from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
 from bolsonaro import LOG_PATH
 from bolsonaro.error_handling.logger_factory import LoggerFactory
@@ -13,9 +13,10 @@ import json
 import pathlib
 import random
 import os
-from concurrent import futures
+from joblib import Parallel, delayed
 import threading
 import json
+from tqdm import tqdm
 
 
 def process_job(seed, parameters, experiment_id, hyperparameters):
@@ -74,11 +75,13 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
         trainer.compute_results(model, sub_models_dir)
     logger.info('Training done')
 
+
 if __name__ == "__main__":
     load_dotenv(find_dotenv('.env'))
     DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
     DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
+    DEFAULT_VERBOSE = False
     begin_random_seed_range = 1
     end_random_seed_range = 2000
@@ -98,6 +101,7 @@ if __name__ == "__main__":
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly')
     parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
     parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the sum of the weights.')
+    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print a tqdm progress bar.')
     args = parser.parse_args()
 
     if args.experiment_configuration:
@@ -153,7 +157,7 @@ if __name__ == "__main__":
             indent=4
         )
 
-    # Train as many jobs as there are seeds
-    with futures.ProcessPoolExecutor(len(seeds)) as executor:
-        list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
-            parameters, experiment_id, hyperparameters) for seed in seeds))
+    # Run as many jobs as there are seeds
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
+        Parallel(n_jobs=-1)(delayed(process_job)(seeds[i],
+            parameters, experiment_id, hyperparameters) for i in range(len(seeds)))
 {
     "scorer": "neg_mean_squared_error",
-    "best_score_train": -11.238253315624897,
-    "best_score_test": -7.312532120669678,
+    "best_score_train": -13.33228274304088,
+    "best_score_test": -13.650326577972058,
     "best_parameters": {
-        "max_depth": 20,
-        "max_features": "auto",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "min_samples_leaf": "1",
+        "max_depth": "20",
+        "n_estimators": "1000"
     },
-    "random_seed": 289
+    "random_seed": [
+        1812,
+        1844,
+        1376,
+        383,
+        310,
+        1620,
+        54,
+        1502,
+        324,
+        1536,
+        1202,
+        1069,
+        645,
+        1706,
+        423
+    ]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.96,
-    "best_score_test": 0.956140350877193,
+    "best_score_train": 0.9562271062271059,
+    "best_score_test": 0.9514619883040936,
     "best_parameters": {
-        "max_depth": 20,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
-    }
+        "max_depth": "20",
+        "min_samples_leaf": "1",
+        "n_estimators": "1000",
+        "max_features": "log2"
+    },
+    "random_seed": [
+        1505,
+        5,
+        484,
+        284,
+        289,
+        1014,
+        1752,
+        497,
+        1350,
+        781,
+        408,
+        256,
+        1494,
+        1940,
+        842
+    ]
 }
\ No newline at end of file
 {
     "scorer": "neg_mean_squared_error",
-    "best_score_train": -3380.975223665973,
-    "best_score_test": -2604.589761961369,
+    "best_score_train": -3565.203897624773,
+    "best_score_test": -3305.635542701523,
     "best_parameters": {
-        "max_depth": 17,
-        "max_features": "auto",
-        "min_samples_leaf": 10,
-        "n_estimators": 804
+        "min_samples_leaf": "1",
+        "max_depth": "15",
+        "n_estimators": "108"
     },
-    "random_seed": 1679
+    "random_seed": [
+        661,
+        1004,
+        469,
+        1399,
+        32,
+        992,
+        312,
+        895,
+        170,
+        913,
+        347,
+        787,
+        1596,
+        752,
+        1093
+    ]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.9767932489451476,
-    "best_score_test": 0.9861111111111112,
+    "best_score_train": 0.9667536988685814,
+    "best_score_test": 0.9738888888888889,
     "best_parameters": {
-        "max_depth": 16,
-        "max_features": "sqrt",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "min_samples_leaf": "1",
+        "n_estimators": "1000",
+        "max_depth": "20"
     },
-    "random_seed": 1679
+    "random_seed": [
+        1,
+        103,
+        519,
+        213,
+        953
+    ]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.9576271186440678,
-    "best_score_test": 1.0,
+    "best_score_train": 0.9541666666666668,
+    "best_score_test": 0.9155555555555556,
     "best_parameters": {
-        "max_depth": 20,
-        "max_features": "log2",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "max_features": "sqrt",
+        "min_samples_leaf": "1",
+        "max_depth": "1",
+        "n_estimators": "1000"
     },
-    "random_seed": 883
+    "random_seed": [
+        771,
+        577,
+        1262,
+        261,
+        1942,
+        121,
+        1710,
+        633,
+        1852,
+        821,
+        423,
+        574,
+        1452,
+        68,
+        624
+    ]
 }
\ No newline at end of file
 {
     "scorer": "neg_mean_squared_error",
-    "best_score_train": -268.00052987557854,
-    "best_score_test": -206.18071759259263,
+    "best_score_train": -223.81438159498393,
+    "best_score_test": -262.4415311793658,
     "best_parameters": {
-        "max_depth": 3,
+        "max_depth": "1",
+        "min_samples_leaf": "1",
         "max_features": "sqrt",
-        "min_samples_leaf": 232,
-        "n_estimators": 16
+        "n_estimators": "1000"
     },
-    "random_seed": 1679
+    "random_seed": [
+        1109,
+        509,
+        686,
+        1657,
+        922,
+        502,
+        1414,
+        1259,
+        1256,
+        1923,
+        1813,
+        1854,
+        136,
+        1129,
+        777
+    ]
 }
\ No newline at end of file
 {
     "scorer": "accuracy",
-    "best_score_train": 0.9857142857142858,
-    "best_score_test": 0.9722222222222222,
+    "best_score_train": 0.9846607669616517,
+    "best_score_test": 0.9796296296296295,
     "best_parameters": {
-        "max_depth": 20,
-        "max_features": "log2",
-        "min_samples_leaf": 1,
-        "n_estimators": 1000
+        "max_depth": "20",
+        "min_samples_leaf": "1",
+        "n_estimators": "1000",
+        "max_features": "log2"
     },
-    "random_seed": 1679
+    "random_seed": [
+        1431,
+        826,
+        1913,
+        168,
+        871,
+        1691,
+        1482,
+        1273,
+        255,
+        805,
+        1671,
+        448,
+        1217,
+        1213,
+        1160
+    ]
 }
\ No newline at end of file