from bolsonaro import LOG_PATH
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
from bolsonaro.utils import save_obj_to_json
import argparse
import numpy as np
import os
import pathlib
import pickle
import random
from dotenv import find_dotenv, load_dotenv
"""
I had to install skopt from this repository:
https://github.com/darenr/scikit-optimize, which handles
the issue described here: https://github.com/scikit-optimize/scikit-optimize/issues/762.
"""
from skopt.space import Categorical, Integer, Real
def clean_numpy_int_dict(dictionary):
    # Recursively convert numpy integers (not JSON-serializable) to plain Python ints,
    # descending into nested dicts and lists.
    return dict([a, int(x)] if isinstance(x, np.integer) else
                [a, clean_numpy_int_dict(x)] if type(x) == dict else
                [a, clean_numpy_int_list(x)] if type(x) == list else [a, x]
                for a, x in dictionary.items())
def clean_numpy_int_list(list_n):
    # Same cleaning as above, applied element-wise to a list.
    return [int(elem) if isinstance(elem, np.integer) else
            clean_numpy_int_dict(elem) if type(elem) == dict else
            clean_numpy_int_list(elem) if type(elem) == list else elem
            for elem in list_n]
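# Illustrative example of what these helpers are for:
# clean_numpy_int_dict({'max_depth': np.int64(17), 'max_features': 'sqrt'})
# returns {'max_depth': 17, 'max_features': 'sqrt'}, which json.dump accepts.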
if __name__ == "__main__":
    # get environment variables in .env
    load_dotenv(find_dotenv())

    DEFAULT_CV = 3  # assumed default number of cross-validation folds
    DEFAULT_N_ITER = 50

    DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
                        'min_samples_leaf': Integer(1, 1000),
                        'max_depth': Integer(1, 20),
                        # second argument of Categorical = prior probability of each choice
                        'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}

    # assumed default dataset list
    DATASET_LIST = ['boston', 'iris', 'diabetes']  # , 'digits', 'linnerud', 'wine']
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the number of cross-validation folds.')
    parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the Bayesian search.')
    parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generating one randomly.')
    parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the datasets on which to run the Bayesian search.')
    parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the Bayesian search.')
args = parser.parse_args()
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
begin_random_seed_range = 1
end_random_seed_range = 2000
if args.seed is None:
random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
else:
random_seed = args.seed
for dataset_name in args.datasets:
dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
logger.info('Bayesian search on dataset {}'.format(dataset_name))
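        # DatasetParameters/DatasetLoader are project classes; presumably test_size and
        # dev_size are the fractions of the data held out for the test and dev splits.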
dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
dataset = DatasetLoader.load(dataset_parameters)
        if dataset.task == Task.REGRESSION:
            scorer = 'neg_mean_squared_error'
        else:
            scorer = 'accuracy'  # assumed fallback so classification tasks also get a scorer
bayesian_searcher = HyperparameterSearcher()
opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
args.cv, random_seed, scorer, args.verbose)
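        # The returned `opt` is assumed to expose a BayesSearchCV-like interface:
        # best_score_ is the best mean cross-validated score found during the search,
        # best_params_ the corresponding hyperparameters, and score() evaluates the
        # refitted best estimator on held-out data.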
dict_results = {'_scorer': scorer,
'_best_score_train': opt.best_score_,
'_best_score_test': opt.score(dataset.X_test, dataset.y_test),
'_best_parameters': clean_numpy_int_dict(opt.best_params_),
'_random_seed': random_seed
}
save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
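# For reference, the params.json written for each dataset should look roughly like
# the following (illustrative values only):
# {
#     "_scorer": "neg_mean_squared_error",
#     "_best_score_train": -12.3,
#     "_best_score_test": -14.8,
#     "_best_parameters": {"n_estimators": 412, "min_samples_leaf": 3, "max_depth": 17, "max_features": "sqrt"},
#     "_random_seed": 1042
# }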