Commit bf5803b6 authored by Léo Bouscarrat

Add functions to do bayesian hyperparameters search

parent 7455fd98
2 merge requests: !6 Resolve "Gridsearching of the base forest", !3 clean scripts
bolsonaro/data/dataset.py
 class Dataset(object):
-    def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train,
+    def __init__(self, task, X_train, X_dev, X_test, y_train,
                  y_dev, y_test):
         self._task = task
-        self._dataset_parameters = dataset_parameters
         self._X_train = X_train
         self._X_dev = X_dev
         self._X_test = X_test
...
bolsonaro/data/dataset_loader.py
 from bolsonaro.data.dataset import Dataset
 from bolsonaro.data.task import Task
-from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_wine, load_breast_cancer
+from sklearn.datasets import load_boston, load_iris, load_diabetes, \
+    load_digits, load_linnerud, load_wine, load_breast_cancer
 from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
     fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
     fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
@@ -35,10 +36,10 @@ class DatasetLoader(object):
         elif name == 'breast_cancer':
             dataset_loading_func = load_breast_cancer
             task = Task.CLASSIFICATION
-        elif name == 'olivetti_faces':
+        elif name == 'olivetti_faces':  # bug (no return X_y)
             dataset_loading_func = fetch_olivetti_faces
             task = Task.CLASSIFICATION
-        elif name == '20newsgroups':
+        elif name == '20newsgroups':  # bug (no return X_y)
             dataset_loading_func = fetch_20newsgroups
             task = Task.CLASSIFICATION
         elif name == '20newsgroups_vectorized':
@@ -87,5 +88,5 @@ class DatasetLoader(object):
             X_dev = scaler.transform(X_dev)
             X_test = scaler.transform(X_test)
-        return Dataset(task, dataset_parameters, X_train,
+        return Dataset(task, X_train,
                        X_dev, X_test, y_train, y_dev, y_test)
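The two `# bug (no return X_y)` comments flag loaders that, on the scikit-learn versions of the time, did not accept `return_X_y=True` (both `fetch_olivetti_faces` and `fetch_20newsgroups` gained that keyword only later). A hedged workaround sketch, not part of this commit; the `load_X_y` wrapper name is hypothetical:

```python
# Fall back to the Bunch attributes when a loader does not
# support return_X_y=True (hypothetical helper, not in the commit).
def load_X_y(dataset_loading_func, **kwargs):
    try:
        return dataset_loading_func(return_X_y=True, **kwargs)
    except TypeError:  # loader predates the return_X_y keyword
        bunch = dataset_loading_func(**kwargs)
        return bunch.data, bunch.target
```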
bolsonaro/data/dataset_parameters.py
@@ -15,11 +15,11 @@ class DatasetParameters(object):
     @property
     def name(self):
         return self._name

     @property
     def test_size(self):
         return self._test_size

     @property
     def dev_size(self):
         return self._dev_size
@@ -32,6 +32,10 @@ class DatasetParameters(object):
     def dataset_normalizer(self):
         return self._dataset_normalizer

+    @property
+    def hyperparameters(self):
+        return self._hyperparameters
+
     def save(self, directory_path, experiment_id):
         save_obj_to_json(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id),
                          self.__dict__)
...
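A hedged usage sketch for these parameters, assuming the bolsonaro imports; the constructor keywords mirror the call in the search script below, and the path and experiment id are illustrative values only:

```python
# Build parameters, load the dataset, persist the parameters as JSON.
params = DatasetParameters('boston', test_size=0.2, dev_size=0.01,
                           random_state=42, dataset_normalizer=None)
dataset = DatasetLoader.load(params)
params.save('experiments/boston/stage1', experiment_id=1)
# -> writes experiments/boston/stage1/dataset_parameters_1.json
```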
bolsonaro/hyperparameter_searcher.py (new file)
'''
This module is used to find the best hyperparameters for a given dataset.
'''
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from . import LOG_PATH

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from skopt import BayesSearchCV


class HyperparameterSearch(object):

    def __init__(self):
        self._logger = LoggerFactory.create(LOG_PATH, __name__)

    def search(self, dataset, hyperparameter_space, n_iter, cv,
               random_seed, scorer):
        '''
        For a given dataset and a space of hyperparameters, run a
        Bayesian hyperparameter search.

        :param dataset: a Dataset object
        :param hyperparameter_space: a dictionary; keys are hyperparameter
            names, values their search spaces defined with skopt
        :param n_iter: the number of iterations of the Bayesian search
        :param cv: the number of cross-validation folds
        :param random_seed: int, the seed for the Bayesian search
        :param scorer: str, the name of the scorer
        :return: a fitted skopt.searchcv.BayesSearchCV object
        '''
        if dataset.task == Task.CLASSIFICATION:
            estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
        elif dataset.task == Task.REGRESSION:
            estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed)

        opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter,
                            cv=cv, n_jobs=-1, random_state=random_seed,
                            scoring=scorer)
        opt.fit(dataset.X_train, dataset.y_train)
        return opt
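A hedged usage sketch for the searcher above; the space and scorer mirror the script below, and `dataset` is assumed to come from `DatasetLoader.load(...)`:

```python
# Run a small Bayesian search over two forest hyperparameters.
from skopt.space import Integer

space = {'n_estimators': Integer(10, 1000),
         'max_depth': Integer(1, 20)}
searcher = HyperparameterSearch()
opt = searcher.search(dataset, space, n_iter=30, cv=3,
                      random_seed=42, scorer='neg_mean_squared_error')
print(opt.best_score_, opt.best_params_)
```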
(new file, path not shown in this view)
import argparse
import os
import pathlib
import pickle
import random

import numpy as np

from dotenv import find_dotenv, load_dotenv
from skopt.space import Categorical, Integer, Real

from bolsonaro import LOG_PATH
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.hyperparameter_searcher import HyperparameterSearch
from bolsonaro.utils import save_obj_to_json


def clean_numpy_int_dict(dictionary):
    # BayesSearchCV returns numpy integer types (e.g. np.int64) in
    # best_params_, which json.dump cannot serialize; cast them back to
    # plain Python ints, recursing into nested dicts and lists.
    return dict([a, int(x)] if isinstance(x, np.integer) else
                [a, clean_numpy_int_dict(x)] if isinstance(x, dict) else
                [a, clean_numpy_int_list(x)] if isinstance(x, list) else [a, x]
                for a, x in dictionary.items())


def clean_numpy_int_list(list_n):
    return [int(elem) if isinstance(elem, np.integer) else
            clean_numpy_int_dict(elem) if isinstance(elem, dict) else
            clean_numpy_int_list(elem) if isinstance(elem, list) else elem
            for elem in list_n]
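# Example (hedged, illustrative values only): numpy ints become plain
# ints so the results can be written with save_obj_to_json.
#   clean_numpy_int_dict({'n_estimators': np.int64(108),
#                         'max_features': 'sqrt'})
#   -> {'n_estimators': 108, 'max_features': 'sqrt'}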
if __name__ == "__main__":
    # get environment variables in .env
    load_dotenv(find_dotenv('.env.example'))

    DEFAULT_CV = 3
    DEFAULT_N_ITER = 30
    DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
                        'min_samples_leaf': Integer(1, 1000),
                        'max_depth': Integer(1, 20),
                        # prior: 'auto' with probability 0.5, the others 0.25 each
                        'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}
    LIST_DATASET = ['boston', 'iris', 'diabetes']
    # , 'digits', 'linnerud', 'wine']

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the number of cross-validation folds')
    parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations of the Bayesian search')
    parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generating it randomly')
    parser.add_argument('--datasets', nargs='+', type=str, default=LIST_DATASET, help='Specify the datasets on which to run the search')
    args = parser.parse_args()

    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))

    begin_random_seed_range = 1
    end_random_seed_range = 2000

    # Generate a seed only when none was supplied.
    if args.seed is None:
        random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
    else:
        random_seed = args.seed

    for dataset_name in args.datasets:
        dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
        pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
        logger.info('Bayesian search on dataset {}'.format(dataset_name))

        dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
        dataset = DatasetLoader.load(dataset_parameters)

        if dataset.task == Task.CLASSIFICATION:
            scorer = 'accuracy'
        elif dataset.task == Task.REGRESSION:
            scorer = 'neg_mean_squared_error'

        bayesian_search = HyperparameterSearch()
        opt = bayesian_search.search(dataset, DICT_PARAM_SPACE, args.n_iter, args.cv, random_seed, scorer)

        dict_results = {'_scorer': scorer,
                        '_best_score_train': opt.best_score_,
                        '_best_score_test': opt.score(dataset.X_test, dataset.y_test),
                        '_best_parameters': clean_numpy_int_dict(opt.best_params_),
                        '_random_seed': random_seed
                        }

        save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
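A hedged sketch, not part of this commit, of how a later stage could reload the saved search results; the path and keys mirror the script above:

```python
# Read back the best parameters written by the search script.
import json
import os

with open(os.path.join('experiments', 'boston', 'stage1', 'params.json')) as f:
    results = json.load(f)
best_params = results['_best_parameters']  # e.g. RandomForestRegressor(**best_params)
```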
(existing file, path not shown)
@@ -59,7 +59,7 @@ if __name__ == "__main__":
         else [args.extracted_forest_size]

     if args.seeds != None and args.random_seed_number > 1:
-        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
+        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')

     seeds = args.seeds if args.seeds is not None \
         else [random.randint(begin_random_seed_range, end_random_seed_range) \
...
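The continuation of the last line is cut off in this view; a plausible reading, given the warning about `random_seed_number`, is the following hedged reconstruction:

```python
# With no --seeds given, draw args.random_seed_number seeds at random.
seeds = args.seeds if args.seeds is not None \
    else [random.randint(begin_random_seed_range, end_random_seed_range)
          for _ in range(args.random_seed_number)]
```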