Skip to content
Snippets Groups Projects
Commit e319343d authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Compute the best hyperparameters over k seeds, and select the most common ones.

parent 254e846e
No related branches found
No related tags found
1 merge request!9Resolve "Experiment pipeline"
...@@ -30,6 +30,11 @@ class DatasetLoader(object): ...@@ -30,6 +30,11 @@ class DatasetLoader(object):
'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people', 'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',
'lfw_pairs', 'covtype', 'rcv1', 'california_housing'] 'lfw_pairs', 'covtype', 'rcv1', 'california_housing']
dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,
'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,
'20newsgroups_vectorized':3, 'lfw_people':3,
'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3}
@staticmethod @staticmethod
def load(dataset_parameters): def load(dataset_parameters):
name = dataset_parameters.name name = dataset_parameters.name
......
...@@ -12,6 +12,12 @@ import pathlib ...@@ -12,6 +12,12 @@ import pathlib
import pickle import pickle
import random import random
from dotenv import find_dotenv, load_dotenv from dotenv import find_dotenv, load_dotenv
from concurrent import futures
import threading
import numpy as np
import math
from collections import Counter
from itertools import chain, combinations
""" """
I had to install skopt from this repository I had to install skopt from this repository
...@@ -34,6 +40,87 @@ def clean_numpy_int_list(list_n): ...@@ -34,6 +40,87 @@ def clean_numpy_int_list(list_n):
clean_numpy_int_list(elem) if type(elem) == list else elem clean_numpy_int_list(elem) if type(elem) == list else elem
for elem in list_n] for elem in list_n]
def process_job(dataset_name, seed, param_space, args):
    """Run one Bayesian hyperparameter search on `dataset_name` with a fixed seed.

    Returns a dict holding the scorer name, the train/test scores of the best
    estimator found, the cleaned best parameters, and the seed that was used.
    """
    # One log file per (seed, thread) so parallel jobs do not interleave output.
    logger = LoggerFactory.create(LOG_PATH, 'hyperparameter-searcher_seed{}_ti{}'.format(
        seed, threading.get_ident()))
    logger.info('seed={}'.format(seed))

    dataset = DatasetLoader.load_default(dataset_name, seed)

    # Regression tasks are scored with negated MSE, everything else with accuracy.
    scorer = 'neg_mean_squared_error' if dataset.task == Task.REGRESSION else 'accuracy'

    searcher = HyperparameterSearcher()
    optimizer = searcher.search(dataset, param_space, args.n_iter,
                                args.cv, seed, scorer, args.verbose)

    results = {
        '_scorer': scorer,
        '_best_score_train': optimizer.best_score_,
        '_best_score_test': optimizer.score(dataset.X_test, dataset.y_test),
        '_best_parameters': clean_numpy_int_dict(optimizer.best_params_),
        '_random_seed': seed,
    }
    return results
def run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args):
    """Launch one hyperparameter-search process per seed and collect the results.

    Results are gathered in completion order, so their order does not
    necessarily match the order of `seeds`.
    """
    # One worker process per seed; each runs a full search independently.
    with futures.ProcessPoolExecutor(len(seeds)) as executor:
        submitted = [executor.submit(process_job, dataset_name, seed, param_space, args)
                     for seed in seeds]
        opt_results = [done.result() for done in futures.as_completed(submitted)]
    return opt_results
def compute_best_params_over_seeds(seeds, dataset_name, param_space, args):
    """Search hyperparameters once per seed and aggregate into one result.

    Runs one Bayesian search job per seed, then builds a consensus parameter
    set: each per-seed best-parameter dict is turned into 'name_value' tokens,
    all token pair combinations are counted across seeds, and the most frequent
    pairs fill the final dict one parameter at a time.

    Returns a dict mirroring a single job's result, with train/test scores
    averaged over seeds and '_random_seed' holding the list of seeds used.
    """
    opt_results = run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args)

    # One best-parameter dict per seed.
    all_best_params = [opt_result['_best_parameters'] for opt_result in opt_results]

    # Stringify each hyperparameter as 'name_value' so (name, value)
    # co-occurrences can be counted across seeds.
    stringified_best_params = [
        [key + '_' + str(value) for key, value in current_best_params.items()]
        for current_best_params in all_best_params
    ]

    # Count every pair combination of hyperparameter tokens, most common first.
    pair_combinations = chain.from_iterable(
        combinations(line, 2) for line in stringified_best_params)
    most_common_pair_combinations = Counter(pair_combinations).most_common()

    # Select the most frequent hyperparameter values until every parameter
    # name seen in the first result has a value.
    all_param_names = all_best_params[0].keys()
    best_params = dict()
    for pair, _ in most_common_pair_combinations:
        for element in pair:
            # Token format is '<name>_<value>'; the name itself may contain
            # underscores, so split on the last underscore only.
            param, _sep, value = element.rpartition('_')
            if param not in best_params:
                # NOTE(review): `value` is a string here even when the original
                # hyperparameter was numeric — TODO confirm downstream callers
                # coerce the type before fitting.
                best_params[param] = value
        # Fixed: the original `break` only exited the inner loop, so the outer
        # loop kept iterating over remaining pairs after all params were set.
        if len(best_params) == len(all_param_names):
            break

    return {
        # Fixed: previously this field was filled with
        # opt_results[0]['_best_parameters'] instead of the scorer name.
        '_scorer': opt_results[0]['_scorer'],
        '_best_score_train': np.mean(
            [opt_result['_best_score_train'] for opt_result in opt_results]),
        '_best_score_test': np.mean(
            [opt_result['_best_score_test'] for opt_result in opt_results]),
        '_best_parameters': best_params,
        '_random_seed': [opt_result['_random_seed'] for opt_result in opt_results]
    }
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env # get environment variables in .env
...@@ -41,21 +128,35 @@ if __name__ == "__main__": ...@@ -41,21 +128,35 @@ if __name__ == "__main__":
DEFAULT_CV = 3 DEFAULT_CV = 3
DEFAULT_N_ITER = 50 DEFAULT_N_ITER = 50
DEFAULT_VERBOSE = False
DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000), DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
'min_samples_leaf': Integer(1, 1000), 'min_samples_leaf': Integer(1, 1000),
'max_depth': Integer(1, 20), 'max_depth': Integer(1, 20),
'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])} 'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}
begin_random_seed_range = 1
end_random_seed_range = 2000
DEFAULT_USE_DIFFERENT_SEED_NUMBER = False
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the size of the cross-validation.') parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the size of the cross-validation.')
parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the bayesian search.') parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the bayesian search.')
parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.') parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--use_different_seed_number', action='store_true', default=DEFAULT_USE_DIFFERENT_SEED_NUMBER, help='Compute the amount of random seeds depending on the dataset.')
parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the dataset used by the estimator.') parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the dataset used by the estimator.')
parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.') parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print information during the bayesian search.')
args = parser.parse_args() args = parser.parse_args()
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
random_seed = args.seed
if args.seeds != None and args.random_seed_number > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
# Seeds are either provided as parameters or generated at random
if args.use_different_seed_number:
seeds = args.seeds if args.seeds is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(args.random_seed_number)]
for dataset_name in args.datasets: for dataset_name in args.datasets:
dataset_dir = os.path.join('experiments', dataset_name, 'stage1') dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
...@@ -63,22 +164,16 @@ if __name__ == "__main__": ...@@ -63,22 +164,16 @@ if __name__ == "__main__":
logger.info('Bayesian search on dataset {}'.format(dataset_name)) logger.info('Bayesian search on dataset {}'.format(dataset_name))
dataset = DatasetLoader.load_default(dataset_name, random_seed) """
Compute the amount of random seeds as specified in
if dataset.task == Task.REGRESSION: DatasetLoader.dataset_seed_numbers dictionary, depending on
scorer = 'neg_mean_squared_error' the dataset.
else: """
scorer = 'accuracy' if args.use_different_seed_number:
seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \
bayesian_searcher = HyperparameterSearcher() for i in range(DatasetLoader.dataset_seed_numbers[dataset_name])]
opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
args.cv, random_seed, scorer, args.verbose)
dict_results = {'_scorer': scorer, dict_results = compute_best_params_over_seeds(seeds, dataset_name,
'_best_score_train': opt.best_score_, DICT_PARAM_SPACE, args)
'_best_score_test': opt.score(dataset.X_test, dataset.y_test),
'_best_parameters': clean_numpy_int_dict(opt.best_params_),
'_random_seed': random_seed
}
save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results) save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment