diff --git a/code/bolsonaro/data/dataset.py b/code/bolsonaro/data/dataset.py
index b2f67c489be94791397649ba292a49f65dd604d8..7108eb5781e1ef2b69926e9fd2239deaa81f44e2 100644
--- a/code/bolsonaro/data/dataset.py
+++ b/code/bolsonaro/data/dataset.py
@@ -1,9 +1,8 @@
 class Dataset(object):
 
-    def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train,
+    def __init__(self, task, X_train, X_dev, X_test, y_train,
                  y_dev, y_test):
         self._task = task
-        self._dataset_parameters = dataset_parameters
         self._X_train = X_train
         self._X_dev = X_dev
         self._X_test = X_test
diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index d706b7a07751f715b6398b1f451ec9f337d00f60..ec1f321f70115542a2164c474193a246faa5639d 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -1,7 +1,8 @@
 from bolsonaro.data.dataset import Dataset
 from bolsonaro.data.task import Task
 
-from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_wine, load_breast_cancer
+from sklearn.datasets import load_boston, load_iris, load_diabetes, \
+    load_digits, load_linnerud, load_wine, load_breast_cancer
 from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
     fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
     fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
@@ -35,16 +36,16 @@ class DatasetLoader(object):
         elif name == 'breast_cancer':
             dataset_loading_func = load_breast_cancer
             task = Task.CLASSIFICATION
-        elif name == 'olivetti_faces':
+        elif name == 'olivetti_faces':  # bug (no return_X_y support)
             dataset_loading_func = fetch_olivetti_faces
             task = Task.CLASSIFICATION
-        elif name == '20newsgroups':
+        elif name == '20newsgroups':  # bug (no return_X_y support)
             dataset_loading_func = fetch_20newsgroups
             task = Task.CLASSIFICATION
         elif name == '20newsgroups_vectorized':
             dataset_loading_func = fetch_20newsgroups_vectorized
             task = Task.CLASSIFICATION
-        elif name == 'lfw_people':
+        elif name == 'lfw_people':  # needs PIL (image dataset)
             dataset_loading_func = fetch_lfw_people
             task = Task.CLASSIFICATION
         elif name == 'lfw_pairs':
@@ -87,5 +88,5 @@ class DatasetLoader(object):
             X_dev = scaler.transform(X_dev)
             X_test = scaler.transform(X_test)
 
-        return Dataset(task, dataset_parameters, X_train,
+        return Dataset(task, X_train,
                        X_dev, X_test, y_train, y_dev, y_test)
diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py
index 9854a75eb27ec83990b8ae032c58e7dec52a5e8c..88054257d4241ae0426c16ebaca1cb4985c3b65f 100644
--- a/code/bolsonaro/data/dataset_parameters.py
+++ b/code/bolsonaro/data/dataset_parameters.py
@@ -15,11 +15,11 @@ class DatasetParameters(object):
     @property
     def name(self):
         return self._name
-    
+
     @property
     def test_size(self):
         return self._test_size
-    
+
     @property
     def dev_size(self):
         return self._dev_size
diff --git a/code/bolsonaro/hyperparameter_searcher.py b/code/bolsonaro/hyperparameter_searcher.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f54c84e02f02ab8d62ba1441475cbfe2d572858
--- /dev/null
+++ b/code/bolsonaro/hyperparameter_searcher.py
@@ -0,0 +1,48 @@
+'''
+This module is used to find the best hyperparameters for a given dataset.
+'''
+
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.data.task import Task
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+from . import LOG_PATH
+
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from skopt import BayesSearchCV
+
+
+class HyperparameterSearcher(object):
+
+    def __init__(self):
+        self._logger = LoggerFactory.create(LOG_PATH, __name__)
+
+    def search(self, dataset, hyperparameter_space, n_iter, cv,
+               random_seed, scorer, verbose=False):
+        '''
+        For a given dataset and a space of hyperparameters, run a
+        Bayesian hyperparameter search.
+        :input dataset: a Dataset object
+        :input hyperparameter_space: a dictionary whose keys are hyperparameters
+            and whose values are their search spaces, defined with skopt
+        :input n_iter: the number of iterations of the Bayesian search
+        :input cv: the number of cross-validation folds
+        :input random_seed: int, the seed for the Bayesian search
+        :input scorer: str, the name of the scorer
+        :input verbose: bool, print the state of the search
+        :return: a skopt.searchcv.BayesSearchCV object
+        '''
+
+        if dataset.task == Task.CLASSIFICATION:
+            estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
+
+        if dataset.task == Task.REGRESSION:
+            estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed)
+
+        opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter,
+                            cv=cv, n_jobs=-1, random_state=random_seed,
+                            scoring=scorer, verbose=verbose)
+
+        opt.fit(dataset.X_train, dataset.y_train)
+
+        return opt
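
The search() docstring above spells out the expected arguments; below is a minimal sketch of driving the class directly, mirroring what compute_hyperparameters.py (added further down) does. The dataset name, split sizes, seed and the one-dimension search space are illustrative only.

from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
from skopt.space import Integer

# Illustrative split sizes and seed; the DatasetParameters call shape is the one
# used in compute_hyperparameters.py.
dataset_parameters = DatasetParameters('boston', test_size=0.2, dev_size=0.01,
                                       random_state=42, dataset_normalizer=None)
dataset = DatasetLoader.load(dataset_parameters)

searcher = HyperparameterSearcher()
opt = searcher.search(dataset, {'n_estimators': Integer(10, 100)},
                      n_iter=10, cv=3, random_seed=42,
                      scorer='neg_mean_squared_error')
print(opt.best_params_, opt.best_score_)
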
diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index d0981966d93e0384ef345c2d97c8becac62f9785..31a451b70578835fe4663508de9e15f99bf6cc19 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -5,17 +5,13 @@ import os
 
 class ModelParameters(object):
 
-    def __init__(self, forest_size, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed):
-        self._forest_size = forest_size
+    def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters):
         self._extracted_forest_size = extracted_forest_size
         self._normalize_D = normalize_D
         self._subsets_used = subsets_used
         self._normalize_weights = normalize_weights
         self._seed = seed
-
-    @property
-    def forest_size(self):
-        return self._forest_size
+        self._hyperparameters = hyperparameters
 
     @property
     def extracted_forest_size(self):
@@ -37,6 +33,10 @@
     def seed(self):
         return self._seed
 
+    @property
+    def hyperparameters(self):
+        return self._hyperparameters
+
     def save(self, directory_path, experiment_id):
         save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
                          self.__dict__)
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index 65193e1ad32260ad55ff885f2f663aaf9dfeea77..013a86a2e889d3ebdc1b809b6d0d50ac5a697f26 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -12,7 +12,7 @@ class OmpForestRegressor(BaseEstimator):
     DEFAULT_SCORE_METRIC = 'mse'
 
     def __init__(self, models_parameters):
-        self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
+        self._regressor = RandomForestRegressor(**models_parameters.hyperparameters,
                                                 random_state=models_parameters.seed, n_jobs=-1)
         self._models_parameters = models_parameters
         self._logger = LoggerFactory.create(LOG_PATH, __name__)
@@ -85,7 +85,7 @@ class OmpForestRegressor(BaseEstimator):
         self._regressor.fit(X, y)
         forest = self._regressor.estimators_
         return forest
-    
+
     def _extract_subforest(self, X, y):
         """
         Given an already estimated regressor: apply OMP to get the weight of each tree.
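
ModelParameters now carries the tuned hyperparameters as a plain dict, and OmpForestRegressor simply unpacks that dict into RandomForestRegressor. A sketch of the resulting construction; every value below is illustrative (the numbers in the dict come from the boston stage-1 params.json added in this patch).

from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor

model_parameters = ModelParameters(
    extracted_forest_size=100,   # illustrative
    normalize_D=False,           # illustrative
    subsets_used='train,dev',    # illustrative
    normalize_weights=False,     # illustrative
    seed=42,
    hyperparameters={'n_estimators': 1000, 'max_depth': 18,
                     'max_features': 'sqrt', 'min_samples_leaf': 1}
)

# __init__ above expands the dict:
# RandomForestRegressor(**hyperparameters, random_state=seed, n_jobs=-1)
model = OmpForestRegressor(model_parameters)
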
diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..199e060f3ee2e3a125a7af05e9205453ae079b83
--- /dev/null
+++ b/code/compute_hyperparameters.py
@@ -0,0 +1,99 @@
+import argparse
+import os
+import pathlib
+import pickle
+import random
+
+from dotenv import find_dotenv, load_dotenv
+
+"""
+I had to install skopt from this repository,
+https://github.com/darenr/scikit-optimize, which handles
+the issue described here: https://github.com/scikit-optimize/scikit-optimize/issues/762.
+"""
+from skopt.space import Categorical, Integer, Real
+
+from bolsonaro import LOG_PATH
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.task import Task
+from bolsonaro.error_handling.logger_factory import LoggerFactory
+from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
+from bolsonaro.utils import save_obj_to_json
+
+
+def clean_numpy_int_dict(dictionary):
+    return dict([a, int(x)] if type(x) == Integer else
+                [a, clean_numpy_int_dict(x)] if type(x) == dict else
+                [a, clean_numpy_int_list(x)] if type(x) == list else [a, (x)]
+                for a, x in dictionary.items())
+
+
+def clean_numpy_int_list(list_n):
+    return [int(elem) if type(elem) == Integer else
+            clean_numpy_int_dict(elem) if type(elem) == dict else
+            clean_numpy_int_list(elem) if type(elem) == list else elem
+            for elem in list_n]
+
+
+if __name__ == "__main__":
+    # get environment variables in .env
+    load_dotenv(find_dotenv('.env.example'))
+
+    DEFAULT_CV = 3
+    DEFAULT_N_ITER = 50
+    DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
+                        'min_samples_leaf': Integer(1, 1000),
+                        'max_depth': Integer(1, 20),
+                        'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}
+    DATASET_LIST = ['boston', 'iris', 'diabetes']
+    # , 'digits', 'linnerud', 'wine']
+
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the number of cross-validation folds.')
+    parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations of the Bayesian search.')
+    parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generating it randomly.')
+    parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the datasets on which to run the search.')
+    parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the Bayesian search.')
+
+    args = parser.parse_args()
+
+    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
+
+    begin_random_seed_range = 1
+    end_random_seed_range = 2000
+
+    if args.seed is None:
+        random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
+    else:
+        random_seed = args.seed
+
+    for dataset_name in args.datasets:
+
+        dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
+
+        pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
+
+        logger.info('Bayesian search on dataset {}'.format(dataset_name))
+
+        dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
+        dataset = DatasetLoader.load(dataset_parameters)
+
+        if dataset.task == Task.CLASSIFICATION:
+            scorer = 'accuracy'
+
+        if dataset.task == Task.REGRESSION:
+            scorer = 'neg_mean_squared_error'
+
+        bayesian_searcher = HyperparameterSearcher()
+        opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
+                                       args.cv, random_seed, scorer, args.verbose)
+
+        dict_results = {'_scorer': scorer,
+                        '_best_score_train': opt.best_score_,
+                        '_best_score_test': opt.score(dataset.X_test, dataset.y_test),
+                        '_best_parameters': clean_numpy_int_dict(opt.best_params_),
+                        '_random_seed': random_seed
+                        }
+
+        save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
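
DICT_PARAM_SPACE above mixes Integer dimensions with a Categorical whose second positional argument is a prior over the three choices. A small sketch, reusing two of those dimensions, of sampling the space to sanity-check the bounds before launching a long search:

from skopt.space import Categorical, Integer

space = {'n_estimators': Integer(10, 1000),
         'max_features': Categorical(['auto', 'sqrt', 'log2'], prior=[0.5, 0.25, 0.25])}

# Each skopt dimension can be sampled on its own.
for name, dimension in space.items():
    print(name, dimension.rvs(n_samples=3, random_state=0))
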
diff --git a/code/train.py b/code/train.py
index 4faff6e970da3600cc9f00714759ba2c9df7f73d..d58871db980369efa254313b9997f5f9e99c0bbe 100644
--- a/code/train.py
+++ b/code/train.py
@@ -9,6 +9,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
 from dotenv import find_dotenv, load_dotenv
 
 import argparse
+import json
 import pathlib
 import random
 import os
@@ -17,7 +18,7 @@ import threading
 import json
 
 
-def process_job(seed, parameters, experiment_id):
+def process_job(seed, parameters, experiment_id, hyperparameters):
     logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
         seed, threading.get_ident()))
     logger.info('seed={}'.format(seed))
@@ -46,12 +47,12 @@
         pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
 
         model_parameters = ModelParameters(
-            forest_size=parameters['forest_size'],
             extracted_forest_size=extracted_forest_size,
             normalize_D=parameters['normalize_D'],
             subsets_used=parameters['subsets_used'],
            normalize_weights=parameters['normalize_weights'],
-            seed=seed
+            seed=seed,
+            hyperparameters=hyperparameters
         )
         model_parameters.save(sub_models_dir, experiment_id)
 
@@ -113,6 +114,17 @@
         if type(parameters['extracted_forest_size']) == list \
         else [parameters['extracted_forest_size']]
 
+    hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
+    if os.path.exists(hyperparameters_path):
+        logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
+        with open(hyperparameters_path, 'r') as file_hyperparameter:
+            hyperparameters = json.load(file_hyperparameter)['best_parameters']
+    else:
+        hyperparameters = {}
+
+    if parameters['forest_size'] is not None:
+        hyperparameters['n_estimators'] = parameters['forest_size']
+
     if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
         logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
@@ -141,4 +153,4 @@
     # Train as much job as there are seeds
     with futures.ProcessPoolExecutor(len(seeds)) as executor:
         list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
-            parameters, experiment_id) for seed in seeds))
+            parameters, experiment_id, hyperparameters) for seed in seeds))
diff --git a/experiments/boston/stage1/params.json b/experiments/boston/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..5df0848304a35a97edb868490ce359a05ef5708b
--- /dev/null
+++ b/experiments/boston/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "neg_mean_squared_error",
+    "best_score_train": -12.900217003727361,
+    "best_score_test": -12.682938620298733,
+    "best_parameters": {
+        "max_depth": 18,
+        "max_features": "sqrt",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 883
+}
\ No newline at end of file
diff --git a/experiments/diabetes/stage1/params.json b/experiments/diabetes/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0bbcf440e18f0fb8440e0849d87786406c107e2
--- /dev/null
+++ b/experiments/diabetes/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "neg_mean_squared_error",
+    "best_score_train": -4116.794513285369,
+    "best_score_test": -3119.0787141784604,
+    "best_parameters": {
+        "max_depth": 15,
+        "max_features": "auto",
+        "min_samples_leaf": 60,
+        "n_estimators": 33
+    },
+    "random_seed": 883
+}
\ No newline at end of file
diff --git a/experiments/iris/stage1/params.json b/experiments/iris/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd852cace9852ee492649374e915b639fe785b28
--- /dev/null
+++ b/experiments/iris/stage1/params.json
@@ -0,0 +1,12 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.9576271186440678,
+    "best_score_test": 1.0,
+    "best_parameters": {
+        "max_depth": 20,
+        "max_features": "log2",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000
+    },
+    "random_seed": 883
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 92bb6f0615c8c7d58cbbf4e1cadb097bb00b27c6..d585159632442f7293f858e1fad391031658d60a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ awscli
 flake8
 python-dotenv>=0.5.1
 scikit-learn
+git+git://github.com/darenr/scikit-optimize@master
 python-dotenv
 matplotlib
 pandas
\ No newline at end of file
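
The override in train.py is easy to miss: an explicit forest_size only replaces n_estimators, while the other stage-1 values survive. In isolation (the dict mirrors the boston params.json above; the forest size is an illustrative command-line value):

# best_parameters as written by compute_hyperparameters.py for boston.
hyperparameters = {'max_depth': 18, 'max_features': 'sqrt',
                   'min_samples_leaf': 1, 'n_estimators': 1000}

forest_size = 500  # illustrative value
if forest_size is not None:
    hyperparameters['n_estimators'] = forest_size

# -> {'max_depth': 18, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 500}
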