Skip to content
Snippets Groups Projects
Commit 3e8f934b authored by Charly Lamothe's avatar Charly Lamothe
Browse files

- Fix conflicts;

- Fix small typos and issues;
- Add some best hyperparameters (TODO: fix score for regression task, and separate best parameters for regression and classif tasks).
parents 9a9a3bff c66d117d
No related branches found
No related tags found
2 merge requests!6Resolve "Gridsearching of the base forest",!3clean scripts
class Dataset(object): class Dataset(object):
def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train, def __init__(self, task, X_train, X_dev, X_test, y_train,
y_dev, y_test): y_dev, y_test):
self._task = task self._task = task
self._dataset_parameters = dataset_parameters
self._X_train = X_train self._X_train = X_train
self._X_dev = X_dev self._X_dev = X_dev
self._X_test = X_test self._X_test = X_test
......
from bolsonaro.data.dataset import Dataset from bolsonaro.data.dataset import Dataset
from bolsonaro.data.task import Task from bolsonaro.data.task import Task
from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_wine, load_breast_cancer from sklearn.datasets import load_boston, load_iris, load_diabetes, \
load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \ fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
...@@ -35,16 +36,16 @@ class DatasetLoader(object): ...@@ -35,16 +36,16 @@ class DatasetLoader(object):
elif name == 'breast_cancer': elif name == 'breast_cancer':
dataset_loading_func = load_breast_cancer dataset_loading_func = load_breast_cancer
task = Task.CLASSIFICATION task = Task.CLASSIFICATION
elif name == 'olivetti_faces': elif name == 'olivetti_faces': # bug (no return X_y)
dataset_loading_func = fetch_olivetti_faces dataset_loading_func = fetch_olivetti_faces
task = Task.CLASSIFICATION task = Task.CLASSIFICATION
elif name == '20newsgroups': elif name == '20newsgroups': # bug (no return X_y)
dataset_loading_func = fetch_20newsgroups dataset_loading_func = fetch_20newsgroups
task = Task.CLASSIFICATION task = Task.CLASSIFICATION
elif name == '20newsgroups_vectorized': elif name == '20newsgroups_vectorized':
dataset_loading_func = fetch_20newsgroups_vectorized dataset_loading_func = fetch_20newsgroups_vectorized
task = Task.CLASSIFICATION task = Task.CLASSIFICATION
elif name == 'lfw_people': elif name == 'lfw_people': # needs PIL (image dataset)
dataset_loading_func = fetch_lfw_people dataset_loading_func = fetch_lfw_people
task = Task.CLASSIFICATION task = Task.CLASSIFICATION
elif name == 'lfw_pairs': elif name == 'lfw_pairs':
...@@ -87,5 +88,5 @@ class DatasetLoader(object): ...@@ -87,5 +88,5 @@ class DatasetLoader(object):
X_dev = scaler.transform(X_dev) X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test) X_test = scaler.transform(X_test)
return Dataset(task, dataset_parameters, X_train, return Dataset(task, X_train,
X_dev, X_test, y_train, y_dev, y_test) X_dev, X_test, y_train, y_dev, y_test)
...@@ -15,11 +15,11 @@ class DatasetParameters(object): ...@@ -15,11 +15,11 @@ class DatasetParameters(object):
@property @property
def name(self): def name(self):
return self._name return self._name
@property @property
def test_size(self): def test_size(self):
return self._test_size return self._test_size
@property @property
def dev_size(self): def dev_size(self):
return self._dev_size return self._dev_size
......
'''
This module is used to find the best hyperparameters for a given dataset.
'''
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from . import LOG_PATH
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from skopt import BayesSearchCV
class HyperparameterSearcher(object):
    """Bayesian hyperparameter search over a random forest estimator."""

    def __init__(self):
        self._logger = LoggerFactory.create(LOG_PATH, __name__)

    def search(self, dataset, hyperparameter_space, n_iter, cv,
               random_seed, scorer, verbose=False):
        '''
        For a given dataset and the space of hyperparameters, does a
        bayesian hyperparameters search.
        :input dataset: a Dataset object
        :input hyperparameter_space: a dictionary, keys are hyperparameters,
            values their spaces defined with skopt
        :input n_iter: the number of iterations of the bayesian search
        :input cv: the size of the cross validation
        :input random_seed: int, the seed for the bayesian search
        :input scorer: str, the name of the scorer
        :input verbose: bool, print state of the research
        :return: a fitted skopt.searchcv.BayesSearchCV object
        :raises ValueError: if the dataset task is neither classification
            nor regression (previously this crashed later with
            UnboundLocalError on `estimator`)
        '''
        # Mutually exclusive branches (elif/else) instead of two independent
        # ifs: guarantees `estimator` is always bound before use.
        if dataset.task == Task.CLASSIFICATION:
            estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
        elif dataset.task == Task.REGRESSION:
            estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed)
        else:
            raise ValueError('Unsupported task: {}'.format(dataset.task))
        opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter,
                            cv=cv, n_jobs=-1, random_state=random_seed,
                            scoring=scorer, verbose=verbose)
        opt.fit(dataset.X_train, dataset.y_train)
        return opt
...@@ -5,17 +5,13 @@ import os ...@@ -5,17 +5,13 @@ import os
class ModelParameters(object): class ModelParameters(object):
def __init__(self, forest_size, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed): def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters):
self._forest_size = forest_size
self._extracted_forest_size = extracted_forest_size self._extracted_forest_size = extracted_forest_size
self._normalize_D = normalize_D self._normalize_D = normalize_D
self._subsets_used = subsets_used self._subsets_used = subsets_used
self._normalize_weights = normalize_weights self._normalize_weights = normalize_weights
self._seed = seed self._seed = seed
self._hyperparameters = hyperparameters
@property
def forest_size(self):
return self._forest_size
@property @property
def extracted_forest_size(self): def extracted_forest_size(self):
...@@ -37,6 +33,10 @@ class ModelParameters(object): ...@@ -37,6 +33,10 @@ class ModelParameters(object):
def seed(self): def seed(self):
return self._seed return self._seed
@property
def hyperparameters(self):
return self._hyperparameters
def save(self, directory_path, experiment_id): def save(self, directory_path, experiment_id):
save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
self.__dict__) self.__dict__)
......
...@@ -12,7 +12,7 @@ class OmpForestRegressor(BaseEstimator): ...@@ -12,7 +12,7 @@ class OmpForestRegressor(BaseEstimator):
DEFAULT_SCORE_METRIC = 'mse' DEFAULT_SCORE_METRIC = 'mse'
def __init__(self, models_parameters): def __init__(self, models_parameters):
self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size, self._regressor = RandomForestRegressor(**models_parameters.hyperparameters,
random_state=models_parameters.seed, n_jobs=-1) random_state=models_parameters.seed, n_jobs=-1)
self._models_parameters = models_parameters self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__) self._logger = LoggerFactory.create(LOG_PATH, __name__)
...@@ -85,7 +85,7 @@ class OmpForestRegressor(BaseEstimator): ...@@ -85,7 +85,7 @@ class OmpForestRegressor(BaseEstimator):
self._regressor.fit(X, y) self._regressor.fit(X, y)
forest = self._regressor.estimators_ forest = self._regressor.estimators_
return forest return forest
def _extract_subforest(self, X, y): def _extract_subforest(self, X, y):
""" """
Given an already estimated regressor: apply OMP to get the weight of each tree. Given an already estimated regressor: apply OMP to get the weight of each tree.
......
import argparse
import os
import pathlib
import pickle
import random
from dotenv import find_dotenv, load_dotenv
"""
I had to install skopt from this repository
https://github.com/darenr/scikit-optimize that handles
the issue described here https://github.com/scikit-optimize/scikit-optimize/issues/762.
"""
from skopt.space import Categorical, Integer, Real
from bolsonaro import LOG_PATH
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
from bolsonaro.utils import save_obj_to_json
def clean_numpy_int_dict(dictionary):
    """Recursively replace numpy integer values in *dictionary* with plain int.

    skopt's ``best_params_`` holds numpy integer scalars, which are not
    JSON-serializable. The original code checked ``type(x) == Integer``
    — the skopt *dimension* class, which never matches a returned value —
    so no conversion ever happened. ``isinstance(x, np.integer)`` matches
    all numpy integer scalar types.

    :param dictionary: dict whose values may be numpy ints, dicts or lists
    :return: a new dict with every numpy integer converted to ``int``
    """
    import numpy as np  # local import: module-level deps stay unchanged
    cleaned = {}
    for key, value in dictionary.items():
        if isinstance(value, np.integer):
            cleaned[key] = int(value)
        elif isinstance(value, dict):
            cleaned[key] = clean_numpy_int_dict(value)
        elif isinstance(value, list):
            cleaned[key] = clean_numpy_int_list(value)
        else:
            cleaned[key] = value
    return cleaned


def clean_numpy_int_list(list_n):
    """Recursively replace numpy integers inside *list_n* with plain int."""
    import numpy as np
    return [int(elem) if isinstance(elem, np.integer) else
            clean_numpy_int_dict(elem) if isinstance(elem, dict) else
            clean_numpy_int_list(elem) if isinstance(elem, list) else elem
            for elem in list_n]
if __name__ == "__main__":
    # get environment variables in .env
    load_dotenv(find_dotenv('.env.example'))

    # Defaults for the Bayesian search configuration (overridable via CLI).
    DEFAULT_CV = 3
    DEFAULT_N_ITER = 50
    # Hyperparameter space explored by skopt. The 'max_features' categories
    # are drawn with explicit prior probabilities [0.5, 0.25, 0.25].
    DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
                        'min_samples_leaf': Integer(1, 1000),
                        'max_depth': Integer(1, 20),
                        'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}
    DATASET_LIST = ['boston', 'iris', 'diabetes']
    # , 'digits', 'linnerud', 'wine']

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the size of the cross-validation.')
    parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the bayesian search.')
    parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.')
    parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the dataset used by the estimator.')
    parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.')
    args = parser.parse_args()

    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))

    # One shared random seed for every dataset, so per-dataset results
    # are comparable and reproducible.
    begin_random_seed_range = 1
    end_random_seed_range = 2000
    if args.seed is None:
        random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
    else:
        random_seed = args.seed

    for dataset_name in args.datasets:
        # Stage-1 (hyperparameter search) results go to a per-dataset dir.
        dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
        pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
        logger.info('Bayesian search on dataset {}'.format(dataset_name))
        dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
        dataset = DatasetLoader.load(dataset_parameters)
        # Choose the sklearn scoring string from the task type.
        if dataset.task == Task.CLASSIFICATION:
            scorer = 'accuracy'
        if dataset.task == Task.REGRESSION:
            scorer = 'neg_mean_squared_error'
        bayesian_searcher = HyperparameterSearcher()
        opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
                                       args.cv, random_seed, scorer, args.verbose)
        # NOTE(review): keys are '_'-prefixed while the consumer reads
        # 'best_parameters' (no underscore) — presumably save_obj_to_json
        # strips the leading underscore, as it also serializes __dict__ of
        # parameter objects elsewhere. TODO confirm in bolsonaro.utils.
        dict_results = {'_scorer': scorer,
                        '_best_score_train': opt.best_score_,
                        '_best_score_test': opt.score(dataset.X_test, dataset.y_test),
                        '_best_parameters': clean_numpy_int_dict(opt.best_params_),
                        '_random_seed': random_seed
                        }
        save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
...@@ -9,6 +9,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory ...@@ -9,6 +9,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
from dotenv import find_dotenv, load_dotenv from dotenv import find_dotenv, load_dotenv
import argparse import argparse
import json
import pathlib import pathlib
import random import random
import os import os
...@@ -17,7 +18,7 @@ import threading ...@@ -17,7 +18,7 @@ import threading
import json import json
def process_job(seed, parameters, experiment_id): def process_job(seed, parameters, experiment_id, hyperparameters):
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format( logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
seed, threading.get_ident())) seed, threading.get_ident()))
logger.info('seed={}'.format(seed)) logger.info('seed={}'.format(seed))
...@@ -46,12 +47,12 @@ def process_job(seed, parameters, experiment_id): ...@@ -46,12 +47,12 @@ def process_job(seed, parameters, experiment_id):
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters( model_parameters = ModelParameters(
forest_size=parameters['forest_size'],
extracted_forest_size=extracted_forest_size, extracted_forest_size=extracted_forest_size,
normalize_D=parameters['normalize_D'], normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'], subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'], normalize_weights=parameters['normalize_weights'],
seed=seed seed=seed,
hyperparameters=hyperparameters
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
...@@ -113,6 +114,17 @@ if __name__ == "__main__": ...@@ -113,6 +114,17 @@ if __name__ == "__main__":
if type(parameters['extracted_forest_size']) == list \ if type(parameters['extracted_forest_size']) == list \
else [parameters['extracted_forest_size']] else [parameters['extracted_forest_size']]
hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
if os.path.exists(hyperparameters_path):
logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
with open(hyperparameters_path, 'r+') as file_hyperparameter:
hyperparameters = json.load(file_hyperparameter)['best_parameters']
else:
hyperparameters = {}
if parameters['forest_size'] is not None:
hyperparameters['n_estimators'] = parameters['forest_size']
if parameters['seeds'] != None and parameters['random_seed_number'] > 1: if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
...@@ -141,4 +153,4 @@ if __name__ == "__main__": ...@@ -141,4 +153,4 @@ if __name__ == "__main__":
# Train as much job as there are seeds # Train as much job as there are seeds
with futures.ProcessPoolExecutor(len(seeds)) as executor: with futures.ProcessPoolExecutor(len(seeds)) as executor:
list(f.result() for f in futures.as_completed(executor.submit(process_job, seed, list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
parameters, experiment_id) for seed in seeds)) parameters, experiment_id, hyperparameters) for seed in seeds))
{
"scorer": "neg_mean_squared_error",
"best_score_train": -12.900217003727361,
"best_score_test": -12.682938620298733,
"best_parameters": {
"max_depth": 18,
"max_features": "sqrt",
"min_samples_leaf": 1,
"n_estimators": 1000
},
"random_seed": 883
}
\ No newline at end of file
{
"scorer": "neg_mean_squared_error",
"best_score_train": -4116.794513285369,
"best_score_test": -3119.0787141784604,
"best_parameters": {
"max_depth": 15,
"max_features": "auto",
"min_samples_leaf": 60,
"n_estimators": 33
},
"random_seed": 883
}
\ No newline at end of file
{
"scorer": "accuracy",
"best_score_train": 0.9576271186440678,
"best_score_test": 1.0,
"best_parameters": {
"max_depth": 20,
"max_features": "log2",
"min_samples_leaf": 1,
"n_estimators": 1000
},
"random_seed": 883
}
\ No newline at end of file
...@@ -9,6 +9,7 @@ awscli ...@@ -9,6 +9,7 @@ awscli
flake8 flake8
python-dotenv>=0.5.1 python-dotenv>=0.5.1
scikit-learn scikit-learn
git+git://github.com/darenr/scikit-optimize@master
python-dotenv python-dotenv
matplotlib matplotlib
pandas pandas
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment