Skip to content
Snippets Groups Projects
Commit c66d117d authored by Léo Bouscarrat's avatar Léo Bouscarrat
Browse files

When training, check whether Bayesian search results exist; if so, use them....

When training, check whether Bayesian search results exist; if so, use them. Exception: for forest_size, use the value given by the parser if applicable
parent bf5803b6
No related branches found
No related tags found
2 merge requests!6Resolve "Gridsearching of the base forest",!3clean scripts
models/* models/*
results/*
experiments/*
*/.kile/* */.kile/*
*.kilepr *.kilepr
......
...@@ -45,7 +45,7 @@ class DatasetLoader(object): ...@@ -45,7 +45,7 @@ class DatasetLoader(object):
elif name == '20newsgroups_vectorized': elif name == '20newsgroups_vectorized':
dataset_loading_func = fetch_20newsgroups_vectorized dataset_loading_func = fetch_20newsgroups_vectorized
task = Task.CLASSIFICATION task = Task.CLASSIFICATION
elif name == 'lfw_people': elif name == 'lfw_people': # needs PIL (image dataset)
dataset_loading_func = fetch_lfw_people dataset_loading_func = fetch_lfw_people
task = Task.CLASSIFICATION task = Task.CLASSIFICATION
elif name == 'lfw_pairs': elif name == 'lfw_pairs':
......
...@@ -5,16 +5,13 @@ import os ...@@ -5,16 +5,13 @@ import os
class ModelParameters(object): class ModelParameters(object):
def __init__(self, forest_size, extracted_forest_size, normalize_D, subsets_used, seed=None): def __init__(self, extracted_forest_size, normalize_D, subsets_used, hyperparameters, seed=None):
self._forest_size = forest_size
self._extracted_forest_size = extracted_forest_size self._extracted_forest_size = extracted_forest_size
self._normalize_D = normalize_D self._normalize_D = normalize_D
self._subsets_used = subsets_used self._subsets_used = subsets_used
self._seed = seed self._seed = seed
self._hyperparameters = hyperparameters
@property
def forest_size(self):
return self._forest_size
@property @property
def extracted_forest_size(self): def extracted_forest_size(self):
...@@ -32,6 +29,10 @@ class ModelParameters(object): ...@@ -32,6 +29,10 @@ class ModelParameters(object):
def seed(self): def seed(self):
return self._seed return self._seed
@property
def hyperparameters(self):
return self._hyperparameters
def save(self, directory_path, experiment_id): def save(self, directory_path, experiment_id):
save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
self.__dict__) self.__dict__)
......
...@@ -12,7 +12,7 @@ class OmpForestRegressor(BaseEstimator): ...@@ -12,7 +12,7 @@ class OmpForestRegressor(BaseEstimator):
DEFAULT_SCORE_METRIC = 'mse' DEFAULT_SCORE_METRIC = 'mse'
def __init__(self, models_parameters): def __init__(self, models_parameters):
self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size, self._regressor = RandomForestRegressor(**models_parameters.hyperparameters,
random_state=models_parameters.seed) random_state=models_parameters.seed)
self._models_parameters = models_parameters self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__) self._logger = LoggerFactory.create(LOG_PATH, __name__)
......
...@@ -35,7 +35,7 @@ if __name__ == "__main__": ...@@ -35,7 +35,7 @@ if __name__ == "__main__":
load_dotenv(find_dotenv('.env.example')) load_dotenv(find_dotenv('.env.example'))
DEFAULT_CV = 3 DEFAULT_CV = 3
DEFAULT_N_ITER = 30 DEFAULT_N_ITER = 50
DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000), DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
'min_samples_leaf': Integer(1, 1000), 'min_samples_leaf': Integer(1, 1000),
'max_depth': Integer(1, 20), 'max_depth': Integer(1, 20),
...@@ -56,7 +56,7 @@ if __name__ == "__main__": ...@@ -56,7 +56,7 @@ if __name__ == "__main__":
begin_random_seed_range = 1 begin_random_seed_range = 1
end_random_seed_range = 2000 end_random_seed_range = 2000
if args.seed is not None: if args.seed is None:
random_seed = random.randint(begin_random_seed_range, end_random_seed_range) random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
else: else:
random_seed = args.seed random_seed = args.seed
......
...@@ -9,6 +9,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory ...@@ -9,6 +9,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
from dotenv import find_dotenv, load_dotenv from dotenv import find_dotenv, load_dotenv
import argparse import argparse
import json
import pathlib import pathlib
import random import random
import os import os
...@@ -22,7 +23,6 @@ if __name__ == "__main__": ...@@ -22,7 +23,6 @@ if __name__ == "__main__":
DEFAULT_DATASET_NAME = 'boston' DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = None DEFAULT_DATASET_NORMALIZER = None
DEFAULT_FOREST_SIZE = 100
DEFAULT_EXTRACTED_FOREST_SIZE = 10 DEFAULT_EXTRACTED_FOREST_SIZE = 10
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
...@@ -39,7 +39,7 @@ if __name__ == "__main__": ...@@ -39,7 +39,7 @@ if __name__ == "__main__":
parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.') parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.') parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.') parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.')
...@@ -62,9 +62,20 @@ if __name__ == "__main__": ...@@ -62,9 +62,20 @@ if __name__ == "__main__":
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
seeds = args.seeds if args.seeds is not None \ seeds = args.seeds if args.seeds is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \ else [random.randint(begin_random_seed_range, end_random_seed_range)
for i in range(args.random_seed_number)] for i in range(args.random_seed_number)]
path_hyperparameter = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
if os.path.exists(path_hyperparameter):
with open(path_hyperparameter, 'r+') as file_hyperparameter:
hyperparameters = json.load(file_hyperparameter)['best_parameters']
else:
hyperparameters = {}
if args.forest_size is not None:
hyperparameters['n_estimators'] = args.forest_size
experiment_id = resolve_experiment_id(args.models_dir) experiment_id = resolve_experiment_id(args.models_dir)
experiment_id_str = str(experiment_id) experiment_id_str = str(experiment_id)
...@@ -98,11 +109,11 @@ if __name__ == "__main__": ...@@ -98,11 +109,11 @@ if __name__ == "__main__":
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters( model_parameters = ModelParameters(
forest_size=args.forest_size,
extracted_forest_size=extracted_forest_size, extracted_forest_size=extracted_forest_size,
normalize_D=args.normalize_D, normalize_D=args.normalize_D,
subsets_used=args.subsets_used, subsets_used=args.subsets_used,
seed=seed seed=seed,
hyperparameters=hyperparameters
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment