import argparse
import copy
import json
import os
import pathlib
import random
import shutil
import threading

import numpy as np
from dotenv import find_dotenv, load_dotenv
from joblib import Parallel, delayed
from tqdm import tqdm

from bolsonaro import LOG_PATH
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib

def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
    """Run the whole experiment for a single random seed.

    Used as the base worker function of the seed-level parallelisation.

    :param seed_job_pb: tqdm progress bar, updated by one once this seed is done.
    :param seed: random seed driving the dataset split and model training.
    :param parameters: experiment parameter dict (CLI args or loaded JSON configuration).
    :param experiment_id: identifier of the current experiment, used in output paths.
    :param hyperparameters: forest hyperparameters for this dataset (must contain 'n_estimators').
    :param verbose: if True, display the per-extracted-forest-size progress bar.
    :return: None. Models and results are written under
        {models_dir}/{experiment_id}/seeds/{seed}/...
    """
    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
        seed, threading.get_ident()))

    seed_str = str(seed)
    experiment_id_str = str(experiment_id)
    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
        os.sep + seed_str
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)

    dataset_parameters = DatasetParameters(
        name=parameters['dataset_name'],
        test_size=parameters['test_size'],
        dev_size=parameters['dev_size'],
        random_state=seed,
        dataset_normalizer=parameters['dataset_normalizer']
    )
    dataset_parameters.save(models_dir, experiment_id_str)
    dataset = DatasetLoader.load(dataset_parameters)

    trainer = Trainer(dataset)

    if parameters['extraction_strategy'] == 'random':
        # For the 'random' strategy the base forest is trained once per seed
        # and then shared (deep-copied) by every extracted-forest-size job.
        pretrained_model_parameters = ModelParameters(
            extracted_forest_size=parameters['forest_size'],
            normalize_D=parameters['normalize_D'],
            subsets_used=parameters['subsets_used'],
            normalize_weights=parameters['normalize_weights'],
            seed=seed,
            hyperparameters=hyperparameters,
            extraction_strategy=parameters['extraction_strategy']
        )
        pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters)
        # NOTE: fixed the original 'pretraned_trainer' typo (local name only).
        pretrained_trainer = Trainer(dataset)
        pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
        pretrained_estimator.fit(
            X=pretrained_trainer._X_forest,
            y=pretrained_trainer._y_forest
        )
    else:
        pretrained_estimator = None
        pretrained_model_parameters = None

    if parameters['extraction_strategy'] != 'none':
        # One parallel job per requested extracted forest size.
        with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
            Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer,
                pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters,
                use_distillation=parameters['extraction_strategy'] == 'omp_distillation')
                for i in range(len(parameters['extracted_forest_size'])))
    else:
        # 'none' strategy: train the full base forest with a fixed size.
        forest_size = hyperparameters['n_estimators']
        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)

        # Check if a non-empty result file already exists; if so, skip retraining.
        already_exists = False
        if os.path.isdir(sub_models_dir):
            sub_models_dir_files = os.listdir(sub_models_dir)
            for file_name in sub_models_dir_files:
                if file_name == 'model_raw_results.pickle':
                    already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
                    break
                else:
                    continue
        if already_exists:
            logger.info('Base forest result already exists. Skipping...')
        else:
            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
            model_parameters = ModelParameters(
                extracted_forest_size=forest_size,
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
                seed=seed,
                hyperparameters=hyperparameters,
                extraction_strategy=parameters['extraction_strategy']
            )
            model_parameters.save(sub_models_dir, experiment_id)
            model = ModelFactory.build(dataset.task, model_parameters)
            trainer.init(model, subsets_used=parameters['subsets_used'])
            trainer.train(model)
            trainer.compute_results(model, sub_models_dir)
    logger.info(f'Training done for seed {seed_str}')
    seed_job_pb.update(1)
def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
    seed, parameters, dataset, hyperparameters, experiment_id, trainer,
    pretrained_estimator=None, pretrained_model_parameters=None, use_distillation=False):
    """Train and evaluate one model for a single extracted forest size.

    :param extracted_forest_size_job_pb: tqdm progress bar shared across size jobs.
    :param extracted_forest_size: number of trees (K) to extract from the forest.
    :param models_dir: per-seed output directory for this experiment.
    :param seed: random seed of the enclosing seed job.
    :param parameters: experiment parameter dict.
    :param dataset: the loaded dataset object.
    :param hyperparameters: forest hyperparameters for this dataset.
    :param experiment_id: identifier of the current experiment.
    :param trainer: Trainer instance shared by all size jobs of this seed.
    :param pretrained_estimator: optional already-fitted estimator ('random'
        strategy); when given, it is deep-copied instead of building a new model.
    :param pretrained_model_parameters: ModelParameters matching pretrained_estimator.
    :param use_distillation: forwarded to Trainer.train (omp_distillation strategy).
    :return: None. Results are written under {models_dir}/extracted_forest_sizes/{K}.
    """
    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
        seed, extracted_forest_size, threading.get_ident()))
    logger.info('extracted_forest_size={}'.format(extracted_forest_size))

    sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)

    # Check if a non-empty result file already exists; if so, skip this job.
    already_exists = False
    if os.path.isdir(sub_models_dir):
        sub_models_dir_files = os.listdir(sub_models_dir)
        for file_name in sub_models_dir_files:
            if file_name == 'model_raw_results.pickle':
                already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
                break
            else:
                continue
    if already_exists:
        logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
        return

    pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

    if not pretrained_estimator:
        model_parameters = ModelParameters(
            extracted_forest_size=extracted_forest_size,
            normalize_D=parameters['normalize_D'],
            subsets_used=parameters['subsets_used'],
            normalize_weights=parameters['normalize_weights'],
            seed=seed,
            hyperparameters=hyperparameters,
            extraction_strategy=parameters['extraction_strategy']
        )
        model_parameters.save(sub_models_dir, experiment_id)
        model = ModelFactory.build(dataset.task, model_parameters)
    else:
        # Reuse the per-seed pretrained forest; deep-copy so parallel size
        # jobs never mutate the shared estimator.
        model = copy.deepcopy(pretrained_estimator)
        pretrained_model_parameters.save(sub_models_dir, experiment_id)

    trainer.init(model, subsets_used=parameters['subsets_used'])
    trainer.train(model, extracted_forest_size=extracted_forest_size, seed=seed,
        use_distillation=use_distillation)
    trainer.compute_results(model, sub_models_dir)
Charly Lamothe
committed
"""
Command lines example for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05
Charly Lamothe
committed
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing
Command lines example for stage 2:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05
Charly Lamothe
committed
python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing
Command lines example for stage 3:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
Charly Lamothe
committed
python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing
Command lines example for stage 4:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 4 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 4 omp_with_params --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/compute_results.py --stage 4 --experiment_ids 1 2 3 --dataset_name=california_housing
Charly Lamothe
committed
"""
load_dotenv(find_dotenv('.env'))
Charly LAMOTHE
committed
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_sizes/{extracted_forest_size}
DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
Charly Lamothe
committed
DEFAULT_VERBOSE = False
DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1
Charly Lamothe
committed
DEFAULT_EXTRACTION_STRATEGY = 'omp'
Charly Lamothe
committed
DEFAULT_OVERWRITE = False
begin_random_seed_range = 1
end_random_seed_range = 2000
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--experiment_id', nargs='?', type=int, default=None, help='Specify an experiment id. Remove already existing model with this specified experiment id.')
Charly LAMOTHE
committed
parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
Charly Lamothe
committed
parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
Charly LAMOTHE
committed
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--subsets_used', nargs='?', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
Charly Lamothe
committed
parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
Charly Lamothe
committed
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity_similarities, similarity_predictions, kmeans, ensemble.')
Charly Lamothe
committed
parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
Charly LAMOTHE
committed
if args.experiment_configuration:
with open(args.experiment_configuration_path + os.sep + \
args.experiment_configuration + '.json', 'r') as input_file:
parameters = json.load(input_file)
else:
parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'omp_distillation', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
Charly Lamothe
committed
Charly LAMOTHE
committed
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
Charly LAMOTHE
committed
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
Charly LAMOTHE
committed
hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
with open(hyperparameters_path, 'r+') as file_hyperparameter:
loaded_hyperparameters = json.load(file_hyperparameter)['best_parameters']
if args.skip_best_hyperparams:
hyperparameters = {'n_estimators': loaded_hyperparameters['n_estimators']}
else:
hyperparameters = loaded_hyperparameters
Léo Bouscarrat
committed
else:
hyperparameters = {}
"""
First case: no best hyperparameters are specified and no forest_size parameter
specified in argument, so use the DEFAULT_FOREST_SIZE.
Second case: no matter if hyperparameters are specified, the forest_size parameter
will override it.
Third implicit case: use the number of estimators found in specified hyperparameters.
"""
if len(hyperparameters) == 0 and parameters['forest_size'] is None:
hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
elif parameters['forest_size'] is not None:
hyperparameters['n_estimators'] = parameters['forest_size']
# The number of tree to extract from forest (K)
parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
Charly Lamothe
committed
np.linspace(0, args.extracted_forest_size_stop,
parameters['extracted_forest_size_samples'] + 1,
endpoint=True)[1:]).astype(np.int)).tolist()
logger.info(f"extracted forest sizes: {parameters['extracted_forest_size']}")
Charly LAMOTHE
committed
if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
Charly LAMOTHE
committed
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
# Seeds are either provided as parameters or generated at random
Charly LAMOTHE
committed
seeds = parameters['seeds'] if parameters['seeds'] is not None \
Charly LAMOTHE
committed
else [random.randint(begin_random_seed_range, end_random_seed_range) \
Charly LAMOTHE
committed
for i in range(parameters['random_seed_number'])]
if args.experiment_id:
experiment_id = args.experiment_id
Charly Lamothe
committed
if args.overwrite:
shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
else:
# Resolve the next experiment id number (last id + 1)
experiment_id = resolve_experiment_id(parameters['models_dir'])
logger.info('Experiment id: {}'.format(experiment_id))
Charly LAMOTHE
committed
"""
If the experiment configuration isn't coming from
Charly LAMOTHE
committed
an already existing file, save it to a json file to
keep trace of it (either a specified path, either in 'unnamed' dir.).
Charly LAMOTHE
committed
"""
if args.experiment_configuration is None:
if args.save_experiment_configuration:
if len(args.save_experiment_configuration) != 2:
raise ValueError('save_experiment_configuration must have two parameters.')
elif int(args.save_experiment_configuration[0]) not in list(range(1, 6)):
raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 5]).')
output_experiment_stage_path = os.path.join(args.experiment_configuration_path,
args.dataset_name, 'stage' + args.save_experiment_configuration[0])
pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True)
output_experiment_configuration_path = os.path.join(output_experiment_stage_path,
args.save_experiment_configuration[1] + '.json')
else:
pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
output_experiment_configuration_path = os.path.join(
args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format(
experiment_id))
with open(output_experiment_configuration_path, 'w') as output_file:
Charly LAMOTHE
committed
json.dump(
parameters,
output_file,
indent=4
)
Charly Lamothe
committed
# Run as much job as there are seeds
Charly Lamothe
committed
with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb:
Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i],
parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))