train.py
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from dotenv import find_dotenv, load_dotenv
import argparse
import json
import pathlib
import random
import os
from joblib import Parallel, delayed
import threading
from tqdm import tqdm
import numpy as np
def process_job(seed, parameters, experiment_id, hyperparameters):
"""
Experiment function.
Will be used as base function for worker in multithreaded application.
:param seed:
:param parameters:
:param experiment_id:
:return:
"""
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
seed, threading.get_ident()))
logger.info('seed={}'.format(seed))
seed_str = str(seed)
experiment_id_str = str(experiment_id)
models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
os.sep + seed_str
pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
dataset_parameters = DatasetParameters(
name=parameters['dataset_name'],
test_size=parameters['test_size'],
dev_size=parameters['dev_size'],
random_state=seed,
dataset_normalizer=parameters['dataset_normalizer']
)
dataset_parameters.save(models_dir, experiment_id_str)
dataset = DatasetLoader.load(dataset_parameters)
trainer = Trainer(dataset)
if parameters['extraction_strategy'] != 'none':
for extracted_forest_size in parameters['extracted_forest_size']:
logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
extracted_forest_size=extracted_forest_size,
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.init(model)
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
else:
forest_size = hyperparameters['n_estimators']
logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
extracted_forest_size=forest_size,
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.init(model)
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
logger.info('Training done')
"""
Example for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --save_experiment_configuration 1 none_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --save_experiment_configuration 1 random_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --save_experiment_configuration 1 omp_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6
"""
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_sizes/{extracted_forest_size}
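    # e.g. (illustrative): models/1/seeds/42/extracted_forest_sizes/33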
DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
DEFAULT_VERBOSE = False
DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1
DEFAULT_EXTRACTION_STRATEGY = 'omp'
begin_random_seed_range = 1
end_random_seed_range = 2000
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overrides all other parameters.')
parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify whether to normalize the forest predictions by dividing each prediction vector by its L2 norm.')
    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
    parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly.')
    parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train, OMP on train+dev.')
parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if they exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of jobs used for the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random or none.')
args = parser.parse_args()
if args.experiment_configuration:
with open(args.experiment_configuration_path + os.sep + \
args.experiment_configuration + '.json', 'r') as input_file:
parameters = json.load(input_file)
else:
parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none']:
        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
if os.path.exists(hyperparameters_path) and not args.skip_best_hyperparams:
logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
with open(hyperparameters_path, 'r+') as file_hyperparameter:
hyperparameters = json.load(file_hyperparameter)['best_parameters']
else:
hyperparameters = {}
"""
First case: no best hyperparameters are specified and no forest_size parameter
specified in argument, so use the DEFAULT_FOREST_SIZE.
Second case: no matter if hyperparameters are specified, the forest_size parameter
will override it.
Third implicit case: use the number of estimators found in specified hyperparameters.
"""
if len(hyperparameters) == 0 and parameters['forest_size'] is None:
hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
elif parameters['forest_size'] is not None:
hyperparameters['n_estimators'] = parameters['forest_size']
    # The number of trees to extract from the forest (K)
    parameters['extracted_forest_size'] = (hyperparameters['n_estimators'] *
        np.linspace(0, args.extracted_forest_size_stop,
                    parameters['extracted_forest_size_samples'] + 1,
                    endpoint=False)[1:]).astype(int).tolist()
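    # Illustration (assumed values): with n_estimators=100, extracted_forest_size_samples=4
    # and extracted_forest_size_stop=0.5, np.linspace(0, 0.5, 5, endpoint=False)[1:] gives
    # [0.1, 0.2, 0.3, 0.4], so extracted_forest_size becomes roughly [10, 20, 30, 40].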
    if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
# Seeds are either provided as parameters or generated at random
seeds = parameters['seeds'] if parameters['seeds'] is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(parameters['random_seed_number'])]
# Resolve the next experiment id number (last id + 1)
experiment_id = resolve_experiment_id(parameters['models_dir'])
logger.info('Experiment id: {}'.format(experiment_id))
parameters['experiment_id'] = experiment_id
"""
If the experiment configuration isn't coming from
an already existing file, save it to a json file to
keep trace of it (either a specified path, either in 'unnamed' dir.).
"""
if args.experiment_configuration is None:
if args.save_experiment_configuration:
if len(args.save_experiment_configuration) != 2:
raise ValueError('save_experiment_configuration must have two parameters.')
elif int(args.save_experiment_configuration[0]) not in list(range(1, 5)):
raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 4]).')
output_experiment_configuration_path = os.path.join(args.experiment_configuration_path,
args.dataset_name, 'stage' + args.save_experiment_configuration[0],
args.save_experiment_configuration[1] + '_{}.json'.format(
experiment_id))
else:
pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
output_experiment_configuration_path = os.path.join(
args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format(
experiment_id))
with open(output_experiment_configuration_path, 'w') as output_file:
json.dump(
parameters,
output_file,
indent=4
)
    # Run as many jobs as there are seeds
with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i],
parameters, experiment_id, hyperparameters) for i in range(len(seeds)))