from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory

from dotenv import find_dotenv, load_dotenv
from joblib import Parallel, delayed
from tqdm import tqdm

import argparse
import json
import numpy as np
import os
import pathlib
import random
import threading


def process_job(seed, parameters, experiment_id, hyperparameters):
    """
    Experiment function.

    Used as the base function of each worker in the parallel run across seeds.

    :param seed: random seed of this run
    :param parameters: experiment parameters (command-line arguments or a loaded configuration file)
    :param experiment_id: id of the current experiment
    :param hyperparameters: hyperparameters of the base forest
    :return:
    """
    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
        seed, threading.get_ident()))
    logger.info('seed={}'.format(seed))

    seed_str = str(seed)
    experiment_id_str = str(experiment_id)
    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
        os.sep + seed_str
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)

    dataset_parameters = DatasetParameters(
        name=parameters['dataset_name'],
        test_size=parameters['test_size'],
        dev_size=parameters['dev_size'],
        random_state=seed,
        dataset_normalizer=parameters['dataset_normalizer']
    )
    dataset_parameters.save(models_dir, experiment_id_str)
    dataset = DatasetLoader.load(dataset_parameters)

    trainer = Trainer(dataset)

    if parameters['extraction_strategy'] != 'none':
        # Train and evaluate one extracted model per requested extracted forest size
        for extracted_forest_size in parameters['extracted_forest_size']:
            logger.info('extracted_forest_size={}'.format(extracted_forest_size))
            sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + \
                os.sep + str(extracted_forest_size)
            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

            model_parameters = ModelParameters(
                extracted_forest_size=extracted_forest_size,
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
                seed=seed,
                hyperparameters=hyperparameters,
                extraction_strategy=parameters['extraction_strategy']
            )
            model_parameters.save(sub_models_dir, experiment_id)

            model = ModelFactory.build(dataset.task, model_parameters)

            trainer.init(model)
            trainer.train(model)
            trainer.compute_results(model, sub_models_dir)
    else:
        # No extraction: train a single base forest with a fixed number of trees
        forest_size = hyperparameters['n_estimators']
        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
        pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

        model_parameters = ModelParameters(
            extracted_forest_size=forest_size,
            normalize_D=parameters['normalize_D'],
            subsets_used=parameters['subsets_used'],
            normalize_weights=parameters['normalize_weights'],
            seed=seed,
            hyperparameters=hyperparameters,
            extraction_strategy=parameters['extraction_strategy']
        )
        model_parameters.save(sub_models_dir, experiment_id)

        model = ModelFactory.build(dataset.task, model_parameters)

        trainer.init(model)
        trainer.train(model)
        trainer.compute_results(model, sub_models_dir)
    logger.info('Training done')
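
# For illustration (hypothetical values): with models_dir='models',
# experiment_id=1 and seed=2, process_job saves the dataset parameters under
# models/1/seeds/2/ and, for extracted_forest_size=10, the extracted model and
# its results under models/1/seeds/2/extracted_forest_sizes/10/.
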
"""
Example for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --save_experiment_configuration 1 none_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --save_experiment_configuration 1 random_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --save_experiment_configuration 1 omp_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6
"""
if __name__ == "__main__":
    load_dotenv(find_dotenv('.env'))

    DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
    # The models are stored in a directory structure like:
    # models/{experiment_id}/seeds/{seed_nb}/extracted_forest_sizes/{extracted_forest_size}
    # project_dir is read from the .env file loaded above.
    DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
    DEFAULT_VERBOSE = False
    DEFAULT_SKIP_BEST_HYPERPARAMS = False
    DEFAULT_JOB_NUMBER = -1
    DEFAULT_EXTRACTION_STRATEGY = 'omp'

    begin_random_seed_range = 1
    end_random_seed_range = 2000

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None,
        help='Specify an experiment configuration file name. Overrides all other parameters.')
    parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH,
        help='Specify the experiment configuration directory path.')
    parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME,
        help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. '
             'Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, '
             '20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D,
        help='Normalize the predictions of the forest by dividing each tree prediction vector by its L2 norm.')
    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER,
        help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
    parser.add_argument('--forest_size', nargs='?', type=int, default=None,
        help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES,
        help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
    parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP,
        help='Specify the upper bound of the extracted forest sizes linspace.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR,
        help='The output directory of the trained models.')
    parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE,
        help='Dev subset ratio.')
    parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE,
        help='Test subset ratio.')
    parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER,
        help='Number of random seeds used.')
    parser.add_argument('--seeds', nargs='+', type=int, default=None,
        help='Specify a list of seeds instead of generating them randomly.')
    parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED,
        help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. '
             'train,train+dev: forest on train, OMP on train+dev.')
    parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS,
        help='Divide the predictions by the sum of the weights.')
    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE,
        help='Print a tqdm progress bar.')
    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS,
        help='Do not use the best hyperparameters if they exist.')
    parser.add_argument('--save_experiment_configuration', nargs='+', default=None,
        help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER,
        help='Specify the number of jobs used for the parallelisation across seeds.')
    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY,
        help='Specify the strategy used to extract the trees from the forest. Either omp, random or none.')
    args = parser.parse_args()

    if args.experiment_configuration:
        with open(args.experiment_configuration_path + os.sep +
                args.experiment_configuration + '.json', 'r') as input_file:
            parameters = json.load(input_file)
    else:
        parameters = args.__dict__
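
    # Note: a configuration file is expected to contain the full argument
    # namespace as JSON, which is exactly what the json.dump call further down
    # produces. Abridged sketch with a hypothetical file name
    # (experiments/my_config.json):
    # {
    #     "dataset_name": "california_housing",
    #     "seeds": [1, 2, 3],
    #     "extraction_strategy": "omp",
    #     ...
    # }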
    if parameters['extraction_strategy'] not in ['omp', 'random', 'none']:
        raise ValueError('Specified extraction strategy {} is not supported.'.format(
            parameters['extraction_strategy']))

    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)

    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))

    hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
    if os.path.exists(hyperparameters_path) and not args.skip_best_hyperparams:
        logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
        with open(hyperparameters_path, 'r') as file_hyperparameter:
            hyperparameters = json.load(file_hyperparameter)['best_parameters']
    else:
        hyperparameters = {}
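
    # Expected shape of params.json: a 'best_parameters' object holding the
    # forest hyperparameters. This script itself only inspects n_estimators;
    # the whole dict is forwarded to ModelParameters. The other key below is
    # illustrative:
    # {"best_parameters": {"n_estimators": 100, "max_depth": 15}}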
    """
    First case: no best hyperparameters are found and no forest_size is
    specified as an argument, so DEFAULT_FOREST_SIZE is used.
    Second case: whether hyperparameters are specified or not, the forest_size
    argument overrides them.
    Third (implicit) case: the number of estimators found in the specified
    hyperparameters is used.
    """
    if len(hyperparameters) == 0 and parameters['forest_size'] is None:
        hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
    elif parameters['forest_size'] is not None:
        hyperparameters['n_estimators'] = parameters['forest_size']

    # The numbers of trees to extract from the forest (K): evenly spaced
    # fractions of n_estimators, excluding 0 and extracted_forest_size_stop.
    parameters['extracted_forest_size'] = (hyperparameters['n_estimators'] *
        np.linspace(0, args.extracted_forest_size_stop,
        parameters['extracted_forest_size_samples'] + 1,
        endpoint=False)[1:]).astype(int).tolist()
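
    # Worked example (hypothetical values): with n_estimators=100,
    # extracted_forest_size_stop=1.0 and extracted_forest_size_samples=5,
    # np.linspace(0, 1.0, 6, endpoint=False)[1:] is [1/6, 2/6, 3/6, 4/6, 5/6],
    # so K takes the values [16, 33, 50, 66, 83] after truncation to int.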
    if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')

    # Seeds are either provided as parameters or generated at random
    seeds = parameters['seeds'] if parameters['seeds'] is not None \
        else [random.randint(begin_random_seed_range, end_random_seed_range)
              for i in range(parameters['random_seed_number'])]

    # Resolve the next experiment id number (last id + 1)
    experiment_id = resolve_experiment_id(parameters['models_dir'])
    logger.info('Experiment id: {}'.format(experiment_id))
    parameters['experiment_id'] = experiment_id

    """
    If the experiment configuration doesn't come from an already existing file,
    save it to a JSON file to keep a trace of it (either at the specified
    stage path or in the 'unnamed' directory).
    """
    if args.experiment_configuration is None:
        if args.save_experiment_configuration:
            if len(args.save_experiment_configuration) != 2:
                raise ValueError('save_experiment_configuration must have two parameters.')
            elif int(args.save_experiment_configuration[0]) not in list(range(1, 5)):
                raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. in [1, 4]).')
            output_experiment_configuration_path = os.path.join(
                args.experiment_configuration_path,
                args.dataset_name,
                'stage' + args.save_experiment_configuration[0],
                args.save_experiment_configuration[1] + '_{}.json'.format(experiment_id))
            # Make sure the stage directory exists before writing to it
            pathlib.Path(os.path.dirname(output_experiment_configuration_path)).mkdir(
                parents=True, exist_ok=True)
        else:
            pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(
                parents=True, exist_ok=True)
            output_experiment_configuration_path = os.path.join(
                args.experiment_configuration_path, 'unnamed',
                'unnamed_{}.json'.format(experiment_id))

        with open(output_experiment_configuration_path, 'w') as output_file:
            json.dump(parameters, output_file, indent=4)

    # Run as many jobs as there are seeds
    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
        Parallel(n_jobs=args.job_number)(delayed(process_job)(seed, parameters,
            experiment_id, hyperparameters) for seed in seeds)
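
    # Note on parallelism: job_number is passed to joblib's Parallel as n_jobs,
    # so the default of -1 spawns one worker per CPU core. Each seed is handled
    # independently by process_job, and tqdm_joblib patches joblib so that
    # completed jobs advance the tqdm progress bar.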