from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
import argparse
import copy
import json
import os
import pathlib
import random
import shutil
import threading

import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm


def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
    """
    Experiment function.

    Will be used as base function for worker in multithreaded application.

    :param seed:
    :param parameters:
    :param experiment_id:
    :return:
    """
    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
        seed, threading.get_ident()))
    seed_str = str(seed)
    experiment_id_str = str(experiment_id)
    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
        os.sep + seed_str
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)

    dataset_parameters = DatasetParameters(
        name=parameters['dataset_name'],
        test_size=parameters['test_size'],
        dev_size=parameters['dev_size'],
        dataset_normalizer=parameters['dataset_normalizer']
    )
    dataset_parameters.save(models_dir, experiment_id_str)
    dataset = DatasetLoader.load(dataset_parameters)

    trainer = Trainer(dataset)

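    # For the 'random' strategy, a full-size forest is pre-trained once here and
    # shared by every extracted_forest_size job below (each job works on a deepcopy).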
    if parameters['extraction_strategy'] == 'random':
        pretrained_model_parameters = ModelParameters(
            extracted_forest_size=parameters['forest_size'],
            normalize_D=parameters['normalize_D'],
            subsets_used=parameters['subsets_used'],
            normalize_weights=parameters['normalize_weights'],
            seed=seed,
            hyperparameters=hyperparameters,
            extraction_strategy=parameters['extraction_strategy']
        )
        pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters)
        pretrained_trainer = Trainer(dataset)
        pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
        pretrained_estimator.fit(
            X=pretrained_trainer._X_forest,
            y=pretrained_trainer._y_forest
        )
    else:
        pretrained_estimator = None
        pretrained_model_parameters = None

    if parameters['extraction_strategy'] != 'none':
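        # One joblib job per extracted forest size; tqdm_joblib hooks the shared
        # progress bar into joblib so it advances as jobs complete.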
        with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
            Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer,
                pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters,
                use_distillation=parameters['extraction_strategy'] == 'omp_distillation')
                for i in range(len(parameters['extracted_forest_size'])))
    else:
        forest_size = hyperparameters['n_estimators']
        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
        # Check if the result file already exists
        already_exists = False
        if os.path.isdir(sub_models_dir):
            sub_models_dir_files = os.listdir(sub_models_dir)
            for file_name in sub_models_dir_files:
                if file_name == 'model_raw_results.pickle':
                    already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
                    break
        if already_exists:
            logger.info('Base forest result already exists. Skipping...')
        else:
            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

            model_parameters = ModelParameters(
                extracted_forest_size=forest_size,
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
                seed=seed,
                hyperparameters=hyperparameters,
                extraction_strategy=parameters['extraction_strategy']
            )
            model_parameters.save(sub_models_dir, experiment_id)

            model = ModelFactory.build(dataset.task, model_parameters)
            trainer.init(model, subsets_used=parameters['subsets_used'])
            trainer.train(model)
            trainer.compute_results(model, sub_models_dir)
    logger.info(f'Training done for seed {seed_str}')
    seed_job_pb.update(1)

def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
    seed, parameters, dataset, hyperparameters, experiment_id, trainer,
    pretrained_estimator=None, pretrained_model_parameters=None, use_distillation=False):
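    """
    Train and evaluate one model for a single (seed, extracted_forest_size) pair,
    skipping the run if its result file already exists.
    """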

    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
        seed, extracted_forest_size, threading.get_ident()))
    logger.info('extracted_forest_size={}'.format(extracted_forest_size))

    sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)

    # Check if the result file already exists
    already_exists = False
    if os.path.isdir(sub_models_dir):
        sub_models_dir_files = os.listdir(sub_models_dir)
        for file_name in sub_models_dir_files:
            if file_name == 'model_raw_results.pickle':
                already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
                break
    if already_exists:
        logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
        return

    pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

    if not pretrained_estimator:
        model_parameters = ModelParameters(
            extracted_forest_size=extracted_forest_size,
            normalize_D=parameters['normalize_D'],
            subsets_used=parameters['subsets_used'],
            normalize_weights=parameters['normalize_weights'],
            seed=seed,
            hyperparameters=hyperparameters,
            extraction_strategy=parameters['extraction_strategy']
        )
        model_parameters.save(sub_models_dir, experiment_id)
        model = ModelFactory.build(dataset.task, model_parameters)
    else:
        model = copy.deepcopy(pretrained_estimator)
        pretrained_model_parameters.save(sub_models_dir, experiment_id)

    trainer.init(model, subsets_used=parameters['subsets_used'])
    trainer.train(model, extracted_forest_size=extracted_forest_size, seed=seed,
        use_distillation=use_distillation)
    trainer.compute_results(model, sub_models_dir)

"""
Command line examples for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing
Command line examples for stage 2:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing

Command line examples for stage 3:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing

Command line examples for stage 4:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 4 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 4 omp_with_params --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/compute_results.py --stage 4 --experiment_ids 1 2 3 --dataset_name=california_housing
"""

if __name__ == "__main__":
    DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
    # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_sizes/{extracted_forest_size}
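    # Assumes the 'project_dir' environment variable points at the project root
    # (os.environ raises a KeyError otherwise).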
    DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
    DEFAULT_SKIP_BEST_HYPERPARAMS = False
    DEFAULT_JOB_NUMBER = -1
    DEFAULT_VERBOSE = False
    DEFAULT_OVERWRITE = False
    DEFAULT_EXTRACTION_STRATEGY = 'omp'  # assumed default, consistent with the stage-1 omp_with_params runs
    begin_random_seed_range = 1
    end_random_seed_range = 2000

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--experiment_id', nargs='?', type=int, default=None, help='Specify an experiment id. Combined with --overwrite, any already existing model with this id is removed.')
    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
    parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
    parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Normalize the forest predictions by dividing each prediction vector by its L2 norm.')
    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
    parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
    parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
    parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
    parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
    parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
    parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly.')
    parser.add_argument('--subsets_used', nargs='?', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train, OMP on train+dev.')
    parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if they exist.')
    parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity_similarities, similarity_predictions, kmeans, ensemble.')
    parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
    args = parser.parse_args()

    if args.experiment_configuration:
        with open(args.experiment_configuration_path + os.sep + \
            args.experiment_configuration + '.json', 'r') as input_file:
            parameters = json.load(input_file)
    else:
        parameters = args.__dict__

    if parameters['extraction_strategy'] not in ['omp', 'omp_distillation', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
    hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
    if os.path.exists(hyperparameters_path):
        logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
        with open(hyperparameters_path, 'r') as file_hyperparameter:
            loaded_hyperparameters = json.load(file_hyperparameter)['best_parameters']
            if args.skip_best_hyperparams:
                hyperparameters = {'n_estimators': loaded_hyperparameters['n_estimators']}
            else:
                hyperparameters = loaded_hyperparameters
    else:
        logger.info("No hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
        hyperparameters = {}
    """
    First case: no best hyperparameters are specified and no forest_size parameter
    specified in argument, so use the DEFAULT_FOREST_SIZE.
    Second case: no matter if hyperparameters are specified, the forest_size parameter
    will override it.
    Third implicit case: use the number of estimators found in specified hyperparameters.
    """
    if len(hyperparameters) == 0 and parameters['forest_size'] is None:
        hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
    elif parameters['forest_size'] is not None:
        hyperparameters['n_estimators'] = parameters['forest_size']

    # The number of tree to extract from forest (K)
    parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
        np.linspace(0, args.extracted_forest_size_stop,
        parameters['extracted_forest_size_samples'] + 1,
        endpoint=True)[1:]).astype(int)).tolist()
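    # e.g. with n_estimators=1000, extracted_forest_size_stop=0.05 and
    # extracted_forest_size_samples=5, this yields [10, 20, 30, 40, 50].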
    logger.info(f"extracted forest sizes: {parameters['extracted_forest_size']}")

    if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')

    # Seeds are either provided as parameters or generated at random
    seeds = parameters['seeds'] if parameters['seeds'] is not None \
        else [random.randint(begin_random_seed_range, end_random_seed_range) \
        for i in range(parameters['random_seed_number'])]
    if args.experiment_id:
        experiment_id = args.experiment_id
        if args.overwrite:
            shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
    else:
        # Resolve the next experiment id number (last id + 1)
        experiment_id = resolve_experiment_id(parameters['models_dir'])
    logger.info('Experiment id: {}'.format(experiment_id))

    """
    If the experiment configuration isn't coming from an already existing file,
    save it to a json file to keep a trace of it (either at the specified path,
    or in the 'unnamed' dir.).
    """
    if args.experiment_configuration is None:
        if args.save_experiment_configuration:
            if len(args.save_experiment_configuration) != 2:
                raise ValueError('save_experiment_configuration must have two parameters.')
            elif int(args.save_experiment_configuration[0]) not in list(range(1, 6)):
                raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 5]).')
            output_experiment_stage_path = os.path.join(args.experiment_configuration_path,
                args.dataset_name, 'stage' + args.save_experiment_configuration[0])
            pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True)
            output_experiment_configuration_path = os.path.join(output_experiment_stage_path,
                args.save_experiment_configuration[1] + '.json')
        else:
            pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
            output_experiment_configuration_path = os.path.join(
                args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format(
                experiment_id))
        with open(output_experiment_configuration_path, 'w') as output_file:
            json.dump(parameters, output_file, indent=4)
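
    # One seed_job per seed, parallelised across --job_number joblib workers.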
    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb:
        Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i],
            parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))