from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory

from tqdm import tqdm
from joblib import Parallel, delayed
import numpy as np

import argparse
import copy
import json
import os
import pathlib
import random
import shutil
import threading
    
    def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
    
        """
        Experiment function.
    
        Will be used as base function for worker in multithreaded application.
    
        :param seed:
        :param parameters:
        :param experiment_id:
        :return:
        """
    
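    # One log file per (seed, thread id) pair so concurrent workers do not
    # interleave their log output.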
        logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
            seed, threading.get_ident()))
    
        seed_str = str(seed)
        experiment_id_str = str(experiment_id)
    
    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
        os.sep + seed_str
        pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
    
        dataset_parameters = DatasetParameters(
    
            name=parameters['dataset_name'],
            test_size=parameters['test_size'],
            dev_size=parameters['dev_size'],
    
            dataset_normalizer=parameters['dataset_normalizer']
    
        )
        dataset_parameters.save(models_dir, experiment_id_str)
        dataset = DatasetLoader.load(dataset_parameters)
    
        trainer = Trainer(dataset)
    
    
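    # The 'ensemble' strategy selects trees from a pre-generated library of
    # candidate estimators; all other strategies work directly on the forest.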
        if parameters['extraction_strategy'] == 'ensemble':
            library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed)
        else:
            library = None
    
    
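    # For the 'random' strategy, the full forest is fitted once here and later
    # deep-copied for each extracted forest size (see extracted_forest_size_job),
    # which avoids refitting the same forest repeatedly.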
        if parameters['extraction_strategy'] == 'random':
            pretrained_model_parameters = ModelParameters(
                extracted_forest_size=parameters['forest_size'],
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
                seed=seed,
                hyperparameters=hyperparameters,
                extraction_strategy=parameters['extraction_strategy']
            )
            pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library)
    
        pretrained_trainer = Trainer(dataset)
        pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
        pretrained_estimator.fit(
            X=pretrained_trainer._X_forest,
            y=pretrained_trainer._y_forest
        )
    
        else:
            pretrained_estimator = None
            pretrained_model_parameters = None
    
    
        if parameters['extraction_strategy'] != 'none':
    
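        # One joblib job per extracted forest size; tqdm_joblib patches joblib so
        # the shared progress bar advances as each job completes.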
            with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
                Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
    
                    models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library,
                    pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters)
    
                    for i in range(len(parameters['extracted_forest_size'])))
    
        else:
            forest_size = hyperparameters['n_estimators']
            logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
    
            sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
    
            # Check if the result file already exists
            already_exists = False
            if os.path.isdir(sub_models_dir):
                sub_models_dir_files = os.listdir(sub_models_dir)
                for file_name in sub_models_dir_files:
    
                    if file_name == 'model_raw_results.pickle':
    
                        already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
                        break
    
            if already_exists:
                logger.info('Base forest result already exists. Skipping...')
            else:
                pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

            model_parameters = ModelParameters(
                extracted_forest_size=forest_size,
                normalize_D=parameters['normalize_D'],
                    subsets_used=parameters['subsets_used'],
                    normalize_weights=parameters['normalize_weights'],
                    seed=seed,
                    hyperparameters=hyperparameters,
                    extraction_strategy=parameters['extraction_strategy']
                )
                model_parameters.save(sub_models_dir, experiment_id)
    
    
                model = ModelFactory.build(dataset.task, model_parameters, library=library)
    
                trainer.init(model, subsets_used=parameters['subsets_used'])
    
                trainer.train(model)
                trainer.compute_results(model, sub_models_dir)
    
        logger.info(f'Training done for seed {seed_str}')
        seed_job_pb.update(1)
    
    def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
    
        seed, parameters, dataset, hyperparameters, experiment_id, trainer, library,
        pretrained_estimator=None, pretrained_model_parameters=None):
    
    
        logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
            seed, extracted_forest_size, threading.get_ident()))
        logger.info('extracted_forest_size={}'.format(extracted_forest_size))
    
        sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
    
        # Check if the result file already exists
        already_exists = False
        if os.path.isdir(sub_models_dir):
            sub_models_dir_files = os.listdir(sub_models_dir)
            for file_name in sub_models_dir_files:
    
                if file_name == 'model_raw_results.pickle':
    
                    already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
                    break
    
        if already_exists:
            logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
            return
    
        pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
    
    
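    # Either build a fresh model for this extracted size, or reuse the forest
    # pretrained once per seed (random strategy).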
        if not pretrained_estimator:
            model_parameters = ModelParameters(
                extracted_forest_size=extracted_forest_size,
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
                seed=seed,
                hyperparameters=hyperparameters,
                extraction_strategy=parameters['extraction_strategy']
            )
            model_parameters.save(sub_models_dir, experiment_id)
            model = ModelFactory.build(dataset.task, model_parameters, library=library)
        else:
    
            model = copy.deepcopy(pretrained_estimator)
    
            pretrained_model_parameters.save(sub_models_dir, experiment_id)
    
    
        trainer.init(model, subsets_used=parameters['subsets_used'])
    
        trainer.train(model, extracted_forest_size=extracted_forest_size)
    
        trainer.compute_results(model, sub_models_dir)
    
"""
Command lines example for stage 1:
    
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05
    
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --extracted_forest_size_stop=0.05
    python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing
    
    Command lines example for stage 2:
    
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05
    
    python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing
    
    
    Command lines example for stage 3:
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
    
    python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing
    
    
    Command lines example for stage 4:
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 4 random_with_params --extracted_forest_size_stop=0.05
    python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 4 omp_with_params --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/compute_results.py --stage 4 --experiment_ids 1 2 3 --dataset_name=california_housing
"""
    
    if __name__ == "__main__":
    
        DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
    
        # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_sizes/{extracted_forest_size}
    
        DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
    
    DEFAULT_SKIP_BEST_HYPERPARAMS = False
    DEFAULT_JOB_NUMBER = -1
    DEFAULT_VERBOSE = False
    DEFAULT_OVERWRITE = False
    DEFAULT_EXTRACTION_STRATEGY = 'omp'  # strategy used when --extraction_strategy is omitted
    
        begin_random_seed_range = 1
        end_random_seed_range = 2000
    
        parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    
    parser.add_argument('--experiment_id', nargs='?', type=int, default=None, help='Specify an experiment id. Combined with --overwrite, any already existing model with this experiment id is removed.')
    
    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overrides all other parameters.')
        parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
    
        parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the predictions of the forest by dividing each prediction vector by its L2 norm.')
    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
    
        parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
        parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
    
        parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
    
        parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
    
        parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
        parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
        parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
    
    parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly.')
    
    parser.add_argument('--subsets_used', nargs='?', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train, OMP on train+dev.')
    
        parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
    
        parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
    
    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if they exist.')
    parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line to a file. Args: {stage_num} {name}')
        parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
    
        parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity_similarities, similarity_predictions, kmeans, ensemble.')
    
    parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the models of an already existing experiment id.')
    
        args = parser.parse_args()
    
    
        if args.experiment_configuration:
            with open(args.experiment_configuration_path + os.sep + \
                args.experiment_configuration + '.json', 'r') as input_file:
                parameters = json.load(input_file)
        else:
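        # args.__dict__ exposes the parsed arguments as a plain dict, mirroring the
        # structure of a loaded JSON configuration file.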
            parameters = args.__dict__
    
    
    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
    
    
        pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
    
        logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
    
    hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
    hyperparameters = {}
    if os.path.exists(hyperparameters_path):
        logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
        with open(hyperparameters_path, 'r') as file_hyperparameter:
            loaded_hyperparameters = json.load(file_hyperparameter)['best_parameters']
            if args.skip_best_hyperparams:
                hyperparameters = {'n_estimators': loaded_hyperparameters['n_estimators']}
            else:
                hyperparameters = loaded_hyperparameters

        """
        First case: no best hyperparameters are specified and no forest_size parameter
        specified in argument, so use the DEFAULT_FOREST_SIZE.
        Second case: no matter if hyperparameters are specified, the forest_size parameter
        will override it.
        Third implicit case: use the number of estimators found in specified hyperparameters.
        """
        if len(hyperparameters) == 0 and parameters['forest_size'] is None:
            hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
        elif parameters['forest_size'] is not None:
            hyperparameters['n_estimators'] = parameters['forest_size']
    
    
    # The number of trees to extract from the forest (K)
    parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
        np.linspace(0, parameters['extracted_forest_size_stop'],
        parameters['extracted_forest_size_samples'] + 1,
        endpoint=True)[1:]).astype(int)).tolist()
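    # For example, with n_estimators=100, extracted_forest_size_stop=0.05 and
    # extracted_forest_size_samples=5, the linspace yields
    # [0.01, 0.02, 0.03, 0.04, 0.05] (the leading 0 is dropped), giving
    # extracted forest sizes [1, 2, 3, 4, 5].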
    
        logger.info(f"extracted forest sizes: {parameters['extracted_forest_size']}")
    
    
    if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
    
    
        # Seeds are either provided as parameters or generated at random
    
    seeds = parameters['seeds'] if parameters['seeds'] is not None \
        else [random.randint(begin_random_seed_range, end_random_seed_range)
              for _ in range(parameters['random_seed_number'])]
    
        if args.experiment_id:
            experiment_id = args.experiment_id
    
            if args.overwrite:
                shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
    
        else:
            # Resolve the next experiment id number (last id + 1)
            experiment_id = resolve_experiment_id(parameters['models_dir'])
    
        logger.info('Experiment id: {}'.format(experiment_id))
    
    
    """
    If the experiment configuration isn't coming from an already existing file,
    save it to a JSON file to keep a trace of it (either at a specified path,
    or in the 'unnamed' directory).
    """
        if args.experiment_configuration is None:
    
            if args.save_experiment_configuration:
                if len(args.save_experiment_configuration) != 2:
                    raise ValueError('save_experiment_configuration must have two parameters.')
    
                elif int(args.save_experiment_configuration[0]) not in list(range(1, 6)):
                    raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 5]).')
    
                output_experiment_stage_path = os.path.join(args.experiment_configuration_path,
                    args.dataset_name, 'stage' + args.save_experiment_configuration[0])
                pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True)
                output_experiment_configuration_path = os.path.join(output_experiment_stage_path,
    
                    args.save_experiment_configuration[1] + '.json')
    
            else:
                pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
                output_experiment_configuration_path = os.path.join(
                    args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format(
                    experiment_id))
        with open(output_experiment_configuration_path, 'w') as output_file:
            json.dump(parameters, output_file)

        with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb:
            Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i],
                parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))