Skip to content
Snippets Groups Projects
train.py 8.02 KiB
Newer Older
  • Learn to ignore specific revisions
  • from bolsonaro.data.dataset_parameters import DatasetParameters
    from bolsonaro.data.dataset_loader import DatasetLoader
    from bolsonaro.models.model_factory import ModelFactory
    
    from bolsonaro.models.model_parameters import ModelParameters
    
    from bolsonaro.trainer import Trainer
    
    from bolsonaro.utils import resolve_experiment_id
    
    from bolsonaro import LOG_PATH
    from bolsonaro.error_handling.logger_factory import LoggerFactory
    
    import argparse
    import pathlib
    import random
    
    from concurrent import futures
    import threading
    
    def process_job(seed, parameters, experiment_id):
    
        """
        Experiment function.
    
        Will be used as base function for worker in multithreaded application.
    
        :param seed:
        :param parameters:
        :param experiment_id:
        :return:
        """
    
        logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
            seed, threading.get_ident()))
        logger.info('seed={}'.format(seed))
    
        seed_str = str(seed)
        experiment_id_str = str(experiment_id)
    
        models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
    
            os.sep + seed_str
        pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
    
        dataset_parameters = DatasetParameters(
    
            name=parameters['dataset_name'],
            test_size=parameters['test_size'],
            dev_size=parameters['dev_size'],
    
            dataset_normalizer=parameters['dataset_normalizer']
    
        )
        dataset_parameters.save(models_dir, experiment_id_str)
        dataset = DatasetLoader.load(dataset_parameters)
    
        trainer = Trainer(dataset)
    
    
        for extracted_forest_size in parameters['extracted_forest_size']:
    
            # question if training is too long, one may also split experiments for different forest sizes into different workers
    
            logger.info('extracted_forest_size={}'.format(extracted_forest_size))
            sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
    
            model_parameters = ModelParameters(
    
                forest_size=parameters['forest_size'],
    
                extracted_forest_size=extracted_forest_size,
    
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
    
                seed=seed
            )
            model_parameters.save(sub_models_dir, experiment_id)
    
            model = ModelFactory.build(dataset.task, model_parameters)
    
            trainer.train(model, sub_models_dir)
        logger.info('Training done')
    
    
    if __name__ == "__main__":
    
        DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
    
        DEFAULT_DATASET_NAME = 'boston'
        DEFAULT_NORMALIZE_D = False
        DEFAULT_DATASET_NORMALIZER = None
        DEFAULT_FOREST_SIZE = 100
        DEFAULT_EXTRACTED_FOREST_SIZE = 10
    
        # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
    
        DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
        DEFAULT_DEV_SIZE = 0.2
        DEFAULT_TEST_SIZE = 0.2
        DEFAULT_RANDOM_SEED_NUMBER = 1
    
        begin_random_seed_range = 1
        end_random_seed_range = 2000
    
        parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    
        parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
        parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
    
        parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
        parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
        parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
        parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.')
        parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.')
        parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
        parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.')
        parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
        parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
    
        parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
    
        parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
    
        parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
    
        args = parser.parse_args()
    
    
        if args.experiment_configuration:
            with open(args.experiment_configuration_path + os.sep + \
                args.experiment_configuration + '.json', 'r') as input_file:
                parameters = json.load(input_file)
        else:
            parameters = args.__dict__
    
        pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
    
        logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
    
        # The number of tree to extract from forest (K)
    
        parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
            if type(parameters['extracted_forest_size']) == list \
            else [parameters['extracted_forest_size']]
    
        if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
    
            logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')    
    
    
        # Seeds are either provided as parameters or generated at random
    
        seeds = parameters['seeds'] if parameters['seeds'] is not None \
    
            else [random.randint(begin_random_seed_range, end_random_seed_range) \
    
            for i in range(parameters['random_seed_number'])]
    
        # Resolve the next experiment id number (last id + 1)
        experiment_id = resolve_experiment_id(parameters['models_dir'])
    
        logger.info('Experiment id: {}'.format(experiment_id))
    
    
        If the experiment configuration isn't coming from
    
        an already existing file, save it to a json file to
        keep trace of it.
        """
        if args.experiment_configuration is None:
            with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format(
                experiment_id), 'w') as output_file:
                json.dump(
                    parameters,
                    output_file,
                    indent=4
                )
    
        # Train as much job as there are seeds
    
        with futures.ProcessPoolExecutor(len(seeds)) as executor:
            list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
    
                parameters, experiment_id) for seed in seeds))