diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py index 6f8599896aad3c2ca681bc1d5aca91fd89883708..892104711701cd430962a3832566da6e62b83a38 100644 --- a/code/bolsonaro/data/dataset_loader.py +++ b/code/bolsonaro/data/dataset_loader.py @@ -19,8 +19,8 @@ class DatasetLoader(object): DEFAULT_NORMALIZE_D = False DEFAULT_DATASET_NORMALIZER = 'standard' DEFAULT_FOREST_SIZE = 100 - DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 10 - DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.3 + DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 5 + DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.1 DEFAULT_DEV_SIZE = 0.2 DEFAULT_TEST_SIZE = 0.2 DEFAULT_RANDOM_SEED_NUMBER = 1 diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py index ea51ad320918bb714930f11864712d004803e650..262d2560054ba4177852d883cafd48eaccbe475d 100644 --- a/code/bolsonaro/models/model_factory.py +++ b/code/bolsonaro/models/model_factory.py @@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.data.task import Task from bolsonaro.models.model_parameters import ModelParameters +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier import os import pickle @@ -11,12 +12,33 @@ class ModelFactory(object): @staticmethod def build(task, model_parameters): + if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]: + raise ValueError("Unsupported task '{}'".format(task)) + if task == Task.BINARYCLASSIFICATION: - model_func = OmpForestBinaryClassifier + if model_parameters.extraction_strategy == 'omp': + return OmpForestBinaryClassifier(model_parameters) + elif model_parameters.extraction_strategy == 'random': + return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, + random_state=model_parameters.seed) + else: + return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], + random_state=model_parameters.seed) elif task == 
Task.REGRESSION: - model_func = OmpForestRegressor + if model_parameters.extraction_strategy == 'omp': + return OmpForestRegressor(model_parameters) + elif model_parameters.extraction_strategy == 'random': + return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size, + random_state=model_parameters.seed) + else: + return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], + random_state=model_parameters.seed) elif task == Task.MULTICLASSIFICATION: - model_func = OmpForestMulticlassClassifier - else: - raise ValueError("Unsupported task '{}'".format(task)) - return model_func(model_parameters) + if model_parameters.extraction_strategy == 'omp': + return OmpForestMulticlassClassifier(model_parameters) + elif model_parameters.extraction_strategy == 'random': + return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, + random_state=model_parameters.seed) + else: + return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], + random_state=model_parameters.seed) diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py index 31a451b70578835fe4663508de9e15f99bf6cc19..a3286edfccce7fddf4a4174b6cffddd5cf78717d 100644 --- a/code/bolsonaro/models/model_parameters.py +++ b/code/bolsonaro/models/model_parameters.py @@ -5,13 +5,15 @@ import os class ModelParameters(object): - def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters): + def __init__(self, extracted_forest_size, normalize_D, subsets_used, + normalize_weights, seed, hyperparameters, extraction_strategy): self._extracted_forest_size = extracted_forest_size self._normalize_D = normalize_D self._subsets_used = subsets_used self._normalize_weights = normalize_weights self._seed = seed self._hyperparameters = hyperparameters + self._extraction_strategy = extraction_strategy @property def extracted_forest_size(self): 
@@ -37,6 +39,10 @@ class ModelParameters(object): def hyperparameters(self): return self._hyperparameters + @property + def extraction_strategy(self): + return self._extraction_strategy + def save(self, directory_path, experiment_id): save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), self.__dict__) diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py index 2da0beab64ef5361efbde6d6197f957fe627886c..ea6bf9108712043ff964ca39c1ec7728ddd26f20 100644 --- a/code/bolsonaro/models/omp_forest.py +++ b/code/bolsonaro/models/omp_forest.py @@ -30,7 +30,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): # sklearn baseestimator api methods def fit(self, X_forest, y_forest, X_omp, y_omp): self._base_forest_estimator.fit(X_forest, y_forest) - self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit + self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit return self def _extract_subforest(self, X, y): diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 4a32bffb8c7a2e129bf6f010c8a2c3a339a53a4b..549df6e7aceb93706b8d5902f994501aabdde015 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -2,6 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults from bolsonaro.error_handling.logger_factory import LoggerFactory from . 
import LOG_PATH +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier import time import datetime import numpy as np @@ -21,7 +22,11 @@ class Trainer(object): self._logger = LoggerFactory.create(LOG_PATH, __name__) def init(self, model): - if model.models_parameters.subsets_used == 'train,dev': + if type(model) in [RandomForestRegressor, RandomForestClassifier]: + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._logger.debug('Fitting the forest on train subset') + elif model.models_parameters.subsets_used == 'train,dev': self._X_forest = self._dataset.X_train self._y_forest = self._dataset.y_train self._X_omp = self._dataset.X_dev @@ -46,15 +51,21 @@ class Trainer(object): :param model: Object with :return: """ - + self._logger.debug('Training model using train set...') self._begin_time = time.time() - model.fit( - X_forest=self._X_forest, - y_forest=self._y_forest, - X_omp=self._X_omp, - y_omp=self._y_omp - ) + if type(model) in [RandomForestRegressor, RandomForestClassifier]: + model.fit( + X=self._X_forest, + y=self._y_forest + ) + else: + model.fit( + X_forest=self._X_forest, + y_forest=self._y_forest, + X_omp=self._X_omp, + y_omp=self._y_omp + ) self._end_time = time.time() def compute_results(self, model, models_dir): @@ -62,6 +73,8 @@ class Trainer(object): :param model: Object with :param models_dir: Where the results will be saved """ + score_func = model.score if type(model) in [RandomForestRegressor, RandomForestClassifier] \ + else model.score_base_estimator results = ModelRawResults( model_object=model, training_time=self._end_time - self._begin_time, @@ -69,10 +82,11 @@ class Trainer(object): train_score=model.score(self._dataset.X_train, self._dataset.y_train), dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), test_score=model.score(self._dataset.X_test, self._dataset.y_test), - score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way - 
train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train), - dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev), - test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test) + score_metric='mse' if type(model) in [RandomForestRegressor, RandomForestClassifier] \ + else model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way + train_score_regressor=score_func(self._dataset.X_train, self._dataset.y_train), + dev_score_regressor=score_func(self._dataset.X_dev, self._dataset.y_dev), + test_score_regressor=score_func(self._dataset.X_test, self._dataset.y_test) ) results.save(models_dir) self._logger.info("Base performance on test: {}".format(results.test_score_regressor)) diff --git a/code/compute_results.py b/code/compute_results.py index 3ff7a6648d88a90239c714b231f74f020d6a82e3..f0e8cb3e60985f73ae67f37cc1f4797210c11a81 100644 --- a/code/compute_results.py +++ b/code/compute_results.py @@ -18,19 +18,29 @@ if __name__ == "__main__": DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--stage_number', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].') - parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).') + parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].') + parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' 
+ \ + 'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}') parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') args = parser.parse_args() - if args.stage_number not in list(range(1, 5)): - raise ValueError('stage_number must be a supported stage id (i.e. [1, 4]).') + if args.stage not in list(range(1, 5)): + raise ValueError('stage must be a supported stage id (i.e. [1, 4]).') # Create recursively the results dir tree pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True) - if args.stage_number == 1: + if args.stage == 1: + # First axis: + # base_with_params + # random_with_params + # omp_with_params + + # Second axis: + # base_wo_params + # random_wo_params + # omp_wo_params for experiment_id in args.experiment_ids: experiment_id_path = args.models_dir + os.sep + str(experiment_id) # models/{experiment_id} # Create recursively the tree results/{experiment_id} @@ -50,7 +60,9 @@ if __name__ == "__main__": experiment_score_metrics = list() # For each seed results stored in models/{experiment_id}/seeds - for seed in os.listdir(experiment_seed_root_path): + seeds = os.listdir(experiment_seed_root_path) + seeds.sort(key=int) + for seed in seeds: experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed} dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters diff --git a/code/train.py b/code/train.py index 81d430e9e98127ce87bf7a52c38c4e9e1c0882a1..815eac082fca28f2aa2a3fac775a822ac84c5717 100644 ---
a/code/train.py +++ b/code/train.py @@ -53,19 +53,42 @@ def process_job(seed, parameters, experiment_id, hyperparameters): trainer = Trainer(dataset) - for extracted_forest_size in parameters['extracted_forest_size']: - # question if training is too long, one may also split experiments for different forest sizes into different workers - logger.info('extracted_forest_size={}'.format(extracted_forest_size)) - sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) + if parameters['extraction_strategy'] != 'none': + for extracted_forest_size in parameters['extracted_forest_size']: + logger.info('extracted_forest_size={}'.format(extracted_forest_size)) + sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) + pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) + + model_parameters = ModelParameters( + extracted_forest_size=extracted_forest_size, + normalize_D=parameters['normalize_D'], + subsets_used=parameters['subsets_used'], + normalize_weights=parameters['normalize_weights'], + seed=seed, + hyperparameters=hyperparameters, + extraction_strategy=parameters['extraction_strategy'] + ) + model_parameters.save(sub_models_dir, experiment_id) + + model = ModelFactory.build(dataset.task, model_parameters) + + trainer.init(model) + trainer.train(model) + trainer.compute_results(model, sub_models_dir) + else: + forest_size = hyperparameters['n_estimators'] + logger.info('Base forest training with fixed forest size of {}'.format(forest_size)) + sub_models_dir = models_dir + os.sep + str(forest_size) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) model_parameters = ModelParameters( - extracted_forest_size=extracted_forest_size, + extracted_forest_size=forest_size, normalize_D=parameters['normalize_D'], subsets_used=parameters['subsets_used'], normalize_weights=parameters['normalize_weights'], seed=seed, - hyperparameters=hyperparameters + 
hyperparameters=hyperparameters, + extraction_strategy=parameters['extraction_strategy'] ) model_parameters.save(sub_models_dir, experiment_id) @@ -76,7 +99,16 @@ def process_job(seed, parameters, experiment_id, hyperparameters): trainer.compute_results(model, sub_models_dir) logger.info('Training done') - +""" +Example for stage 1: +python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --save_experiment_configuration 1 none_with_params +python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --save_experiment_configuration 1 random_with_params +python code/train.py --dataset_name=california_housing --seeds 1 2 3 --save_experiment_configuration 1 omp_with_params +python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params +python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params +python code/train.py --dataset_name=california_housing --seeds 1 2 3 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params +python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 +""" if __name__ == "__main__": load_dotenv(find_dotenv('.env')) DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments' @@ -85,6 +117,7 @@ if __name__ == "__main__": DEFAULT_VERBOSE = False DEFAULT_SKIP_BEST_HYPERPARAMS = False DEFAULT_JOB_NUMBER = -1 + DEFAULT_EXTRACTION_STRATEGY = 'omp' begin_random_seed_range = 1 end_random_seed_range = 2000 @@ -109,6 +142,7 @@ if __name__ == "__main__": parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters
specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') + parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random or none.') args = parser.parse_args() if args.experiment_configuration: @@ -118,6 +152,9 @@ if __name__ == "__main__": else: parameters = args.__dict__ + if parameters['extraction_strategy'] not in ['omp', 'random', 'none']: + raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy'])) + pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))