NOTE(review): line breaks, indentation and blank context lines were reconstructed
from a whitespace-collapsed copy of this patch; verify the context lines against
the target files before applying. Index hashes are those of the original patch.

diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index 450d97bc01fe8f02ccd399d2c47a0c7397b4cb13..768d207a323c8a7e33ed2b9b295c03cba27ce18b 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -5,11 +5,11 @@ import os
 
 class ModelParameters(object):
 
-    def __init__(self, forest_size, extracted_forest_size, normalize_D, use_dev_subset, seed=None):
+    def __init__(self, forest_size, extracted_forest_size, normalize_D, subsets_used, seed=None):
         self._forest_size = forest_size
         self._extracted_forest_size = extracted_forest_size
         self._normalize_D = normalize_D
-        self._use_dev_subset = use_dev_subset
+        self._subsets_used = subsets_used
         self._seed = seed
 
     @property
@@ -25,8 +25,8 @@ class ModelParameters(object):
         return self._normalize_D
 
     @property
-    def use_dev_subset(self):
-        return self._use_dev_subset
+    def subsets_used(self):
+        return self._subsets_used
 
     @property
     def seed(self):
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index dcc16e0b81f79bc203a2f547459b009b5ef44d4d..b586914166cf80f274a502d8d44b83f6b6f97484 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -17,20 +17,27 @@ class Trainer(object):
         self._logger.debug('Training model using train set...')
         begin_time = time.time()
 
-        if model.models_parameters.use_dev_subset:
+        # subsets_used has the form '<forest subsets>,<OMP subsets>'; '+' joins subsets.
+        if model.models_parameters.subsets_used == 'train,dev':
             X_forest = self._dataset.X_train
             y_forest = self._dataset.y_train
             X_omp = self._dataset.X_dev
             y_omp = self._dataset.y_dev
             self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
-        else:
+        elif model.models_parameters.subsets_used == 'train+dev,train+dev':
             X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
             X_omp = X_forest
             y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
             y_omp = y_forest
             self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
-
-        # TODO: add an option to train forest to train+dev and OMP to dev
+        elif model.models_parameters.subsets_used == 'train,train+dev':
+            X_forest = self._dataset.X_train
+            y_forest = self._dataset.y_train
+            X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+            y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+            self._logger.debug('Fitting the forest on train subset and OMP on train+dev subsets.')
+        else:
+            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
 
         model.fit(
             X_forest=X_forest,
diff --git a/code/train.py b/code/train.py
index 5783fef4bd32d75f62f5dcf05c9812e82f9c0338..72abcf6796db78d6bc5de806a16417be01f65f7a 100644
--- a/code/train.py
+++ b/code/train.py
@@ -29,7 +29,7 @@ if __name__ == "__main__":
     DEFAULT_DEV_SIZE = 0.2
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
-    DEFAULT_USE_DEV_SUBSET = False
+    DEFAULT_SUBSETS_USED = 'train,dev'
     DEFAULT_DISABLE_PROGRESS = False
 
     begin_random_seed_range = 1
@@ -46,7 +46,7 @@ if __name__ == "__main__":
     parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
     parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
-    parser.add_argument('--use_dev_subset', action='store_true', default=DEFAULT_USE_DEV_SUBSET, help='If specify the forest will be trained on train subset and OMP on dev subset. Otherwise both the forest and OMP will be trained on train+dev subsets.')
+    parser.add_argument('--subsets_used', nargs='?', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train, OMP on train+dev.')
     parser.add_argument('--disable_progress', action='store_true', default=DEFAULT_DISABLE_PROGRESS, help='Disable the progress bars.')
     args = parser.parse_args()
 
@@ -101,7 +101,7 @@ if __name__ == "__main__":
                         forest_size=args.forest_size,
                         extracted_forest_size=extracted_forest_size,
                         normalize_D=args.normalize_D,
-                        use_dev_subset=args.use_dev_subset,
+                        subsets_used=args.subsets_used,
                         seed=seed
                     )
                     model_parameters.save(sub_models_dir, experiment_id)