# Patch summary: replace the dataset-level `train_on_subset` option with a
# model-level `use_dev_subset` flag.
#
# Files touched and what the hunks in each one do:
#   * code/bolsonaro/data/dataset_parameters.py - drops the `train_on_subset`
#     constructor argument, the `_train_on_subset` attribute and the matching
#     property from DatasetParameters.
#   * code/bolsonaro/models/model_parameters.py - adds a `use_dev_subset`
#     constructor argument (placed before `seed=None`), stores it as
#     `_use_dev_subset`, exposes it as a property, and moves the `seed`
#     assignment and property after `normalize_D` / `use_dev_subset`.
#   * code/bolsonaro/models/omp_forest_regressor.py - OmpForestRegressor.fit
#     changes from fit(X, y) to fit(X_forest, y_forest, X_omp, y_omp); the
#     first pair is passed to _train_forest, the second to _extract_subforest.
#   * code/bolsonaro/trainer.py - imports numpy. Trainer.train now reads
#     model.models_parameters.use_dev_subset: when it is truthy the forest gets
#     X_train/y_train and OMP gets X_dev/y_dev; otherwise both get
#     np.concatenate of the train and dev arrays. The ValueError raised for an
#     unsupported `train_on_subset` value is removed along with that option.
#   * code/train.py - replaces `--train_on_subset` (str, default 'train') with
#     `--use_dev_subset` (store_true, default False) and passes the value to
#     ModelParameters instead of DatasetParameters.
#
# NOTE(review): fit() now requires four arguments, so any caller still using
#   fit(X, y) will fail; only Trainer.train is updated here - confirm there
#   are no other call sites.
# NOTE(review): the default behaviour changes. Before, the default 'train'
#   fitted on the train subset only; now, without the flag, both steps are
#   fitted on train+dev. The old 'dev' mode (fit everything on dev) has no
#   equivalent after this patch - confirm this is intended.
# NOTE(review): in the non-dev branch of Trainer.train each concatenation is
#   computed twice (X_forest/X_omp and y_forest/y_omp are built from the same
#   inputs).
# NOTE(review): the added --use_dev_subset help text reads "If specify";
#   presumably "If specified" was meant.
# NOTE(review): as seen here the hunk bodies are run together on a few long
#   lines; if the file is really stored this way, patch tools will not be able
#   to apply it - confirm against the original commit.
diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py index e251c7536f0814a5839f71532201abd827312b00..9854a75eb27ec83990b8ae032c58e7dec52a5e8c 100644 --- a/code/bolsonaro/data/dataset_parameters.py +++ b/code/bolsonaro/data/dataset_parameters.py @@ -5,12 +5,11 @@ import os class DatasetParameters(object): - def __init__(self, name, test_size, dev_size, random_state, train_on_subset, dataset_normalizer): + def __init__(self, name, test_size, dev_size, random_state, dataset_normalizer): self._name = name self._test_size = test_size self._dev_size = dev_size self._random_state = random_state - self._train_on_subset = train_on_subset self._dataset_normalizer = dataset_normalizer @property @@ -29,10 +28,6 @@ class DatasetParameters(object): def random_state(self): return self._random_state - @property - def train_on_subset(self): - return self._train_on_subset - @property def dataset_normalizer(self): return self._dataset_normalizer diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py index 7198dceb9f2677dbd44767baa9f308edc5953e90..450d97bc01fe8f02ccd399d2c47a0c7397b4cb13 100644 --- a/code/bolsonaro/models/model_parameters.py +++ b/code/bolsonaro/models/model_parameters.py @@ -5,11 +5,12 @@ import os class ModelParameters(object): - def __init__(self, forest_size, extracted_forest_size, normalize_D, seed=None): + def __init__(self, forest_size, extracted_forest_size, normalize_D, use_dev_subset, seed=None): self._forest_size = forest_size self._extracted_forest_size = extracted_forest_size - self._seed = seed self._normalize_D = normalize_D + self._use_dev_subset = use_dev_subset + self._seed = seed @property def forest_size(self): @@ -19,14 +20,18 @@ class ModelParameters(object): def extracted_forest_size(self): return self._extracted_forest_size - @property - def seed(self): - return self._seed - @property def normalize_D(self): return self._normalize_D + @property + def 
use_dev_subset(self): + return self._use_dev_subset + + @property + def seed(self): + return self._seed + def save(self, directory_path, experiment_id): save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), self.__dict__) diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index cd26f92acc89725e4f6cc64b69bad8d2e8d2cbc3..50754246abc13e0282e4cbd6aa1e917a0e3544ed 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -36,9 +36,9 @@ class OmpForestRegressor(BaseEstimator): def models_parameters(self): return self._models_parameters - def fit(self, X, y): - self._forest = self._train_forest(X, y) - self._weights = self._extract_subforest(X, y) + def fit(self, X_forest, y_forest, X_omp, y_omp): + self._forest = self._train_forest(X_forest, y_forest) + self._weights = self._extract_subforest(X_omp, y_omp) return self def score_regressor(self, X, y): diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 91480dd944f077668494dfedf252756ec0511898..e5ca49b4bef86a4386f314801d9c36ec8ea2fa7f 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -4,6 +4,7 @@ from . 
import LOG_PATH import time import datetime +import numpy as np class Trainer(object): @@ -15,15 +16,26 @@ class Trainer(object): def train(self, model, models_dir): self._logger.debug('Training model using train set...') begin_time = time.time() - train_on_subset = self._dataset.dataset_parameters.train_on_subset - if train_on_subset == 'train': - X, y = self._dataset.X_train, self._dataset.y_train - elif train_on_subset == 'dev': - X, y = self._dataset.X_dev, self._dataset.y_dev + + if model.models_parameters.use_dev_subset: + X_forest = self._dataset.X_train + y_forest = self._dataset.y_train + X_omp = self._dataset.X_dev + y_omp = self._dataset.y_dev + self._logger.debug('Fitting the forest on train subset and OMP on dev subset.') else: - raise ValueError("Unsupported train_on_subset value '{}'".format(train_on_subset)) - self._logger.debug('Fitting on {} subset'.format(train_on_subset)) - model.fit(X, y) + X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + self._logger.debug('Fitting both the forest and OMP on train+dev subsets.') + + model.fit( + X_forest=X_forest, + y_forest=y_forest, + X_omp=X_omp, + y_omp=y_omp + ) end_time = time.time() ModelRawResults( diff --git a/code/train.py b/code/train.py index 475843c9dbd005935fe7e61d77b7467a781af1fc..5783fef4bd32d75f62f5dcf05c9812e82f9c0338 100644 --- a/code/train.py +++ b/code/train.py @@ -29,7 +29,7 @@ if __name__ == "__main__": DEFAULT_DEV_SIZE = 0.2 DEFAULT_TEST_SIZE = 0.2 DEFAULT_RANDOM_SEED_NUMBER = 1 - DEFAULT_TRAIN_ON_SUBSET = 'train' + DEFAULT_USE_DEV_SUBSET = False DEFAULT_DISABLE_PROGRESS = False begin_random_seed_range = 1 @@ -46,7 +46,7 @@ if __name__ == "__main__": parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test 
subset ratio.') parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') - parser.add_argument('--train_on_subset', nargs='?', type=str, default=DEFAULT_TRAIN_ON_SUBSET, help='Specify on witch subset the model will be trained (either train or dev).') + parser.add_argument('--use_dev_subset', action='store_true', default=DEFAULT_USE_DEV_SUBSET, help='If specify the forest will be trained on train subset and OMP on dev subset. Otherwise both the forest and OMP will be trained on train+dev subsets.') parser.add_argument('--disable_progress', action='store_true', default=DEFAULT_DISABLE_PROGRESS, help='Disable the progress bars.') args = parser.parse_args() @@ -83,8 +83,7 @@ if __name__ == "__main__": test_size=args.test_size, dev_size=args.dev_size, random_state=seed, - dataset_normalizer=args.dataset_normalizer, - train_on_subset=args.train_on_subset + dataset_normalizer=args.dataset_normalizer ) dataset_parameters.save(models_dir, experiment_id_str) @@ -101,8 +100,9 @@ if __name__ == "__main__": model_parameters = ModelParameters( forest_size=args.forest_size, extracted_forest_size=extracted_forest_size, - seed=seed, - normalize_D=args.normalize_D + normalize_D=args.normalize_D, + use_dev_subset=args.use_dev_subset, + seed=seed ) model_parameters.save(sub_models_dir, experiment_id)