diff --git a/.gitignore b/.gitignore index be20546afc6e7213216d7be700119cef0ee9d32b..2758a62a5b8721d31f9cd1dbec11c3243f475044 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ models/* +results/* +experiments/* */.kile/* *.kilepr diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py index d188cc8b18dcdf42a9d6343f03b7724acc85454a..ec1f321f70115542a2164c474193a246faa5639d 100644 --- a/code/bolsonaro/data/dataset_loader.py +++ b/code/bolsonaro/data/dataset_loader.py @@ -45,7 +45,7 @@ class DatasetLoader(object): elif name == '20newsgroups_vectorized': dataset_loading_func = fetch_20newsgroups_vectorized task = Task.CLASSIFICATION - elif name == 'lfw_people': + elif name == 'lfw_people': # needs PIL (image dataset) dataset_loading_func = fetch_lfw_people task = Task.CLASSIFICATION elif name == 'lfw_pairs': diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py index 768d207a323c8a7e33ed2b9b295c03cba27ce18b..60ce9ccbe10ea17cdeeaad8ade769ceaa492943a 100644 --- a/code/bolsonaro/models/model_parameters.py +++ b/code/bolsonaro/models/model_parameters.py @@ -5,16 +5,13 @@ import os class ModelParameters(object): - def __init__(self, forest_size, extracted_forest_size, normalize_D, subsets_used, seed=None): - self._forest_size = forest_size + def __init__(self, extracted_forest_size, normalize_D, subsets_used, hyperparameters, seed=None): self._extracted_forest_size = extracted_forest_size self._normalize_D = normalize_D self._subsets_used = subsets_used self._seed = seed + self._hyperparameters = hyperparameters - @property - def forest_size(self): - return self._forest_size @property def extracted_forest_size(self): @@ -32,6 +29,10 @@ class ModelParameters(object): def seed(self): return self._seed + @property + def hyperparameters(self): + return self._hyperparameters + def save(self, directory_path, experiment_id): save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), self.__dict__) diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index 50754246abc13e0282e4cbd6aa1e917a0e3544ed..53bd76f718eea9cced6d527f67dd5ce4314e6327 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -12,8 +12,8 @@ class OmpForestRegressor(BaseEstimator): DEFAULT_SCORE_METRIC = 'mse' def __init__(self, models_parameters): - self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size, - random_state=models_parameters.seed) + self._regressor = RandomForestRegressor(**models_parameters.hyperparameters, + random_state=models_parameters.seed) self._models_parameters = models_parameters self._logger = LoggerFactory.create(LOG_PATH, __name__) @@ -82,7 +82,7 @@ class OmpForestRegressor(BaseEstimator): self._regressor.fit(X, y) forest = self._regressor.estimators_ return forest - + def _extract_subforest(self, X, y): """ Given an already estimated regressor: apply OMP to get the weight of each tree. diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py index 40e4b1d052ee026b8fe8cce8fe784a364666edbe..2c708fc5c9af04e91825a949eb57ac373aad2503 100644 --- a/code/compute_hyperparameters.py +++ b/code/compute_hyperparameters.py @@ -35,7 +35,7 @@ if __name__ == "__main__": load_dotenv(find_dotenv('.env.example')) DEFAULT_CV = 3 - DEFAULT_N_ITER = 30 + DEFAULT_N_ITER = 50 DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000), 'min_samples_leaf': Integer(1, 1000), 'max_depth': Integer(1, 20), @@ -56,7 +56,7 @@ if __name__ == "__main__": begin_random_seed_range = 1 end_random_seed_range = 2000 - if args.seed is not None: + if args.seed is None: random_seed = random.randint(begin_random_seed_range, end_random_seed_range) else: random_seed = args.seed diff --git a/code/train.py b/code/train.py index 546da8bfe186bf7e3f088d9ec1b74c0491f05e3a..468bfd3ab84ddecb974150747331dc0d16aaa17f 100644 --- a/code/train.py +++ b/code/train.py @@ -9,6 +9,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory from dotenv import find_dotenv, load_dotenv import argparse +import json import pathlib import random import os @@ -22,7 +23,6 @@ if __name__ == "__main__": DEFAULT_DATASET_NAME = 'boston' DEFAULT_NORMALIZE_D = False DEFAULT_DATASET_NORMALIZER = None - DEFAULT_FOREST_SIZE = 100 DEFAULT_EXTRACTED_FOREST_SIZE = 10 # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' @@ -39,7 +39,7 @@ if __name__ == "__main__": parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') - parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.') + parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.') parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.') @@ -62,8 +62,19 @@ if __name__ == "__main__": logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') seeds = args.seeds if args.seeds is not None \ - else [random.randint(begin_random_seed_range, end_random_seed_range) \ - for i in range(args.random_seed_number)] + else [random.randint(begin_random_seed_range, end_random_seed_range) + for i in range(args.random_seed_number)] + + path_hyperparameter = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json') + if os.path.exists(path_hyperparameter): + with open(path_hyperparameter, 'r+') as file_hyperparameter: + hyperparameters = json.load(file_hyperparameter)['best_parameters'] + + else: + hyperparameters = {} + + if args.forest_size is not None: + hyperparameters['n_estimators'] = args.forest_size experiment_id = resolve_experiment_id(args.models_dir) experiment_id_str = str(experiment_id) @@ -98,11 +109,11 @@ if __name__ == "__main__": pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) model_parameters = ModelParameters( - forest_size=args.forest_size, extracted_forest_size=extracted_forest_size, normalize_D=args.normalize_D, subsets_used=args.subsets_used, - seed=seed + seed=seed, + hyperparameters=hyperparameters ) model_parameters.save(sub_models_dir, experiment_id)