Rename module/class-level default values to UPPER_SNAKE_CASE constants.

Files touched:
  * code/bolsonaro/models/omp_forest_regressor.py
      default_score_metric -> DEFAULT_SCORE_METRIC (class attribute and the
      default of `score(..., metric=...)`).
  * code/bolsonaro/trainer.py
      reads model.DEFAULT_SCORE_METRIC; adds a TODO about resolving the metric.
  * code/compute_results.py
      default_results_dir / default_models_dir / default_experiment_ids
      -> DEFAULT_RESULTS_DIR / DEFAULT_MODELS_DIR / DEFAULT_EXPERIMENT_IDS.
  * code/train.py
      default_* -> DEFAULT_*; `default_normalize` is removed with no
      replacement; the train-on-subset default now sits with the other
      constants, above begin/end_random_seed_range.

NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. Blank context lines were placed so
that every hunk matches its "@@ -a,b +c,d @@" line counts, and indentation
follows conventional 4-space Python nesting. The indentation of the
trainer.py hunk in particular is an assumption - confirm against the target
tree, or apply with `git apply --ignore-whitespace`.

diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index 5c79c2a4623c0dabe2dd4bdaf47329174f605f3e..cd26f92acc89725e4f6cc64b69bad8d2e8d2cbc3 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -9,7 +9,7 @@ import numpy as np
 
 
 class OmpForestRegressor(BaseEstimator):
-    default_score_metric = 'mse'
+    DEFAULT_SCORE_METRIC = 'mse'
 
     def __init__(self, models_parameters):
         self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
@@ -60,7 +60,7 @@ class OmpForestRegressor(BaseEstimator):
 
         return predictions
 
-    def score(self, X, y, metric=default_score_metric):
+    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
         """
         Evaluate OMPForestRegressor on (`X`, `y`) using `metric`
 
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 08d745c2425aed36312365b7a473629396108b25..91480dd944f077668494dfedf252756ec0511898 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -34,7 +34,7 @@ class Trainer(object):
             train_score=model.score(self._dataset.X_train, self._dataset.y_train),
             dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
             test_score=model.score(self._dataset.X_test, self._dataset.y_test),
-            score_metric=model.default_score_metric,
+            score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
             train_score_regressor=model.score_regressor(self._dataset.X_train, self._dataset.y_train),
             dev_score_regressor=model.score_regressor(self._dataset.X_dev, self._dataset.y_dev),
             test_score_regressor=model.score_regressor(self._dataset.X_test, self._dataset.y_test)
diff --git a/code/compute_results.py b/code/compute_results.py
index 21777e2d4611995f83cc25ba08ded2789ea17650..383b89b9731aaf40864c2aff888d592fbbf40335 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -14,14 +14,14 @@ if __name__ == "__main__":
     # get environment variables in .env
     load_dotenv(find_dotenv('.env.example'))
 
-    default_results_dir = os.environ["project_dir"] + os.sep + 'results'
-    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
-    default_experiment_ids = None
+    DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
+    DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
+    DEFAULT_EXPERIMENT_IDS = None
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
-    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
-    parser.add_argument('--experiment_ids', nargs='+', type=int, default=default_experiment_ids, help='Compute the results of the specified experiment id(s)')
+    parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
+    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
+    parser.add_argument('--experiment_ids', nargs='+', type=int, default=DEFAULT_EXPERIMENT_IDS, help='Compute the results of the specified experiment id(s)')
     args = parser.parse_args()
 
     pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
diff --git a/code/train.py b/code/train.py
index 647ed19c706daa2db8faf6a5b6851fd81f2088bd..6daa502ac3dc41ec036e2642c2a658965221475c 100644
--- a/code/train.py
+++ b/code/train.py
@@ -19,33 +19,33 @@ if __name__ == "__main__":
     # get environment variables in .env
     load_dotenv(find_dotenv('.env.example'))
 
-    default_dataset_name = 'boston'
-    default_normalize = True
-    default_normalize_D = False
-    default_dataset_normalizer = None
-    default_forest_size = 100
-    default_extracted_forest_size = 10
+    DEFAULT_DATASET_NAME = 'boston'
+    DEFAULT_NORMALIZE_D = False
+    DEFAULT_DATASET_NORMALIZER = None
+    DEFAULT_FOREST_SIZE = 100
+    DEFAULT_EXTRACTED_FOREST_SIZE = 10
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
-    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
-    default_dev_size = 0.2
-    default_test_size = 0.2
-    default_random_seed_number = 1
+    DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
+    DEFAULT_DEV_SIZE = 0.2
+    DEFAULT_TEST_SIZE = 0.2
+    DEFAULT_RANDOM_SEED_NUMBER = 1
+    DEFAULT_TRAIN_ON_SUBSET = 'train'
+
     begin_random_seed_range = 1
     end_random_seed_range = 2000
-    default_train_on_subset = 'train'
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
-    parser.add_argument('--normalize_D', action='store_true', default=default_normalize_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
-    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=default_dataset_normalizer, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
-    parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
-    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
-    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
-    parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio.')
-    parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio.')
-    parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used.')
+    parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
+    parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
+    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
+    parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.')
+    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.')
+    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
+    parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.')
+    parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
+    parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
-    parser.add_argument('--train_on_subset', nargs='?', type=str, default=default_train_on_subset, help='Specify on witch subset the model will be trained (either train or dev).')
+    parser.add_argument('--train_on_subset', nargs='?', type=str, default=DEFAULT_TRAIN_ON_SUBSET, help='Specify on witch subset the model will be trained (either train or dev).')
     args = parser.parse_args()
 
     pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)