diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..9ca543b382b4889be1d93ed2065ef234b789153c --- /dev/null +++ b/.env.example @@ -0,0 +1,12 @@ +# Environment variables go here, can be read by `python-dotenv` package: +# +# `src/script.py` +# ---------------------------------------------------------------- +# import dotenv +# +# project_dir = os.path.join(os.path.dirname(__file__), os.pardir) +# dotenv_path = os.path.join(project_dir, '.env') +# dotenv.load_dotenv(dotenv_path) +# ---------------------------------------------------------------- + +project_dir = "." \ No newline at end of file diff --git a/.gitignore b/.gitignore index 9cc2e27f5cc3bd830b9038f8562b9edc51eb3f19..be95874710f5ee55a4e0104c6f78ff8a21abb705 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +models/* + */.kile/* *.kilepr # Byte-compiled / optimized / DLL files diff --git a/README.md b/README.md index 8ffb3de812f21466cd7bfe0aaf8c92518529963c..8d46ebd2a36736d601c1dc94ee3f19c5118555fd 100644 --- a/README.md +++ b/README.md @@ -49,5 +49,16 @@ Project Organization Instal project -------------- +First install the project pacakge: + pip install -r requirements.txt + +Then create a file `.env` by copying the file `.env.example`: + cp .env.example .env + +Then you must set the project directory in the `.env` file : + + project_dir = "path/to/your/project/directory" + +This directory will be used for storing the model parameters. \ No newline at end of file diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py index 6ad4b1f769d35b67b9ebcec6dae6b03ed68607e7..4ab7e3faf434959ba9da0d32b3c3801b26775ec3 100644 --- a/code/bolsonaro/data/dataset_loader.py +++ b/code/bolsonaro/data/dataset_loader.py @@ -71,7 +71,7 @@ class DatasetLoader(object): test_size=dataset_parameters.dev_size, random_state=dataset_parameters.random_state) - # TODO + # TODO? if dataset_parameters.normalize: pass diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py index 556c96059189397cad7a96f035316aec55bd944a..f19dc903a74488f006cc3f54107a7cd3cdc2f81c 100644 --- a/code/bolsonaro/data/dataset_parameters.py +++ b/code/bolsonaro/data/dataset_parameters.py @@ -4,12 +4,13 @@ import os class DatasetParameters(object): - def __init__(self, name, test_size, dev_size, random_state, normalize): + def __init__(self, name, test_size, dev_size, random_state, normalize, train_on_subset): self._name = name self._test_size = test_size self._dev_size = dev_size self._random_state = random_state self._normalize = normalize + self._train_on_subset = train_on_subset @property def name(self): @@ -31,6 +32,10 @@ class DatasetParameters(object): def normalize(self): return self._normalize + @property + def train_on_subset(self): + return self._train_on_subset + def save(self, directory_path, experiment_id): with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file: json.dump({ @@ -38,7 +43,8 @@ class DatasetParameters(object): 'test_size': self._test_size, 'dev_size': self._dev_size, 'random_state': self._random_state, - 'normalize': self._normalize + 'normalize': self._normalize, + 'train_on_subset': self._train_on_subset }, output_file, indent=4) diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py index b1fec8c6516aff4dbf8e2a0c521387e001ae242b..2d8dba5f78b29f6db093f497a63bee5cf634822c 100644 --- a/code/bolsonaro/models/model_parameters.py +++ b/code/bolsonaro/models/model_parameters.py @@ -4,10 +4,11 @@ import os class ModelParameters(object): - def __init__(self, forest_size, extracted_forest_size, seed=None): + def __init__(self, forest_size, extracted_forest_size, normalize, seed=None): self._forest_size = forest_size self._extracted_forest_size = extracted_forest_size self._seed = seed + self._normalize = normalize @property def forest_size(self): @@ -21,12 +22,17 @@ class ModelParameters(object): def seed(self): return self._seed + @property + def normalize(self): + return self._normalize + def save(self, directory_path, experiment_id): with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file: json.dump({ 'forest_size': self._forest_size, 'extracted_forest_size': self._extracted_forest_size, - 'seed': self._seed + 'seed': self._seed, + 'normalize': self._normalize }, output_file, indent=4) diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index be60cae324c4542eed468765cbb1e8e6b8c86f5f..8fe7dc9bbe4061f259bdc74417b38a607ac6db46 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -1,6 +1,10 @@ +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory + from sklearn.ensemble import RandomForestRegressor from sklearn.linear_model import OrthogonalMatchingPursuit from sklearn.base import BaseEstimator +import numpy as np class OmpForestRegressor(BaseEstimator): @@ -9,6 +13,7 @@ class OmpForestRegressor(BaseEstimator): self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size, random_state=models_parameters.seed) self._models_parameters = models_parameters + self._logger = LoggerFactory.create(LOG_PATH, __name__) def fit(self, X_train, y_train): self._forest = self._train_forest(X_train, y_train) @@ -29,16 +34,78 @@ class OmpForestRegressor(BaseEstimator): def models_parameters(self): return self._models_parameters + def score_regressor(self, X, y): + return self._regressor.score(X, y) + def _train_forest(self, X_train, y_train): self._regressor.fit(X_train, y_train) forest = self._regressor.estimators_ return forest def _extract_subforest(self, X_train, y_train): - D = [[tree.predict([elem])[0] for tree in self._forest] for elem in X_train] + """ + Given an already estimated regressor: apply OMP to get the weight of each tree. + + The X_train data is used for interrogation of every tree in the forest. The y_train data + is used for finding the weights in OMP. + + :param X_train: (n_sample, n_features) array + :param y_train: (n_sample,) array + :return: + """ + self._logger.debug("Forest make prediction on X_train") + D = self._forest_prediction(X_train) + + if self._models_parameters.normalize: + # question: maybe consider other kinds of normalization + self._logger.debug("Compute norm of predicted vectors on X_train") + self._forest_norms = np.linalg.norm(D, axis=0) + D /= self._forest_norms + omp = OrthogonalMatchingPursuit( n_nonzero_coefs=self._models_parameters.extracted_forest_size, fit_intercept=False, normalize=False) + self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees." + .format(self._models_parameters.extracted_forest_size)) omp.fit(D, y_train) weights = omp.coef_ + # question: why not to use directly the omp estimator instead of bypassing it using the coefs? return weights + + def _forest_prediction(self, X): + return np.array([tree.predict(X) for tree in self._forest]).T + + def predict(self, X): + """ + Apply the OMPForestRegressor to X. + + :param X: + :return: + """ + D = self._forest_prediction(X) + + if self._models_parameters.normalize: + D /= self._forest_norms + + predictions = D @ self.weights + + return predictions + + + def score(self, X, y, metric="mse"): + """ + Evaluate OMPForestRegressor on (`X`, `y`) using `metric` + + :param X: + :param y: + :param metric: + :return: + """ + predictions = self.predict(X) + + if metric == "mse": + evaluation = np.mean(np.square(predictions - y)) + else: + raise ValueError("Metric value {} is not known.") + + return evaluation \ No newline at end of file diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 7c1436b87b48e7069500ba85f1e60104ac619c4c..3370ce7b0e9211ffaa92305b4ddcadfc41ea5cb1 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -13,12 +13,21 @@ class Trainer(object): self._dataset = dataset self._logger = LoggerFactory.create(LOG_PATH, __name__) - def iterate(self, model, models_dir): + def train(self, model, models_dir): self._logger.info('Training model using train set...') begin_time = time.time() - model.fit(self._dataset.X_train, self._dataset.y_train) + if self._dataset.dataset_parameters.train_on_subset == 'train': + X, y = self._dataset.X_train, self._dataset.y_train + elif self._dataset.dataset_parameters.train_on_subset == 'dev': + X, y = self._dataset.X_dev, self._dataset.y_dev + else: + raise ValueError("Unsupported train_on_subset value '{}'".format(self._dataset.dataset_parameters.train_on_subset)) + model.fit(X, y) end_time = time.time() + self._dump_raw_results(models_dir, model, end_time, begin_time) + + def _dump_raw_results(self, models_dir, model, end_time, begin_time): output_file_path = models_dir + os.sep + 'model.pickle' self._logger.info('Saving trained model to {}'.format(output_file_path)) with open(output_file_path, 'wb') as output_file: diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py index 2affd37e8e20e96406b1d2fc585b3e042d7a1d2d..96fbb06217b2771c1cf906f7ee5d89097296bcb1 100644 --- a/code/bolsonaro/utils.py +++ b/code/bolsonaro/utils.py @@ -2,6 +2,14 @@ import os def resolve_experiment_id(models_dir): + """ + Return the ID of the next experiment. + + The ID is an int equal to n+1 where n is the current number of directory in `models_dir + ` + :param models_dir: + :return: + """ ids = [x for x in os.listdir(models_dir) if os.path.isdir(models_dir + os.sep + x)] if len(ids) > 0: diff --git a/code/train.py b/code/train.py index 0e6896c522ad2ca26d848a634f740d8d2afb008d..1283e90a3d05cf9bb5ee0a3b4777e41a09dcfa1c 100644 --- a/code/train.py +++ b/code/train.py @@ -4,7 +4,10 @@ from bolsonaro.models.model_factory import ModelFactory from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.trainer import Trainer from bolsonaro.utils import resolve_experiment_id +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory +from dotenv import find_dotenv, load_dotenv import argparse import pathlib import random @@ -13,17 +16,21 @@ import errno if __name__ == "__main__": + # get environment variables in .env + load_dotenv(find_dotenv('.env.example')) + default_dataset_name = 'boston' - default_normalize = False + default_normalize = True default_forest_size = 100 default_extracted_forest_size = 10 - default_models_dir = 'models' + # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} + default_models_dir = os.environ["project_dir"] + os.sep + 'models' default_dev_size = 0.2 default_test_size = 0.2 - default_use_random_seed = True default_random_seed_number = 1 begin_random_seed_range = 1 end_random_seed_range = 2000 + default_train_on_subset = 'train' parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') @@ -31,29 +38,34 @@ if __name__ == "__main__": parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.') parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.') parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.') - parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio') - parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio') - parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Random seed used for the data split') - parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used') + parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio.') + parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio.') + parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used.') + parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') + parser.add_argument('--train_on_subset', nargs='?', type=str, default=default_train_on_subset, help='Specify on witch subset the model will be trained (either train or dev).') args = parser.parse_args() pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True) + logger = LoggerFactory.create(LOG_PATH, __name__) + args.extracted_forest_size = args.extracted_forest_size \ if type(args.extracted_forest_size) == list \ else [args.extracted_forest_size] - random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \ - for i in range(args.random_seed_number)] \ - if args.use_random_seed else None + if args.seeds != None and args.random_seed_number > 1: + logger.warn('seeds and random_seed_number parameters are both specified. Seeds will be used.') + seeds = args.seeds if args.seeds is not None \ + else [random.randint(begin_random_seed_range, end_random_seed_range) \ + for i in range(args.random_seed_number)] experiment_id = resolve_experiment_id(args.models_dir) experiment_id_str = str(experiment_id) - for random_seed in random_seeds: - random_seed_str = str(random_seed) + for seed in seeds: + seed_str = str(seed) models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \ - os.sep + random_seed_str + os.sep + seed_str try: os.makedirs(models_dir) except OSError as e: @@ -64,8 +76,9 @@ if __name__ == "__main__": name=args.dataset_name, test_size=args.test_size, dev_size=args.dev_size, - random_state=random_seed, - normalize=args.normalize + random_state=seed, + normalize=args.normalize, + train_on_subset=args.train_on_subset ) dataset_parameters.save(models_dir, experiment_id_str) @@ -84,10 +97,14 @@ if __name__ == "__main__": model_parameters = ModelParameters( forest_size=args.forest_size, extracted_forest_size=extracted_forest_size, - seed=random_seed + seed=seed, + normalize=args.normalize ) model_parameters.save(sub_models_dir, experiment_id) model = ModelFactory.build(dataset.task, model_parameters) - trainer.iterate(model, sub_models_dir) + trainer.train(model, sub_models_dir) + + logger.info('Error on test set: {}'.format(model.score(dataset.X_test, dataset.y_test))) + logger.info('Accuracy on test set: {}'.format(model.score_regressor(dataset.X_test, dataset.y_test)))