diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index 4ab7e3faf434959ba9da0d32b3c3801b26775ec3..b76f7408fe8c0b7937bc5d84d0bab397e441ff09 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -11,7 +11,7 @@ from sklearn.model_selection import train_test_split
 class DatasetLoader(object):
 
     @staticmethod
-    def load_from_name(dataset_parameters):
+    def load(dataset_parameters):
         name = dataset_parameters.name
         if name == 'boston':
             dataset_loading_func = load_boston
diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py
index f19dc903a74488f006cc3f54107a7cd3cdc2f81c..d5a1145d3dc4a3796dbdd1b38aa50325262b7e3e 100644
--- a/code/bolsonaro/data/dataset_parameters.py
+++ b/code/bolsonaro/data/dataset_parameters.py
@@ -48,3 +48,16 @@ class DatasetParameters(object):
             },
             output_file,
             indent=4)
+
+    @staticmethod
+    def load(directory_path, experiment_id):
+        with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'r') as input_file:
+            parameters = json.load(input_file)
+        return DatasetParameters(
+            name=parameters['name'],
+            test_size=parameters['test_size'],
+            dev_size=parameters['dev_size'],
+            random_state=parameters['random_state'],
+            normalize=parameters['normalize'],
+            train_on_subset=parameters['train_on_subset']
+        )
diff --git a/code/bolsonaro/error_handling/logger_factory.py b/code/bolsonaro/error_handling/logger_factory.py
index f5248517575ecfd0a2ceb4a0c0fa30413ea89f5f..09a7ca68033b9f1960a1d9d98a9c4553653313e1 100644
--- a/code/bolsonaro/error_handling/logger_factory.py
+++ b/code/bolsonaro/error_handling/logger_factory.py
@@ -50,7 +50,7 @@ class LoggerFactory(object):
 
         # Create console handler
         ch = logging.StreamHandler()
-        ch.setLevel(logging.DEBUG)
+        ch.setLevel(logging.INFO)
 
         # Create formatter
         formatter = logging.Formatter('%(asctime)s - %(filename)s:%(lineno)s - %(name)s - %(levelname)s - %(message)s')
diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index 5bad7f44ddbfda7e5cb4fd21d56063dae5126e16..fb6b32cb26727d2221367f208598f04e1a19dfb1 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -1,6 +1,10 @@
 from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.data.task import Task
+from bolsonaro.models.model_parameters import ModelParameters
+
+import os
+import pickle
 
 
 class ModelFactory(object):
@@ -14,3 +18,11 @@ class ModelFactory(object):
         else:
             raise ValueError("Unsupported task '{}'".format(task))
         return model_func(model_parameters)
+
+    @staticmethod
+    def load(task, directory_path, experiment_id, model_raw_results):
+        model_parameters = ModelParameters.load(directory_path, experiment_id)
+        model = ModelFactory.build(task, model_parameters)
+        model.set_forest(model_raw_results.forest)
+        model.set_weights(model_raw_results.weights)
+        return model
diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index 2d8dba5f78b29f6db093f497a63bee5cf634822c..838253255e2a9a71290633e026090b70e680e201 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -36,3 +36,14 @@ class ModelParameters(object):
             },
             output_file,
             indent=4)
+
+    @staticmethod
+    def load(directory_path, experiment_id):
+        with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'r') as input_file:
+            parameters = json.load(input_file)
+        return ModelParameters(
+            forest_size=parameters['forest_size'],
+            extracted_forest_size=parameters['extracted_forest_size'],
+            seed=parameters['seed'],
+            normalize=parameters['normalize']
+        )
diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b849d0466a17602d32519362db660b54b224880
--- /dev/null
+++ b/code/bolsonaro/models/model_raw_results.py
@@ -0,0 +1,105 @@
+import pickle
+import os
+import datetime
+
+
+class ModelRawResults(object):
+
+    def __init__(self, forest, weights, training_time,
+                 datetime, train_score, dev_score, test_score,
+                 score_metric, train_score_regressor, dev_score_regressor,
+                 test_score_regressor):
+
+        self._forest = forest
+        self._weights = weights
+        self._training_time = training_time
+        self._datetime = datetime
+        self._train_score = train_score
+        self._dev_score = dev_score
+        self._test_score = test_score
+        self._score_metric = score_metric
+        self._train_score_regressor = train_score_regressor
+        self._dev_score_regressor = dev_score_regressor
+        self._test_score_regressor = test_score_regressor
+
+    @property
+    def forest(self):
+        return self._forest
+
+    @property
+    def weights(self):
+        return self._weights
+
+    @property
+    def training_time(self):
+        return self._training_time
+
+    @property
+    def datetime(self):
+        return self._datetime
+
+    @property
+    def train_score(self):
+        return self._train_score
+
+    @property
+    def dev_score(self):
+        return self._dev_score
+
+    @property
+    def test_score(self):
+        return self._test_score
+
+    @property
+    def score_metric(self):
+        return self._score_metric
+
+    @property
+    def train_score_regressor(self):
+        return self._train_score_regressor
+
+    @property
+    def dev_score_regressor(self):
+        return self._dev_score_regressor
+
+    @property
+    def test_score_regressor(self):
+        return self._test_score_regressor
+
+    @staticmethod
+    def save(models_dir, model, end_time, begin_time, dataset, logger):
+        output_file_path = models_dir + os.sep + 'model_raw_results.pickle'
+        logger.debug('Saving trained model and raw results to {}'.format(output_file_path))
+        with open(output_file_path, 'wb') as output_file:
+            pickle.dump({
+                'forest': model.forest,
+                'weights': model.weights,
+                'training_time': end_time - begin_time,
+                'datetime': datetime.datetime.now(),
+                'train_score': model.score(dataset.X_train, dataset.y_train),
+                'dev_score': model.score(dataset.X_dev, dataset.y_dev),
+                'test_score': model.score(dataset.X_test, dataset.y_test),
+                'score_metric': model.default_score_metric,
+                'train_score_regressor': model.score_regressor(dataset.X_train, dataset.y_train),
+                'dev_score_regressor': model.score_regressor(dataset.X_dev, dataset.y_dev),
+                'test_score_regressor': model.score_regressor(dataset.X_test, dataset.y_test)
+            }, output_file)
+
+    @staticmethod
+    def load(models_dir):
+        model_file_path = models_dir + os.sep + 'model_raw_results.pickle'
+        with open(model_file_path, 'rb') as input_file:
+            model_data = pickle.load(input_file)
+        return ModelRawResults(
+            forest=model_data['forest'],
+            weights=model_data['weights'],
+            training_time=model_data['training_time'],
+            datetime=model_data['datetime'],
+            train_score=model_data['train_score'],
+            dev_score=model_data['dev_score'],
+            test_score=model_data['test_score'],
+            score_metric=model_data['score_metric'],
+            train_score_regressor=model_data['train_score_regressor'],
+            dev_score_regressor=model_data['dev_score_regressor'],
+            test_score_regressor=model_data['test_score_regressor']
+        )
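Note: ModelRawResults pickles a plain dict of results (including the fitted trees themselves), so reading the file back requires a compatible scikit-learn on the loading side. A minimal round-trip sketch; the path is made up for illustration, but follows the layout train.py creates:

    from bolsonaro.models.model_raw_results import ModelRawResults

    # Assumes Trainer.train() already wrote model_raw_results.pickle there.
    results = ModelRawResults.load('models/1/seeds/42/extracted_forest_size/10')
    print(results.score_metric, results.train_score, results.test_score)
    print(results.training_time)  # seconds, computed as end_time - begin_time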
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index 8fe7dc9bbe4061f259bdc74417b38a607ac6db46..f5e2b35f23f931a06ca88ce181434f4ff98cce23 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -9,72 +9,41 @@ import numpy as np
 
 class OmpForestRegressor(BaseEstimator):
 
+    default_score_metric = 'mse'
+
     def __init__(self, models_parameters):
         self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
                                                 random_state=models_parameters.seed)
         self._models_parameters = models_parameters
         self._logger = LoggerFactory.create(LOG_PATH, __name__)
 
-    def fit(self, X_train, y_train):
-        self._forest = self._train_forest(X_train, y_train)
-
-        self._weights = self._extract_subforest(X_train, y_train)
-
-        return self
-
     @property
     def forest(self):
         return self._forest
 
+    def set_forest(self, forest):
+        self._forest = forest
+        self._regressor.estimators_ = forest
+
     @property
     def weights(self):
         return self._weights
 
+    def set_weights(self, weights):
+        self._weights = weights
+
     @property
     def models_parameters(self):
         return self._models_parameters
 
+    def fit(self, X, y):
+        self._forest = self._train_forest(X, y)
+        self._weights = self._extract_subforest(X, y)
+        return self
+
     def score_regressor(self, X, y):
         return self._regressor.score(X, y)
 
-    def _train_forest(self, X_train, y_train):
-        self._regressor.fit(X_train, y_train)
-        forest = self._regressor.estimators_
-        return forest
-
-    def _extract_subforest(self, X_train, y_train):
-        """
-        Given an already estimated regressor: apply OMP to get the weight of each tree.
-
-        The X_train data is used for interrogation of every tree in the forest. The y_train data
-        is used for finding the weights in OMP.
-
-        :param X_train: (n_sample, n_features) array
-        :param y_train: (n_sample,) array
-        :return:
-        """
-        self._logger.debug("Forest make prediction on X_train")
-        D = self._forest_prediction(X_train)
-
-        if self._models_parameters.normalize:
-            # question: maybe consider other kinds of normalization
-            self._logger.debug("Compute norm of predicted vectors on X_train")
-            self._forest_norms = np.linalg.norm(D, axis=0)
-            D /= self._forest_norms
-
-        omp = OrthogonalMatchingPursuit(
-            n_nonzero_coefs=self._models_parameters.extracted_forest_size,
-            fit_intercept=False, normalize=False)
-        self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
-                           .format(self._models_parameters.extracted_forest_size))
-        omp.fit(D, y_train)
-        weights = omp.coef_
-        # question: why not to use directly the omp estimator instead of bypassing it using the coefs?
-        return weights
-
-    def _forest_prediction(self, X):
-        return np.array([tree.predict(X) for tree in self._forest]).T
-
     def predict(self, X):
         """
         Apply the OMPForestRegressor to X.
@@ -91,8 +60,7 @@ class OmpForestRegressor(BaseEstimator):
 
         return predictions
 
-
-    def score(self, X, y, metric="mse"):
+    def score(self, X, y, metric=default_score_metric):
         """
         Evaluate OMPForestRegressor on (`X`, `y`) using `metric`
 
@@ -103,9 +71,47 @@ class OmpForestRegressor(BaseEstimator):
         """
         predictions = self.predict(X)
 
-        if metric == "mse":
+        if metric == 'mse':
             evaluation = np.mean(np.square(predictions - y))
         else:
-            raise ValueError("Metric value {} is not known.")
+            raise ValueError("Unsupported metric '{}'.".format(metric))
+
+        return evaluation
+
+    def _train_forest(self, X, y):
+        self._regressor.fit(X, y)
+        forest = self._regressor.estimators_
+        return forest
+
+    def _extract_subforest(self, X, y):
+        """
+        Given an already estimated regressor: apply OMP to get the weight of each tree.
+
+        The X data is used for interrogation of every tree in the forest. The y data
+        is used for finding the weights in OMP.
+
+        :param X: (n_sample, n_features) array
+        :param y: (n_sample,) array
+        :return:
+        """
+        self._logger.debug("Forest makes predictions on X")
+        D = self._forest_prediction(X)
-
-        return evaluation
\ No newline at end of file
+
+        if self._models_parameters.normalize:
+            # question: maybe consider other kinds of normalization
+            self._logger.debug("Compute norm of predicted vectors on X")
+            self._forest_norms = np.linalg.norm(D, axis=0)
+            D /= self._forest_norms
+
+        omp = OrthogonalMatchingPursuit(
+            n_nonzero_coefs=self._models_parameters.extracted_forest_size,
+            fit_intercept=False, normalize=False)
+        self._logger.debug("Apply orthogonal matching pursuit on forest for {} extracted trees."
+                           .format(self._models_parameters.extracted_forest_size))
+        omp.fit(D, y)
+        weights = omp.coef_
+        # question: why not use the omp estimator directly instead of bypassing it via the coefs?
+        return weights
+
+    def _forest_prediction(self, X):
+        return np.array([tree.predict(X) for tree in self._forest]).T
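Note: the extraction step above is plain OMP regression in which the dictionary columns are the per-tree predictions, so y is approximated by a weighted sum of at most extracted_forest_size trees. A self-contained sketch of the same idea on synthetic data (illustrative, not part of the patch):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import OrthogonalMatchingPursuit

    X, y = make_regression(n_samples=300, n_features=8, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

    # D[i, j] = prediction of tree j on sample i, as in _forest_prediction()
    D = np.array([tree.predict(X) for tree in forest.estimators_]).T
    omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10, fit_intercept=False, normalize=False)
    omp.fit(D, y)

    weights = omp.coef_                 # one weight per tree, mostly zeros
    selected = np.flatnonzero(weights)  # indices of the at most 10 retained trees
    y_hat = D[:, selected] @ weights[selected]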
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 3370ce7b0e9211ffaa92305b4ddcadfc41ea5cb1..1120961d2196e24e36f2a9bd441cc4845597d4d0 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -1,10 +1,8 @@
+from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from . import LOG_PATH
 
-import pickle
-import os
 import time
-import datetime
 
 
 class Trainer(object):
@@ -14,26 +12,17 @@ class Trainer(object):
         self._logger = LoggerFactory.create(LOG_PATH, __name__)
 
     def train(self, model, models_dir):
-        self._logger.info('Training model using train set...')
+        self._logger.debug('Training model using train set...')
         begin_time = time.time()
 
-        if self._dataset.dataset_parameters.train_on_subset == 'train':
+        train_on_subset = self._dataset.dataset_parameters.train_on_subset
+        if train_on_subset == 'train':
             X, y = self._dataset.X_train, self._dataset.y_train
-        elif self._dataset.dataset_parameters.train_on_subset == 'dev':
+        elif train_on_subset == 'dev':
             X, y = self._dataset.X_dev, self._dataset.y_dev
         else:
-            raise ValueError("Unsupported train_on_subset value '{}'".format(self._dataset.dataset_parameters.train_on_subset))
+            raise ValueError("Unsupported train_on_subset value '{}'".format(train_on_subset))
+        self._logger.debug('Fitting on {} subset'.format(train_on_subset))
         model.fit(X, y)
         end_time = time.time()
-        self._dump_raw_results(models_dir, model, end_time, begin_time)
-
-    def _dump_raw_results(self, models_dir, model, end_time, begin_time):
-        output_file_path = models_dir + os.sep + 'model.pickle'
-        self._logger.info('Saving trained model to {}'.format(output_file_path))
-        with open(output_file_path, 'wb') as output_file:
-            pickle.dump({
-                'forest': model.forest,
-                'weights': model.weights,
-                'training_time': end_time - begin_time,
-                'datetime': datetime.datetime.now()
-            }, output_file)
+        ModelRawResults.save(models_dir, model, end_time, begin_time, self._dataset, self._logger)
diff --git a/code/compute_results.py b/code/compute_results.py
index ba80f0bcd16d510d9463329ab3bda85762b57321..16bbbe4e73609059f4609472b3e23da17ef7813a 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -1,21 +1,48 @@
+from bolsonaro.data.dataset_parameters import DatasetParameters
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.models.model_raw_results import ModelRawResults
+from bolsonaro.models.model_factory import ModelFactory
+
 import argparse
 import pathlib
+from dotenv import find_dotenv, load_dotenv
+import os
 
 
 if __name__ == "__main__":
-    default_results_dir = 'results'
-    default_models_dir = 'models'
-    default_experiment_id = -1
+    # get environment variables in .env
+    load_dotenv(find_dotenv('.env.example'))
+
+    default_results_dir = os.environ["project_dir"] + os.sep + 'results'
+    default_models_dir = os.environ["project_dir"] + os.sep + 'models'
+    default_experiment_ids = None
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
-    parser.add_argument('--experiment_id', nargs='?', type=int, default=default_experiment_id, help='Compute the results of a single experiment id')
+    parser.add_argument('--experiment_ids', nargs='+', type=int, default=default_experiment_ids, help='Compute the results of the specified experiment id(s)')
     args = parser.parse_args()
 
     pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
 
-    if args.experiment_id == -1:
-        pass
-    else:
-        pass
+    experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \
+        if args.experiment_ids is not None \
+        else os.listdir(args.models_dir)
+
+    if experiments_ids is None or len(experiments_ids) == 0:
+        raise ValueError("No experiment id was found or specified.")
+
+    for experiment_id in experiments_ids:
+        experiment_id_path = args.models_dir + os.sep + experiment_id
+        experiment_seed_root_path = experiment_id_path + os.sep + 'seeds'
+        for seed in os.listdir(experiment_seed_root_path):
+            experiment_seed_path = experiment_seed_root_path + os.sep + seed
+            dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)
+            dataset = DatasetLoader.load(dataset_parameters)
+            extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size'
+            for extracted_forest_size in os.listdir(extracted_forest_size_root_path):
+                extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
+                model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+                model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
+
+
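Note: compute_results.py walks the directory tree that train.py builds (see the comment in train.py below): models/{experiment_id}/seeds/{seed}/extracted_forest_size/{size}. With experiment id 1, seed 42 and extracted size 10 (values chosen for illustration), the layout looks like:

    models/
    └── 1/
        └── seeds/
            └── 42/
                ├── dataset_parameters_1.json
                └── extracted_forest_size/
                    └── 10/
                        ├── model_parameters_1.json
                        └── model_raw_results.pickle

The script is run either on explicit ids, e.g. python code/compute_results.py --experiment_ids 1 2, or with no argument to sweep every id found under models_dir. For now the loop stops after rebuilding each model; nothing is computed from it yet.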
diff --git a/code/train.py b/code/train.py
index 1283e90a3d05cf9bb5ee0a3b4777e41a09dcfa1c..34d2f860aa46088e38152ae7efb04efb8580bd1c 100644
--- a/code/train.py
+++ b/code/train.py
@@ -21,6 +21,7 @@ if __name__ == "__main__":
     default_dataset_name = 'boston'
     default_normalize = True
+    default_wo_normalization = False
     default_forest_size = 100
     default_extracted_forest_size = 10
 
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
@@ -34,7 +35,7 @@ if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
-    parser.add_argument('--normalize', action='store_true', default=default_normalize, help='Normalize the data by doing the L2 division of the pred vectors.')
+    parser.add_argument('--wo_normalization', action='store_true', default=default_wo_normalization, help='Do not normalize the data by doing the L2 division of the pred vectors.')
     parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
     parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
@@ -47,22 +48,27 @@ if __name__ == "__main__":
 
     pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)
 
-    logger = LoggerFactory.create(LOG_PATH, __name__)
+    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
 
     args.extracted_forest_size = args.extracted_forest_size \
         if type(args.extracted_forest_size) == list \
         else [args.extracted_forest_size]
 
     if args.seeds != None and args.random_seed_number > 1:
-        logger.warn('seeds and random_seed_number parameters are both specified. Seeds will be used.')
+        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
+
     seeds = args.seeds if args.seeds is not None \
         else [random.randint(begin_random_seed_range, end_random_seed_range) \
         for i in range(args.random_seed_number)]
 
+    normalize = default_normalize and args.wo_normalization is False
+    logger.debug('normalize={}'.format(normalize))
+
     experiment_id = resolve_experiment_id(args.models_dir)
     experiment_id_str = str(experiment_id)
 
     for seed in seeds:
+        logger.debug('Seed={}'.format(seed))
         seed_str = str(seed)
         models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
             os.sep + seed_str
@@ -77,16 +83,17 @@ if __name__ == "__main__":
             test_size=args.test_size,
             dev_size=args.dev_size,
             random_state=seed,
-            normalize=args.normalize,
+            normalize=normalize,
             train_on_subset=args.train_on_subset
         )
         dataset_parameters.save(models_dir, experiment_id_str)
 
-        dataset = DatasetLoader.load_from_name(dataset_parameters)
+        dataset = DatasetLoader.load(dataset_parameters)
 
         trainer = Trainer(dataset)
 
         for extracted_forest_size in args.extracted_forest_size:
+            logger.debug('extracted_forest_size={}'.format(extracted_forest_size))
             sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
             try:
                 os.makedirs(sub_models_dir)
@@ -98,7 +105,7 @@ if __name__ == "__main__":
                 forest_size=args.forest_size,
                 extracted_forest_size=extracted_forest_size,
                 seed=seed,
-                normalize=args.normalize
+                normalize=normalize
             )
             model_parameters.save(sub_models_dir, experiment_id)
 
diff --git a/models/.gitkeep b/models/.gitkeep
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
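Note on the flag change in train.py: the old --normalize flag combined action='store_true' with default=True, so normalization could never actually be switched off from the command line. The new --wo_normalization flag inverts the sense: normalization stays on by default and is disabled explicitly, e.g. python code/train.py --dataset_name boston --wo_normalization. The resolution done in train.py reduces to a simple negation:

    # As written in the patch:
    normalize = default_normalize and args.wo_normalization is False
    # Since default_normalize is the constant True, this is equivalent to:
    normalize = not args.wo_normalization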