diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py index d706b7a07751f715b6398b1f451ec9f337d00f60..dd11da7f322ebce2d8ac62fabf180a86d0978046 100644 --- a/code/bolsonaro/data/dataset_loader.py +++ b/code/bolsonaro/data/dataset_loader.py @@ -8,6 +8,13 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ from sklearn.model_selection import train_test_split from sklearn import preprocessing +def change_binary_func_load(base_load_function): + def func_load(return_X_y): + X, y = base_load_function(return_X_y=return_X_y) + assert len(set(y).difference({0, 1})) == 0, "Classes for binary classifier should be {0, 1}" + y[y==0] = -1 + return X, y + return func_load class DatasetLoader(object): @@ -33,7 +40,7 @@ class DatasetLoader(object): dataset_loading_func = load_wine task = Task.CLASSIFICATION elif name == 'breast_cancer': - dataset_loading_func = load_breast_cancer + dataset_loading_func = change_binary_func_load(load_breast_cancer) task = Task.CLASSIFICATION elif name == 'olivetti_faces': dataset_loading_func = fetch_olivetti_faces diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py index fb6b32cb26727d2221367f208598f04e1a19dfb1..1fa46385a884d82b74b44a5b8227b5b3dbfb0286 100644 --- a/code/bolsonaro/models/model_factory.py +++ b/code/bolsonaro/models/model_factory.py @@ -1,4 +1,4 @@ -from bolsonaro.models.omp_forest_classifier import OmpForestClassifier +from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.data.task import Task from bolsonaro.models.model_parameters import ModelParameters @@ -12,7 +12,7 @@ class ModelFactory(object): @staticmethod def build(task, model_parameters): if task == Task.CLASSIFICATION: - model_func = OmpForestClassifier + model_func = OmpForestBinaryClassifier elif task == Task.REGRESSION: model_func = OmpForestRegressor else: @@ 
-21,8 +21,10 @@ class ModelFactory(object): @staticmethod def load(task, directory_path, experiment_id, model_raw_results): + raise NotImplementedError model_parameters = ModelParameters.load(directory_path, experiment_id) model = ModelFactory.build(task, model_parameters) - model.set_forest(model_raw_results.forest) - model.set_weights(model_raw_results.weights) + # todo faire ce qu'il faut ici pour rétablir correctement le modèle + # model.set_forest(model_raw_results.forest) + # model.set_weights(model_raw_results.weights) return model diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py index 673cb0fc65b7378e95c03b186d246cb70b384a07..df8b2ec0b10704a8a8c397b9012298e8b901e14b 100644 --- a/code/bolsonaro/models/model_raw_results.py +++ b/code/bolsonaro/models/model_raw_results.py @@ -6,13 +6,12 @@ import datetime class ModelRawResults(object): - def __init__(self, forest, weights, training_time, + def __init__(self, model_object, training_time, datetime, train_score, dev_score, test_score, score_metric, train_score_regressor, dev_score_regressor, test_score_regressor): - self._forest = forest - self._weights = weights + self._model_object = model_object self._training_time = training_time self._datetime = datetime self._train_score = train_score @@ -24,12 +23,8 @@ class ModelRawResults(object): self._test_score_regressor = test_score_regressor @property - def forest(self): - return self._forest - - @property - def weights(self): - return self._weights + def model_object(self): + return self._model_object @property def training_time(self): diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py new file mode 100644 index 0000000000000000000000000000000000000000..1962d78eaa670035d9cb4bd283d513aa700e7d84 --- /dev/null +++ b/code/bolsonaro/models/omp_forest.py @@ -0,0 +1,116 @@ +from abc import abstractmethod, ABCMeta + +import numpy as np +from sklearn.linear_model import 
OrthogonalMatchingPursuit + +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory +from sklearn.base import BaseEstimator + + +class OmpForest(BaseEstimator, metaclass=ABCMeta): + def __init__(self, models_parameters, base_forest_estimator): + self._base_forest_estimator = base_forest_estimator + self._models_parameters = models_parameters + self._logger = LoggerFactory.create(LOG_PATH, __name__) + + @property + def models_parameters(self): + return self._models_parameters + + def score_base_estimator(self, X, y): + return self._base_forest_estimator.score(X, y) + + + def _base_estimator_predictions(self, X): + return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T + + @property + def forest(self): + return self._base_forest_estimator.estimators_ + + # sklearn baseestimator api methods + @abstractmethod + def fit(self, X_forest, y_forest, X_omp, y_omp): + pass + + @abstractmethod + def predict(self, X): + pass + + @abstractmethod + def score(self, X, y): + pass + +class SingleOmpForest(OmpForest): + def __init__(self, models_parameters, base_forest_estimator): + # fit_intercept shouldn't be set to False as the data isn't necessarily centered here + # normalization is handled outsite OMP + self._omp = OrthogonalMatchingPursuit( + n_nonzero_coefs=models_parameters.extracted_forest_size, + fit_intercept=True, normalize=False) + + super().__init__(models_parameters, base_forest_estimator) + + def fit(self, X_forest, y_forest, X_omp, y_omp): + self._base_forest_estimator.fit(X_forest, y_forest) + self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit + return self + + def _extract_subforest(self, X, y): + """ + Given an already estimated regressor: apply OMP to get the weight of each tree. + + The X data is used for interrogation of every tree in the forest. The y data + is used for finding the weights in OMP. 
+ + :param X: (n_sample, n_features) array + :param y: (n_sample,) array + :return: + """ + self._logger.debug("Forest make prediction on X") + D = self._base_estimator_predictions(X) + + if self._models_parameters.normalize_D: + # question: maybe consider other kinds of normalization.. centering? + self._logger.debug("Compute norm of predicted vectors on X") + self._forest_norms = np.linalg.norm(D, axis=0) + D /= self._forest_norms + + self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees." + .format(self._models_parameters.extracted_forest_size)) + + return self._omp.fit(D, y) + + def predict(self, X): + """ + Apply the SingleOmpForest to X. + + Make all the base tree predictions then apply the OMP weights for pruning. + + :param X: + :return: + """ + forest_predictions = self._base_estimator_predictions(X) + + if self._models_parameters.normalize_D: + forest_predictions /= self._forest_norms + + if self._models_parameters.normalize_weights: + # we can normalize weights (by their sum) so that they sum to 1 + # and they can be interpreted as impact percentages for interpretability. + # this necessits to remove the (-) in weights, e.g. move it to the predictions (use unsigned_coef) + + # question: je comprend pas le truc avec nonszero? 
+ # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) + coef_signs = np.sign(self._omp.coef_)[np.newaxis, :] # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square) + unsigned_coef = (coef_signs * self._omp.coef_).squeeze() + intercept = self._omp.intercept_ + + adjusted_forest_predictions = forest_predictions * coef_signs + predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept + + else: + predictions = self._omp.predict(forest_predictions) + + return predictions \ No newline at end of file diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py index 12cc23fab69fc0b79ff40b1d6957db5532a8c452..fb602ce401c086a2dba77714bc6530d69df10898 100644 --- a/code/bolsonaro/models/omp_forest_classifier.py +++ b/code/bolsonaro/models/omp_forest_classifier.py @@ -1,11 +1,65 @@ +from collections import namedtuple + from sklearn.base import BaseEstimator from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import OrthogonalMatchingPursuit + +from bolsonaro import LOG_PATH +from bolsonaro.error_handling.logger_factory import LoggerFactory +from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest +import numpy as np + +class OmpForestBinaryClassifier(SingleOmpForest): + + DEFAULT_SCORE_METRIC = 'indicator' + + def __init__(self, models_parameters): + estimator = RandomForestClassifier(n_estimators=models_parameters.forest_size, + random_state=models_parameters.seed, n_jobs=-1) + super().__init__(models_parameters, estimator) + + def _check_classes(self, y): + assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier should be {-1, +1}" + + def fit(self, X_forest, y_forest, X_omp, y_omp): + self._check_classes(y_forest) + self._check_classes(y_omp) + + return super().fit(X_forest, y_forest, X_omp, y_omp) + + + def score(self, X, 
y, metric=DEFAULT_SCORE_METRIC): + """ + Evaluate OMPForestClassifer on (`X`, `y`) using `metric` + + :param X: + :param y: + :param metric: might be "indicator" + :return: + """ + predictions = self.predict(X) + + if metric == 'indicator': + evaluation = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1)) + else: + raise ValueError("Unsupported metric '{}'.".format(metric)) + + return evaluation + + +class OmpForestMulticlassClassifier(BaseEstimator): + def __init__(self, models_parameters): + self._models_parameters = models_parameters + self._base_forest_estimators = RandomForestClassifier(n_estimators=models_parameters.forest_size, + random_state=models_parameters.seed, n_jobs=-1) + self._logger = LoggerFactory.create(LOG_PATH, __name__) -class OmpForestClassifier(BaseEstimator): - def __init__(self): - raise ValueError('Classification tasks are not supported for now') - def fit(self, X, y): - pass +if __name__ == "__main__": + forest = RandomForestClassifier(n_estimators=10) + X = np.random.rand(10, 5) + y = np.random.choice([-1, +1], 10) + forest.fit(X, y) + print(forest.predict(np.random.rand(10, 5))) \ No newline at end of file diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py index 65193e1ad32260ad55ff885f2f663aaf9dfeea77..c1404fc2564a0611b072bdd79921f1179680ec93 100644 --- a/code/bolsonaro/models/omp_forest_regressor.py +++ b/code/bolsonaro/models/omp_forest_regressor.py @@ -1,67 +1,22 @@ -from bolsonaro import LOG_PATH -from bolsonaro.error_handling.logger_factory import LoggerFactory + from sklearn.ensemble import RandomForestRegressor -from sklearn.linear_model import OrthogonalMatchingPursuit -from sklearn.base import BaseEstimator import numpy as np +from bolsonaro.models.omp_forest import SingleOmpForest + -class OmpForestRegressor(BaseEstimator): +class OmpForestRegressor(SingleOmpForest): DEFAULT_SCORE_METRIC = 'mse' def __init__(self, models_parameters): - self._regressor = 
RandomForestRegressor(n_estimators=models_parameters.forest_size, - random_state=models_parameters.seed, n_jobs=-1) - self._models_parameters = models_parameters - self._logger = LoggerFactory.create(LOG_PATH, __name__) - - @property - def forest(self): - return self._forest - - def set_forest(self, forest): - self._forest = forest - self._regressor.estimators_ = forest - - @property - def weights(self): - return self._weights - - def set_weights(self, weights): - self._weights = weights - - @property - def models_parameters(self): - return self._models_parameters - - def fit(self, X_forest, y_forest, X_omp, y_omp): - self._forest = self._train_forest(X_forest, y_forest) - self._omp = self._extract_subforest(X_omp, y_omp) - self._weights = self._omp.coef_ - return self - - def score_regressor(self, X, y): - return self._regressor.score(X, y) - - def predict(self, X): - """ - Apply the OMPForestRegressor to X. - - :param X: - :return: - """ - forest_predictions = self._forest_prediction(X) + estimator = RandomForestRegressor(n_estimators=models_parameters.forest_size, + random_state=models_parameters.seed, n_jobs=-1) - if self._models_parameters.normalize_D: - forest_predictions /= self._forest_norms - predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) \ - if self._models_parameters.normalize_weights \ - else self._omp.predict(forest_predictions) + super().__init__(models_parameters, estimator) - return predictions def score(self, X, y, metric=DEFAULT_SCORE_METRIC): """ @@ -79,39 +34,4 @@ class OmpForestRegressor(BaseEstimator): else: raise ValueError("Unsupported metric '{}'.".format(metric)) - return evaluation - - def _train_forest(self, X, y): - self._regressor.fit(X, y) - forest = self._regressor.estimators_ - return forest - - def _extract_subforest(self, X, y): - """ - Given an already estimated regressor: apply OMP to get the weight of each tree. 
- - The X data is used for interrogation of every tree in the forest. The y data - is used for finding the weights in OMP. - - :param X: (n_sample, n_features) array - :param y: (n_sample,) array - :return: - """ - self._logger.debug("Forest make prediction on X") - D = self._forest_prediction(X) - - if self._models_parameters.normalize_D: - # question: maybe consider other kinds of normalization - self._logger.debug("Compute norm of predicted vectors on X") - self._forest_norms = np.linalg.norm(D, axis=0) - D /= self._forest_norms - - omp = OrthogonalMatchingPursuit( - n_nonzero_coefs=self._models_parameters.extracted_forest_size, - fit_intercept=False, normalize=False) - self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees." - .format(self._models_parameters.extracted_forest_size)) - return omp.fit(D, y) - - def _forest_prediction(self, X): - return np.array([tree.predict(X) for tree in self._forest]).T + return evaluation \ No newline at end of file diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index b586914166cf80f274a502d8d44b83f6b6f97484..e615ca8c97facd1b8e033e60eec11c1cc8de922e 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -8,12 +8,26 @@ import numpy as np class Trainer(object): + """ + Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method. 
+ """ def __init__(self, dataset): + """ + + :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes + """ self._dataset = dataset self._logger = LoggerFactory.create(LOG_PATH, __name__) def train(self, model, models_dir): + """ + + :param model: Object with + :param models_dir: Where the results will be saved + :return: + """ + # todo cette fonction ne fait pas que "train", elle choisit le jeu de données, train et evalue le modèle -> nom à changer self._logger.debug('Training model using train set...') begin_time = time.time() @@ -45,16 +59,18 @@ class Trainer(object): ) end_time = time.time() - ModelRawResults( - forest=model.forest, - weights=model.weights, + results = ModelRawResults( + model_object=model, training_time=end_time - begin_time, datetime=datetime.datetime.now(), train_score=model.score(self._dataset.X_train, self._dataset.y_train), dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), test_score=model.score(self._dataset.X_test, self._dataset.y_test), score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way - train_score_regressor=model.score_regressor(self._dataset.X_train, self._dataset.y_train), - dev_score_regressor=model.score_regressor(self._dataset.X_dev, self._dataset.y_dev), - test_score_regressor=model.score_regressor(self._dataset.X_test, self._dataset.y_test) - ).save(models_dir) + train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train), + dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev), + test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test) + ) + results.save(models_dir) + self._logger.info("Base performance on test: {}".format(results.test_score_regressor)) + self._logger.info("Performance on test: {}".format(results.test_score)) diff --git a/code/train.py b/code/train.py index 
4faff6e970da3600cc9f00714759ba2c9df7f73d..73bf6ab688ab647fd1cf4f9e40e234a2a805d703 100644 --- a/code/train.py +++ b/code/train.py @@ -18,9 +18,20 @@ import json def process_job(seed, parameters, experiment_id): + """ + Experiment function. + + Will be used as base function for worker in multithreaded application. + + :param seed: + :param parameters: + :param experiment_id: + :return: + """ logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format( seed, threading.get_ident())) logger.info('seed={}'.format(seed)) + seed_str = str(seed) experiment_id_str = str(experiment_id) models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \ @@ -35,12 +46,12 @@ def process_job(seed, parameters, experiment_id): dataset_normalizer=parameters['dataset_normalizer'] ) dataset_parameters.save(models_dir, experiment_id_str) - dataset = DatasetLoader.load(dataset_parameters) trainer = Trainer(dataset) for extracted_forest_size in parameters['extracted_forest_size']: + # question if training is too long, one may also split experiments for different forest sizes into different workers logger.info('extracted_forest_size={}'.format(extracted_forest_size)) sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) @@ -62,7 +73,7 @@ def process_job(seed, parameters, experiment_id): if __name__ == "__main__": # get environment variables in .env - load_dotenv(find_dotenv('.env.example')) + load_dotenv(find_dotenv('.env')) DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments' DEFAULT_DATASET_NAME = 'boston' @@ -109,6 +120,7 @@ if __name__ == "__main__": logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) + # The number of tree to extract from forest (K) parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \ if type(parameters['extracted_forest_size']) == list \ else [parameters['extracted_forest_size']] @@ 
-116,6 +128,7 @@ if __name__ == "__main__": if parameters['seeds'] != None and parameters['random_seed_number'] > 1: logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') + # Seeds are either provided as parameters or generated at random seeds = parameters['seeds'] if parameters['seeds'] is not None \ else [random.randint(begin_random_seed_range, end_random_seed_range) \ for i in range(parameters['random_seed_number'])]