diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py index dd11da7f322ebce2d8ac62fabf180a86d0978046..102026ff0f73832881b5b037b0f26f77c92d8870 100644 --- a/code/bolsonaro/data/dataset_loader.py +++ b/code/bolsonaro/data/dataset_loader.py @@ -8,11 +8,15 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ from sklearn.model_selection import train_test_split from sklearn import preprocessing +from bolsonaro.utils import binarize_class_data + + def change_binary_func_load(base_load_function): def func_load(return_X_y): X, y = base_load_function(return_X_y=return_X_y) - assert len(set(y).difference({0, 1})) == 0, "Classes for binary classifier should be {-1, +1}" - y[y==0] = -1 + possible_classes = sorted(set(y)) + assert len(possible_classes) == 2, "Function change binary_func_load only work for binary classfication" + y = binarize_class_data(y, possible_classes[-1]) return X, y return func_load @@ -26,13 +30,13 @@ class DatasetLoader(object): task = Task.REGRESSION elif name == 'iris': dataset_loading_func = load_iris - task = Task.CLASSIFICATION + task = Task.MULTICLASSIFICATION elif name == 'diabetes': dataset_loading_func = load_diabetes task = Task.REGRESSION elif name == 'digits': dataset_loading_func = load_digits - task = Task.CLASSIFICATION + task = Task.MULTICLASSIFICATION elif name == 'linnerud': dataset_loading_func = load_linnerud task = Task.REGRESSION diff --git a/code/bolsonaro/data/task.py b/code/bolsonaro/data/task.py index 2f47fa22f472f769c075f40e1c25a7bf3de45f0d..f1214a64a27873e49f5dbbcb853e4f65f9b07f68 100644 --- a/code/bolsonaro/data/task.py +++ b/code/bolsonaro/data/task.py @@ -2,5 +2,6 @@ from enum import Enum class Task(Enum): - CLASSIFICATION = 1 + BINARYCLASSIFICATION = 1 REGRESSION = 2 + MULTICLASSIFICATION = 3 diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py index 1fa46385a884d82b74b44a5b8227b5b3dbfb0286..2dc578cfaacc99f9fea17b9ae8e64cc08e3038dc 100644 --- a/code/bolsonaro/models/model_factory.py +++ b/code/bolsonaro/models/model_factory.py @@ -1,4 +1,4 @@ -from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier +from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.data.task import Task from bolsonaro.models.model_parameters import ModelParameters @@ -11,10 +11,12 @@ class ModelFactory(object): @staticmethod def build(task, model_parameters): - if task == Task.CLASSIFICATION: + if task == Task.BINARYCLASSIFICATION: model_func = OmpForestBinaryClassifier elif task == Task.REGRESSION: model_func = OmpForestRegressor + elif task == Task.MULTICLASSIFICATION: + model_func = OmpForestMulticlassClassifier else: raise ValueError("Unsupported task '{}'".format(task)) return model_func(model_parameters) diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py index 1962d78eaa670035d9cb4bd283d513aa700e7d84..0c33f09dd07142cfc9f94cee500be3ed8c795fba 100644 --- a/code/bolsonaro/models/omp_forest.py +++ b/code/bolsonaro/models/omp_forest.py @@ -30,28 +30,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): return self._base_forest_estimator.estimators_ # sklearn baseestimator api methods - @abstractmethod - def fit(self, X_forest, y_forest, X_omp, y_omp): - pass - - @abstractmethod - def predict(self, X): - pass - - @abstractmethod - def score(self, X, y): - pass - -class SingleOmpForest(OmpForest): - def __init__(self, models_parameters, base_forest_estimator): - # fit_intercept shouldn't be set to False as the data isn't necessarily centered here - # normalization is handled outsite OMP - self._omp = OrthogonalMatchingPursuit( - n_nonzero_coefs=models_parameters.extracted_forest_size, - fit_intercept=True, normalize=False) - - super().__init__(models_parameters, base_forest_estimator) - def fit(self, X_forest, y_forest, X_omp, y_omp): self._base_forest_estimator.fit(X_forest, y_forest) self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit @@ -80,7 +58,53 @@ class SingleOmpForest(OmpForest): self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees." .format(self._models_parameters.extracted_forest_size)) - return self._omp.fit(D, y) + self.fit_omp(D, y) + + @staticmethod + def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False): + if normalize_weights: + # we can normalize weights (by their sum) so that they sum to 1 + # and they can be interpreted as impact percentages for interpretability. + # this necessits to remove the (-) in weights, e.g. move it to the predictions (use unsigned_coef) + + # question: je comprend pas le truc avec nonszero? + # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) + coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :] # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square) + unsigned_coef = (coef_signs * omp_obj.coef_).squeeze() + intercept = omp_obj.intercept_ + + adjusted_forest_predictions = base_predictions * coef_signs + predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept + + else: + predictions = omp_obj.predict(base_predictions) + + return predictions + + @abstractmethod + def fit_omp(self, atoms, objective): + pass + + @abstractmethod + def predict(self, X): + pass + + @abstractmethod + def score(self, X, y): + pass + +class SingleOmpForest(OmpForest): + def __init__(self, models_parameters, base_forest_estimator): + # fit_intercept shouldn't be set to False as the data isn't necessarily centered here + # normalization is handled outsite OMP + self._omp = OrthogonalMatchingPursuit( + n_nonzero_coefs=models_parameters.extracted_forest_size, + fit_intercept=True, normalize=False) + + super().__init__(models_parameters, base_forest_estimator) + + def fit_omp(self, atoms, objective): + self._omp.fit(atoms, objective) def predict(self, X): """ @@ -96,21 +120,4 @@ class SingleOmpForest(OmpForest): if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms - if self._models_parameters.normalize_weights: - # we can normalize weights (by their sum) so that they sum to 1 - # and they can be interpreted as impact percentages for interpretability. - # this necessits to remove the (-) in weights, e.g. move it to the predictions (use unsigned_coef) - - # question: je comprend pas le truc avec nonszero? - # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) - coef_signs = np.sign(self._omp.coef_)[np.newaxis, :] # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square) - unsigned_coef = (coef_signs * self._omp.coef_).squeeze() - intercept = self._omp.intercept_ - - adjusted_forest_predictions = forest_predictions * coef_signs - predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept - - else: - predictions = self._omp.predict(forest_predictions) - - return predictions \ No newline at end of file + return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights) \ No newline at end of file diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py index fb602ce401c086a2dba77714bc6530d69df10898..02c993c63fa72dd78d65a075068ae1b91c22ef8e 100644 --- a/code/bolsonaro/models/omp_forest_classifier.py +++ b/code/bolsonaro/models/omp_forest_classifier.py @@ -1,4 +1,5 @@ from collections import namedtuple +from copy import deepcopy from sklearn.base import BaseEstimator from sklearn.ensemble import RandomForestClassifier @@ -9,6 +10,9 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest import numpy as np +from bolsonaro.utils import binarize_class_data + + class OmpForestBinaryClassifier(SingleOmpForest): DEFAULT_SCORE_METRIC = 'indicator' @@ -47,12 +51,59 @@ class OmpForestBinaryClassifier(SingleOmpForest): return evaluation -class OmpForestMulticlassClassifier(BaseEstimator): +class OmpForestMulticlassClassifier(OmpForest): + DEFAULT_SCORE_METRIC = 'indicator' def __init__(self, models_parameters): - self._models_parameters = models_parameters - self._base_forest_estimators = RandomForestClassifier(n_estimators=models_parameters.forest_size, + estimator = RandomForestClassifier(n_estimators=models_parameters.forest_size, random_state=models_parameters.seed, n_jobs=-1) - self._logger = LoggerFactory.create(LOG_PATH, __name__) + super().__init__(models_parameters, estimator) + # question: peut-être initialiser les omps dans le __init__? comme pour le SingleOmpForest + self._dct_class_omp = {} + + def fit_omp(self, atoms, objective): + assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__) + possible_classes = sorted(set(objective)) + for class_label in possible_classes: + atoms_binary = binarize_class_data(atoms, class_label, inplace=False) + objective_binary = binarize_class_data(objective, class_label, inplace=False) + # todo peut etre considérer que la taille de forêt est globale et donc seulement une fraction est disponible pour chaque OMP... + omp_class = OrthogonalMatchingPursuit( + n_nonzero_coefs=self.models_parameters.extracted_forest_size, + fit_intercept=True, normalize=False) + omp_class.fit(atoms_binary, objective_binary) + self._dct_class_omp[class_label] = omp_class + return self._dct_class_omp + + def predict(self, X): + forest_predictions = self._base_estimator_predictions(X) + + if self._models_parameters.normalize_D: + forest_predictions /= self._forest_norms + + label_names = [] + preds = [] + for class_label, omp_class in self._dct_class_omp.items(): + label_names.append(class_label) + atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False) + preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights)) + + # todo verifier que ce n'est pas bugué ici + + preds = np.array(preds).T + max_preds = np.argmax(preds, axis=1) + return np.array(label_names)[max_preds] + + + def score(self, X, y, metric=DEFAULT_SCORE_METRIC): + predictions = self.predict(X) + + if metric == 'indicator': + # todo corriger bug ici + evaluation = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1)) + else: + raise ValueError("Unsupported metric '{}'.".format(metric)) + + return evaluation diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py index 82e501878ba06320914230096213d2d28548e4dc..21c7f72ac9173caf2cf1b5ccbbe6dde61193d1aa 100644 --- a/code/bolsonaro/utils.py +++ b/code/bolsonaro/utils.py @@ -1,6 +1,7 @@ import os import json import pickle +from copy import deepcopy def resolve_experiment_id(models_dir): @@ -45,3 +46,21 @@ def load_obj_from_pickle(file_path, constructor): with open(file_path, 'rb') as input_file: parameters = pickle.load(input_file) return constructor(**parameters) + +def binarize_class_data(data, class_pos, inplace=True): + """ + Replace class_pos by +1 and ~class_pos by -1. + + :param data: an array of classes + :param class_pos: the positive class to be replaced by +1 + :param inplace: If True, modify data in place (still return it, also) + :return: + """ + if not inplace: + data = deepcopy(data) + + position_class_labels = (data == class_pos) + data[~(position_class_labels)] = -1 + data[(position_class_labels)] = +1 + + return data \ No newline at end of file diff --git a/code/train.py b/code/train.py index 73bf6ab688ab647fd1cf4f9e40e234a2a805d703..6cd7dd10623d3d60d676705e60c9478132b56547 100644 --- a/code/train.py +++ b/code/train.py @@ -72,7 +72,7 @@ def process_job(seed, parameters, experiment_id): logger.info('Training done') if __name__ == "__main__": - # get environment variables in .env + # get environment variables in .env (not .env.example... this is for the git, and the .env is local) load_dotenv(find_dotenv('.env')) DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'