Skip to content
Snippets Groups Projects
Commit 5e50bbaa authored by Luc Giffon
Browse files

Add multiclass classifier (warning: there is a bug in the score computation)

parent 065988a4
No related branches found
No related tags found
1 merge request: !3 "clean scripts"
......@@ -8,11 +8,15 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from bolsonaro.utils import binarize_class_data
def change_binary_func_load(base_load_function):
    """
    Wrap a dataset loader so that its two class labels are re-encoded as -1 / +1.

    :param base_load_function: loader called as ``base_load_function(return_X_y=...)``;
        its result is unpacked as an ``(X, y)`` pair
    :return: a function ``func_load(return_X_y)`` returning ``(X, y)`` where the last
        label in sorted order has been replaced by +1 and the other one by -1
    """
    def func_load(return_X_y):
        # NOTE(review): the result is unpacked as a pair, so this presumably only
        # works when the loader is asked for (X, y), i.e. return_X_y=True -- confirm.
        X, y = base_load_function(return_X_y=return_X_y)
        possible_classes = sorted(set(y))
        assert len(possible_classes) == 2, "Function change binary_func_load only work for binary classfication"
        # binarize_class_data modifies its argument in place by default and returns it.
        y = binarize_class_data(y, possible_classes[-1])
        return X, y
    return func_load
......@@ -26,13 +30,13 @@ class DatasetLoader(object):
task = Task.REGRESSION
elif name == 'iris':
dataset_loading_func = load_iris
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'diabetes':
dataset_loading_func = load_diabetes
task = Task.REGRESSION
elif name == 'digits':
dataset_loading_func = load_digits
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'linnerud':
dataset_loading_func = load_linnerud
task = Task.REGRESSION
......
......@@ -2,5 +2,6 @@ from enum import Enum
class Task(Enum):
    """Kind of learning problem, used to pick a dataset handling and a model class."""

    BINARYCLASSIFICATION = 1
    # Legacy name kept for backward compatibility: it shares the value 1, so it is
    # an alias that resolves to the BINARYCLASSIFICATION member.
    CLASSIFICATION = 1
    REGRESSION = 2
    MULTICLASSIFICATION = 3
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters
......@@ -11,10 +11,12 @@ class ModelFactory(object):
@staticmethod
def build(task, model_parameters):
    """
    Instantiate the OMP forest model that corresponds to ``task``.

    :param task: a ``Task`` member selecting the model class
    :param model_parameters: passed unchanged to the selected model constructor
    :return: a new instance of the selected model class
    :raises ValueError: if ``task`` is none of the supported tasks
    """
    if task == Task.BINARYCLASSIFICATION:
        model_func = OmpForestBinaryClassifier
    elif task == Task.REGRESSION:
        model_func = OmpForestRegressor
    elif task == Task.MULTICLASSIFICATION:
        model_func = OmpForestMulticlassClassifier
    else:
        raise ValueError("Unsupported task '{}'".format(task))
    return model_func(model_parameters)
......
......@@ -30,28 +30,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
return self._base_forest_estimator.estimators_
# sklearn baseestimator api methods
@abstractmethod
def fit(self, X_forest, y_forest, X_omp, y_omp):
    """Abstract hook without default behaviour; concrete subclasses must override it."""
@abstractmethod
def predict(self, X):
    """Abstract hook without default behaviour; concrete subclasses must override it."""
@abstractmethod
def score(self, X, y):
    """Abstract hook without default behaviour; concrete subclasses must override it."""
class SingleOmpForest(OmpForest):
def __init__(self, models_parameters, base_forest_estimator):
    """
    Create the single OMP solver, then delegate the rest of the setup to the parent.

    :param models_parameters: object whose ``extracted_forest_size`` sets the number
        of non-zero coefficients of the OMP solver
    :param base_forest_estimator: forwarded unchanged to the parent constructor
    """
    n_selected = models_parameters.extracted_forest_size
    # The intercept stays enabled because the data is not necessarily centered here;
    # normalization is handled outside OMP.
    self._omp = OrthogonalMatchingPursuit(
        n_nonzero_coefs=n_selected,
        fit_intercept=True,
        normalize=False,
    )
    super().__init__(models_parameters, base_forest_estimator)
def fit(self, X_forest, y_forest, X_omp, y_omp):
self._base_forest_estimator.fit(X_forest, y_forest)
self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit
......@@ -80,7 +58,53 @@ class SingleOmpForest(OmpForest):
self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
.format(self._models_parameters.extracted_forest_size))
return self._omp.fit(D, y)
self.fit_omp(D, y)
@staticmethod
def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
    """
    Combine base predictions with the coefficients of a fitted OMP object.

    :param base_predictions: array with one column per OMP coefficient
    :param omp_obj: fitted object exposing ``predict`` and, for the
        ``normalize_weights`` path, ``coef_`` and ``intercept_``
    :param normalize_weights: if False, simply return ``omp_obj.predict(base_predictions)``;
        if True, move the sign of each coefficient onto the matching column of
        ``base_predictions`` and combine the columns with the absolute coefficients
    :return: the weighted predictions
    """
    if not normalize_weights:
        return omp_obj.predict(base_predictions)

    # The intent is to get non-negative weights that could be read as impact
    # percentages once divided by their sum.
    # NOTE(review): no division by the sum of the weights happens below; only the
    # signs are moved from the weights to the predictions.
    # Open question from the original author: the role of the non-zero count in an
    # earlier rescaling attempt, predict(...) * (1 / (sum(coef_) / len(nonzero(coef_)))),
    # was not understood.
    # The extra leading axis makes the signs broadcast row-wise, which avoids any
    # ambiguity when base_predictions is square.
    signs = np.sign(omp_obj.coef_)[np.newaxis, :]
    magnitudes = (signs * omp_obj.coef_).squeeze()
    offset = omp_obj.intercept_
    sign_adjusted = base_predictions * signs
    return sign_adjusted.dot(magnitudes) + offset
@abstractmethod
def fit_omp(self, atoms, objective):
    """Abstract hook without default behaviour; concrete subclasses must override it."""
@abstractmethod
def predict(self, X):
    """Abstract hook without default behaviour; concrete subclasses must override it."""
@abstractmethod
def score(self, X, y):
    """Abstract hook without default behaviour; concrete subclasses must override it."""
class SingleOmpForest(OmpForest):
def __init__(self, models_parameters, base_forest_estimator):
    """
    Create the single OMP solver, then delegate the rest of the setup to the parent.

    :param models_parameters: object whose ``extracted_forest_size`` sets the number
        of non-zero coefficients of the OMP solver
    :param base_forest_estimator: forwarded unchanged to the parent constructor
    """
    n_selected = models_parameters.extracted_forest_size
    # The intercept stays enabled because the data is not necessarily centered here;
    # normalization is handled outside OMP.
    self._omp = OrthogonalMatchingPursuit(
        n_nonzero_coefs=n_selected,
        fit_intercept=True,
        normalize=False,
    )
    super().__init__(models_parameters, base_forest_estimator)
def fit_omp(self, atoms, objective):
    """
    Fit this instance's single OMP solver.

    :param atoms: first argument handed to the solver's ``fit``
    :param objective: second argument handed to the solver's ``fit``
    :return: None (the value returned by the solver's ``fit`` is discarded)
    """
    solver = self._omp
    solver.fit(atoms, objective)
def predict(self, X):
"""
......@@ -96,21 +120,4 @@ class SingleOmpForest(OmpForest):
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
if self._models_parameters.normalize_weights:
# we can normalize weights (by their sum) so that they sum to 1
# and they can be interpreted as impact percentages for interpretability.
# this necessits to remove the (-) in weights, e.g. move it to the predictions (use unsigned_coef)
# question: je comprend pas le truc avec nonszero?
# predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
coef_signs = np.sign(self._omp.coef_)[np.newaxis, :] # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square)
unsigned_coef = (coef_signs * self._omp.coef_).squeeze()
intercept = self._omp.intercept_
adjusted_forest_predictions = forest_predictions * coef_signs
predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
else:
predictions = self._omp.predict(forest_predictions)
return predictions
\ No newline at end of file
return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
\ No newline at end of file
from collections import namedtuple
from copy import deepcopy
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
......@@ -9,6 +10,9 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
import numpy as np
from bolsonaro.utils import binarize_class_data
class OmpForestBinaryClassifier(SingleOmpForest):
DEFAULT_SCORE_METRIC = 'indicator'
......@@ -47,12 +51,59 @@ class OmpForestBinaryClassifier(SingleOmpForest):
return evaluation
class OmpForestMulticlassClassifier(OmpForest):
    """
    Multiclass OMP forest built from one OMP solver per class label.

    Each solver is fitted on data binarized to -1 / +1 for its own label, and
    ``predict`` returns, for every sample, the label whose solver gives the
    largest output.
    """

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        """
        :param models_parameters: object providing ``forest_size`` and ``seed`` for the
            random forest; also forwarded to the parent constructor
        """
        estimator = RandomForestClassifier(n_estimators=models_parameters.forest_size,
                                           random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)
        # Open question from the original author: maybe the OMP solvers should be
        # created here in __init__, as SingleOmpForest does.
        self._dct_class_omp = {}

    def fit_omp(self, atoms, objective):
        """
        Fit one OMP solver per distinct label found in ``objective``.

        :param atoms: values binarized against each label and used as solver input
        :param objective: class labels; binarized against each label and used as target
        :return: the dict mapping each class label to its fitted solver
        :raises AssertionError: if solvers have already been fitted on this instance
        """
        assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__)

        possible_classes = sorted(set(objective))
        for class_label in possible_classes:
            # inplace=False: the same inputs are reused for every class label
            atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
            objective_binary = binarize_class_data(objective, class_label, inplace=False)
            # TODO: maybe treat the extracted forest size as a global budget, so that
            # only a fraction of it is available to each OMP solver.
            omp_class = OrthogonalMatchingPursuit(
                n_nonzero_coefs=self._models_parameters.extracted_forest_size,
                fit_intercept=True, normalize=False)
            omp_class.fit(atoms_binary, objective_binary)
            self._dct_class_omp[class_label] = omp_class
        return self._dct_class_omp

    def predict(self, X):
        """
        Predict one class label per sample of ``X``.

        :param X: samples handed to ``self._base_estimator_predictions``
        :return: array holding, for each sample, the label with the largest solver output
        """
        forest_predictions = self._base_estimator_predictions(X)

        if self._models_parameters.normalize_D:
            forest_predictions /= self._forest_norms

        label_names = []
        preds = []
        for class_label, omp_class in self._dct_class_omp.items():
            label_names.append(class_label)
            # NOTE(review): the original author flagged this step as possibly buggy.
            # When normalize_D is set, the values compared with class_label have
            # already been divided by the forest norms -- confirm this is intended.
            atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False)
            preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))

        # One row per sample, one column per class label (same order as label_names).
        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate the predictions made on ``X`` against the labels ``y``.

        :param X: samples to predict
        :param y: expected class labels, one per sample
        :param metric: only 'indicator' is supported: the fraction of samples whose
            predicted label equals the expected label
        :return: the score value
        :raises ValueError: if ``metric`` is not supported
        """
        if metric != 'indicator':
            raise ValueError("Unsupported metric '{}'.".format(metric))

        predictions = self.predict(X)
        # Labels are arbitrary class identifiers, so compare them for equality;
        # a sign()-based formula is only meaningful for -1 / +1 targets.
        evaluation = np.mean(predictions == np.asarray(y))
        return evaluation
......
import os
import json
import pickle
from copy import deepcopy
def resolve_experiment_id(models_dir):
......@@ -45,3 +46,21 @@ def load_obj_from_pickle(file_path, constructor):
with open(file_path, 'rb') as input_file:
parameters = pickle.load(input_file)
return constructor(**parameters)
def binarize_class_data(data, class_pos, inplace=True):
    """
    Replace class_pos by +1 and ~class_pos by -1.

    Containers whose ``==`` compares element-wise (e.g. numpy arrays) are updated
    through boolean-mask assignment. Flat mutable sequences such as lists, whose
    ``==`` yields a single bool, are updated item by item instead.

    :param data: an array of classes
    :param class_pos: the positive class to be replaced by +1
    :param inplace: If True, modify data in place (still return it, also)
    :return: data (or its deep copy when inplace is False) with every entry
        replaced by -1 or +1
    """
    if not inplace:
        data = deepcopy(data)

    # Computed once, before any write, so that class_pos == -1 is handled correctly.
    position_class_labels = (data == class_pos)

    if isinstance(position_class_labels, bool):
        # The comparison did not broadcast (plain list): indexing with ~False / False
        # would overwrite data[-1] and data[0] only, so walk the items instead.
        for index, label in enumerate(data):
            data[index] = +1 if label == class_pos else -1
        return data

    data[~(position_class_labels)] = -1
    data[(position_class_labels)] = +1
    return data
\ No newline at end of file
......@@ -72,7 +72,7 @@ def process_job(seed, parameters, experiment_id):
logger.info('Training done')
if __name__ == "__main__":
# get environment variables in .env
# get environment variables in .env (not .env.example... this is for the git, and the .env is local)
load_dotenv(find_dotenv('.env'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment