Skip to content
Snippets Groups Projects
Commit 3f5cdf68 authored by Luc Giffon's avatar Luc Giffon
Browse files

Big changes: Create intermediate classes OMPForest and SingleOmpForest for...

Big changes: Create intermediate classes OMPForest and SingleOmpForest for code factoring: share code between OmpForestRegressor and OmpForestBinaryClassifier. Remove set_weights and set_forest, which are not relevant anymore. The load function from model_factory isn't trustworthy now: it raises an error. TODO: multiclass classifier
parent bc5a83b6
No related branches found
No related tags found
1 merge request!3clean scripts
......@@ -8,6 +8,13 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
def change_binary_func_load(base_load_function):
    """Wrap a sklearn dataset loader so its {0, 1} labels are remapped to {-1, +1}.

    :param base_load_function: loader taking a `return_X_y` argument
        (e.g. `load_breast_cancer`)
    :return: a loader with the same signature whose labels are in {-1, +1}
    """
    def func_load(return_X_y):
        X, y = base_load_function(return_X_y=return_X_y)
        # Incoming labels must be binary {0, 1} before remapping.
        # Fix: the original message wrongly said {-1, +1} while the check is on {0, 1}.
        assert set(y).issubset({0, 1}), "Classes for binary classifier should be {0, 1}"
        y[y == 0] = -1
        return X, y
    return func_load
class DatasetLoader(object):
......@@ -33,7 +40,7 @@ class DatasetLoader(object):
dataset_loading_func = load_wine
task = Task.CLASSIFICATION
elif name == 'breast_cancer':
dataset_loading_func = load_breast_cancer
dataset_loading_func = change_binary_func_load(load_breast_cancer)
task = Task.CLASSIFICATION
elif name == 'olivetti_faces':
dataset_loading_func = fetch_olivetti_faces
......
from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters
......@@ -12,7 +12,7 @@ class ModelFactory(object):
@staticmethod
def build(task, model_parameters):
if task == Task.CLASSIFICATION:
model_func = OmpForestClassifier
model_func = OmpForestBinaryClassifier
elif task == Task.REGRESSION:
model_func = OmpForestRegressor
else:
......@@ -21,8 +21,10 @@ class ModelFactory(object):
@staticmethod
def load(task, directory_path, experiment_id, model_raw_results):
    """Rebuild a previously saved model from its raw results.

    Not implemented yet: restoring the fitted forest and the OMP weights
    onto a freshly built model still has to be worked out.

    :raises NotImplementedError: always, until restoration is implemented
    """
    raise NotImplementedError
    # TODO: restore the model state correctly from model_raw_results here.
    # The old set_forest/set_weights API was removed, so the lines below
    # are kept only as a sketch of what restoration used to look like:
    # model_parameters = ModelParameters.load(directory_path, experiment_id)
    # model = ModelFactory.build(task, model_parameters)
    # model.set_forest(model_raw_results.forest)
    # model.set_weights(model_raw_results.weights)
    # return model
......@@ -6,13 +6,12 @@ import datetime
class ModelRawResults(object):
def __init__(self, forest, weights, training_time,
def __init__(self, model_object, training_time,
datetime, train_score, dev_score, test_score,
score_metric, train_score_regressor, dev_score_regressor,
test_score_regressor):
self._forest = forest
self._weights = weights
self._model_object = model_object
self._training_time = training_time
self._datetime = datetime
self._train_score = train_score
......@@ -24,12 +23,8 @@ class ModelRawResults(object):
self._test_score_regressor = test_score_regressor
@property
def model_object(self):
    """Return the trained model instance stored on this results object."""
    # Fix: the original returned self.model_object — the property itself —
    # causing infinite recursion. The backing attribute set in __init__ is
    # self._model_object.
    return self._model_object
@property
def training_time(self):
......
from abc import abstractmethod, ABCMeta
import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from sklearn.base import BaseEstimator
class OmpForest(BaseEstimator, metaclass=ABCMeta):
    """Abstract base for forests pruned with Orthogonal Matching Pursuit (OMP).

    Wraps a scikit-learn forest estimator and provides the shared plumbing
    (parameters, logging, per-tree predictions) used by concrete subclasses.
    """

    def __init__(self, models_parameters, base_forest_estimator):
        # base_forest_estimator: a sklearn forest (e.g. RandomForestRegressor/Classifier).
        self._base_forest_estimator = base_forest_estimator
        self._models_parameters = models_parameters
        self._logger = LoggerFactory.create(LOG_PATH, __name__)

    @property
    def models_parameters(self):
        # Parameters object configuring both the forest and the OMP step.
        return self._models_parameters

    def score_base_estimator(self, X, y):
        """Score the underlying (unpruned) forest using its native sklearn metric."""
        return self._base_forest_estimator.score(X, y)

    def _base_estimator_predictions(self, X):
        # Build a (n_samples, n_trees) matrix: one column of predictions per tree.
        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T

    @property
    def forest(self):
        # The list of fitted trees of the base forest.
        return self._base_forest_estimator.estimators_

    # sklearn baseestimator api methods
    @abstractmethod
    def fit(self, X_forest, y_forest, X_omp, y_omp):
        pass

    @abstractmethod
    def predict(self, X):
        pass

    @abstractmethod
    def score(self, X, y):
        pass
class SingleOmpForest(OmpForest):
    """Forest pruned by a single OMP problem fitted on the stacked tree predictions."""

    def __init__(self, models_parameters, base_forest_estimator):
        # fit_intercept shouldn't be set to False as the data isn't necessarily centered here
        # normalization is handled outside OMP
        self._omp = OrthogonalMatchingPursuit(
            n_nonzero_coefs=models_parameters.extracted_forest_size,
            fit_intercept=True, normalize=False)
        super().__init__(models_parameters, base_forest_estimator)

    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """Fit the base forest on (X_forest, y_forest), then run OMP on (X_omp, y_omp)."""
        self._base_forest_estimator.fit(X_forest, y_forest)
        self._extract_subforest(X_omp, y_omp)  # type: OrthogonalMatchingPursuit
        return self

    def _extract_subforest(self, X, y):
        """
        Given an already estimated regressor: apply OMP to get the weight of each tree.

        The X data is used for interrogation of every tree in the forest. The y data
        is used for finding the weights in OMP.

        :param X: (n_sample, n_features) array
        :param y: (n_sample,) array
        :return: the fitted OrthogonalMatchingPursuit object
        """
        self._logger.debug("Forest make prediction on X")
        D = self._base_estimator_predictions(X)
        if self._models_parameters.normalize_D:
            # question: maybe consider other kinds of normalization.. centering?
            self._logger.debug("Compute norm of predicted vectors on X")
            self._forest_norms = np.linalg.norm(D, axis=0)
            D /= self._forest_norms
        self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
                           .format(self._models_parameters.extracted_forest_size))
        return self._omp.fit(D, y)

    def predict(self, X):
        """
        Apply the SingleOmpForest to X.

        Make all the base tree predictions then apply the OMP weights for pruning.

        :param X: (n_sample, n_features) array
        :return: predictions array
        """
        forest_predictions = self._base_estimator_predictions(X)

        if self._models_parameters.normalize_D:
            # NOTE(review): assumes fit() also ran with normalize_D enabled,
            # otherwise self._forest_norms is undefined — TODO confirm.
            forest_predictions /= self._forest_norms

        if self._models_parameters.normalize_weights:
            # we can normalize weights (by their sum) so that they sum to 1
            # and they can be interpreted as impact percentages for interpretability.
            # this necessits to remove the (-) in weights, e.g. move it to the predictions (use unsigned_coef)
            # question (translated from French): I don't understand the trick with nonzero?
            # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
            coef_signs = np.sign(self._omp.coef_)[np.newaxis, :]  # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square)
            unsigned_coef = (coef_signs * self._omp.coef_).squeeze()
            intercept = self._omp.intercept_

            adjusted_forest_predictions = forest_predictions * coef_signs
            predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
        else:
            predictions = self._omp.predict(forest_predictions)

        return predictions
\ No newline at end of file
from collections import namedtuple
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
import numpy as np
class OmpForestBinaryClassifier(SingleOmpForest):
    """Binary classifier (labels in {-1, +1}) pruned with OMP on top of a random forest."""

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        estimator = RandomForestClassifier(n_estimators=models_parameters.forest_size,
                                           random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)

    def _check_classes(self, y):
        # Labels must already be remapped to {-1, +1} before fitting.
        assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier should be {-1, +1}"

    def fit(self, X_forest, y_forest, X_omp, y_omp):
        # Validate both splits before delegating to the shared OMP fitting.
        self._check_classes(y_forest)
        self._check_classes(y_omp)
        return super().fit(X_forest, y_forest, X_omp, y_omp)

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate OMPForestClassifer on (`X`, `y`) using `metric`

        :param X:
        :param y:
        :param metric: might be "indicator"
        :return: the evaluation value
        :raises ValueError: on an unsupported metric name
        """
        predictions = self.predict(X)

        if metric == 'indicator':
            # NOTE(review): this expression evaluates to |1 - 2 * error_rate|,
            # which is symmetric (an all-wrong classifier scores the same as an
            # all-right one) — confirm this is the intended "indicator" metric
            # rather than plain accuracy.
            evaluation = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1))
        else:
            raise ValueError("Unsupported metric '{}'.".format(metric))

        return evaluation
class OmpForestMulticlassClassifier(BaseEstimator):
    """Multiclass OMP forest — work in progress: only the constructor exists here,
    no fit/predict/score are implemented yet."""

    def __init__(self, models_parameters):
        self._models_parameters = models_parameters
        # NOTE(review): attribute is named _base_forest_estimators (plural) while
        # OmpForest uses _base_forest_estimator — confirm before relying on it.
        self._base_forest_estimators = RandomForestClassifier(n_estimators=models_parameters.forest_size,
                                                              random_state=models_parameters.seed, n_jobs=-1)
        self._logger = LoggerFactory.create(LOG_PATH, __name__)
class OmpForestClassifier(BaseEstimator):
    """Placeholder classifier: construction always fails because classification
    is not supported yet."""

    def __init__(self):
        # Instantiating this class is always an error.
        raise ValueError('Classification tasks are not supported for now')

    def fit(self, X, y):
        # Unreachable in practice — no instance can ever be created.
        pass
if __name__ == "__main__":
    # Quick smoke demo: fit a small forest on random ±1 labels and print predictions.
    demo_forest = RandomForestClassifier(n_estimators=10)
    features = np.random.rand(10, 5)
    labels = np.random.choice([-1, +1], 10)
    demo_forest.fit(features, labels)
    print(demo_forest.predict(np.random.rand(10, 5)))
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator
import numpy as np
from bolsonaro.models.omp_forest import SingleOmpForest
class OmpForestRegressor(BaseEstimator):
class OmpForestRegressor(SingleOmpForest):
DEFAULT_SCORE_METRIC = 'mse'
def __init__(self, models_parameters):
self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
random_state=models_parameters.seed, n_jobs=-1)
self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__)
@property
def forest(self):
    # Trees of this regressor (set by fit() or injected via set_forest()).
    return self._forest

def set_forest(self, forest):
    # Inject an externally loaded forest — used to restore a saved model
    # (see ModelFactory.load). Also patches the sklearn regressor's trees.
    self._forest = forest
    self._regressor.estimators_ = forest

@property
def weights(self):
    # OMP coefficients, one weight per tree (set by fit() or set_weights()).
    return self._weights

def set_weights(self, weights):
    # Inject externally loaded OMP weights — used to restore a saved model.
    self._weights = weights

@property
def models_parameters(self):
    # Parameters object configuring both the forest and the OMP step.
    return self._models_parameters
def fit(self, X_forest, y_forest, X_omp, y_omp):
    """Fit the forest on (X_forest, y_forest), then derive per-tree weights via
    OMP on (X_omp, y_omp). Returns self for sklearn-style chaining."""
    self._forest = self._train_forest(X_forest, y_forest)
    self._omp = self._extract_subforest(X_omp, y_omp)
    self._weights = self._omp.coef_
    return self
def score_regressor(self, X, y):
    # Native sklearn score of the underlying full (unpruned) random forest.
    return self._regressor.score(X, y)
def predict(self, X):
    """
    Apply the OMPForestRegressor to X.

    :param X: (n_sample, n_features) array
    :return: predictions array
    """
    # Fix: the diff rendering had spliced lines belonging to the new __init__
    # (estimator construction and super().__init__) into the middle of this
    # method, where `models_parameters` is undefined — they are removed here.
    forest_predictions = self._forest_prediction(X)

    if self._models_parameters.normalize_D:
        forest_predictions /= self._forest_norms

    # NOTE(review): len(np.nonzero(coef_)) is the length of a tuple (always 1
    # for a 1-D coef_), so the factor reduces to 1 / sum(coef_) — confirm this
    # is the intended weight normalization.
    predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) \
        if self._models_parameters.normalize_weights \
        else self._omp.predict(forest_predictions)

    return predictions
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
"""
......@@ -79,39 +34,4 @@ class OmpForestRegressor(BaseEstimator):
else:
raise ValueError("Unsupported metric '{}'.".format(metric))
return evaluation
def _train_forest(self, X, y):
    """Fit the underlying random forest on (X, y) and return its fitted trees."""
    self._regressor.fit(X, y)
    return self._regressor.estimators_
def _extract_subforest(self, X, y):
    """
    Given an already estimated regressor: apply OMP to get the weight of each tree.

    The X data is used for interrogation of every tree in the forest. The y data
    is used for finding the weights in OMP.

    NOTE(review): this duplicates SingleOmpForest._extract_subforest except that
    fit_intercept is False here (True there) — confirm which variant is intended.

    :param X: (n_sample, n_features) array
    :param y: (n_sample,) array
    :return: the fitted OrthogonalMatchingPursuit object
    """
    self._logger.debug("Forest make prediction on X")
    D = self._forest_prediction(X)
    if self._models_parameters.normalize_D:
        # question: maybe consider other kinds of normalization
        self._logger.debug("Compute norm of predicted vectors on X")
        self._forest_norms = np.linalg.norm(D, axis=0)
        D /= self._forest_norms
    omp = OrthogonalMatchingPursuit(
        n_nonzero_coefs=self._models_parameters.extracted_forest_size,
        fit_intercept=False, normalize=False)
    self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
                       .format(self._models_parameters.extracted_forest_size))
    return omp.fit(D, y)
def _forest_prediction(self, X):
    """Return a (n_samples, n_trees) matrix with one column of predictions per tree."""
    per_tree = [tree.predict(X) for tree in self._forest]
    return np.transpose(np.array(per_tree))
return evaluation
\ No newline at end of file
......@@ -8,12 +8,26 @@ import numpy as np
class Trainer(object):
"""
Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method.
"""
def __init__(self, dataset):
    """
    :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes
    """
    # The dataset is kept as-is; splits are read later by train().
    self._dataset = dataset
    self._logger = LoggerFactory.create(LOG_PATH, __name__)
def train(self, model, models_dir):
"""
:param model: Object with
:param models_dir: Where the results will be saved
:return:
"""
# todo cette fonction ne fait pas que "train", elle choisit le jeu de données, train et evalue le modèle -> nom à changer
self._logger.debug('Training model using train set...')
begin_time = time.time()
......@@ -45,16 +59,18 @@ class Trainer(object):
)
end_time = time.time()
ModelRawResults(
forest=model.forest,
weights=model.weights,
results = ModelRawResults(
model_object=model,
training_time=end_time - begin_time,
datetime=datetime.datetime.now(),
train_score=model.score(self._dataset.X_train, self._dataset.y_train),
dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
test_score=model.score(self._dataset.X_test, self._dataset.y_test),
score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
train_score_regressor=model.score_regressor(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=model.score_regressor(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=model.score_regressor(self._dataset.X_test, self._dataset.y_test)
).save(models_dir)
train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test)
)
results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_regressor))
self._logger.info("Performance on test: {}".format(results.test_score))
......@@ -18,9 +18,20 @@ import json
def process_job(seed, parameters, experiment_id):
"""
Experiment function.
Will be used as base function for worker in multithreaded application.
:param seed:
:param parameters:
:param experiment_id:
:return:
"""
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
seed, threading.get_ident()))
logger.info('seed={}'.format(seed))
seed_str = str(seed)
experiment_id_str = str(experiment_id)
models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
......@@ -35,12 +46,12 @@ def process_job(seed, parameters, experiment_id):
dataset_normalizer=parameters['dataset_normalizer']
)
dataset_parameters.save(models_dir, experiment_id_str)
dataset = DatasetLoader.load(dataset_parameters)
trainer = Trainer(dataset)
for extracted_forest_size in parameters['extracted_forest_size']:
# question if training is too long, one may also split experiments for different forest sizes into different workers
logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
......@@ -62,7 +73,7 @@ def process_job(seed, parameters, experiment_id):
if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv('.env.example'))
load_dotenv(find_dotenv('.env'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
DEFAULT_DATASET_NAME = 'boston'
......@@ -109,6 +120,7 @@ if __name__ == "__main__":
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
# The number of tree to extract from forest (K)
parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
if type(parameters['extracted_forest_size']) == list \
else [parameters['extracted_forest_size']]
......@@ -116,6 +128,7 @@ if __name__ == "__main__":
if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
# Seeds are either provided as parameters or generated at random
seeds = parameters['seeds'] if parameters['seeds'] is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(parameters['random_seed_number'])]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment