Commit 43486a13 authored by Léo Bouscarrat

Merge
parents dcff5e38 a0f7c96f
1 merge request: !3 clean scripts
Showing with 564 additions and 200 deletions
# Environment variables go here; they can be read by the `python-dotenv` package:
#
# `src/script.py`
# ----------------------------------------------------------------
# import os
# import dotenv
#
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
# dotenv_path = os.path.join(project_dir, '.env')
# dotenv.load_dotenv(dotenv_path)
# ----------------------------------------------------------------
project_dir = "."
@@ -371,3 +371,6 @@ TSWLatexianTemp*
*.lpz
reports/*.pdf
# Image
*.png
* Find relevant datasets
* Train and test forests of different sizes
* Train and test on regression and classification
* Train and test on different modalities (not only image datasets)
* Train with different hyperparameters (d, depth, epsilon)
* Apply OMP with different values of k (notably a small k)
* Produce the figures
* Implement and compare the competing systems
* Fix pickle loading of ModelRawResults, because saving the model_object leads to import issues.
* Fix the ModelFactory.load function.
* Fix model results loading in compute_results.py.
* Check that the OMP multiclass classifier works as expected.
* In the bayesian search computation, output a different file name depending on the task of the trained model.
* Check the best-params scores of the regressors (neg_mean_squared_error leads to huge negative values).
* Prepare the json experiment files to run.
import os
LOG_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'log'))
class Dataset(object):
def __init__(self, task, dataset_parameters, X_train, X_dev, X_test, y_train,
def __init__(self, task, X_train, X_dev, X_test, y_train,
y_dev, y_test):
self._task = task
self._dataset_parameters = dataset_parameters
self._X_train = X_train
self._X_dev = X_dev
self._X_test = X_test
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load
from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import load_boston, load_iris, load_diabetes, \
load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
@@ -19,45 +21,46 @@ class DatasetLoader(object):
task = Task.REGRESSION
elif name == 'iris':
dataset_loading_func = load_iris
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'diabetes':
dataset_loading_func = load_diabetes
task = Task.REGRESSION
elif name == 'digits':
dataset_loading_func = load_digits
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'linnerud':
dataset_loading_func = load_linnerud
task = Task.REGRESSION
elif name == 'wine':
dataset_loading_func = load_wine
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'breast_cancer':
dataset_loading_func = load_breast_cancer
task = Task.CLASSIFICATION
elif name == 'olivetti_faces':
dataset_loading_func = change_binary_func_load(load_breast_cancer)
task = Task.BINARYCLASSIFICATION
elif name == 'olivetti_faces': # bug (no return X_y)
dataset_loading_func = fetch_olivetti_faces
task = Task.CLASSIFICATION
elif name == '20newsgroups':
task = Task.MULTICLASSIFICATION
elif name == '20newsgroups': # bug (no return X_y)
dataset_loading_func = fetch_20newsgroups
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == '20newsgroups_vectorized':
dataset_loading_func = fetch_20newsgroups_vectorized
task = Task.CLASSIFICATION
elif name == 'lfw_people':
task = Task.MULTICLASSIFICATION
elif name == 'lfw_people': # needs PIL (image dataset)
dataset_loading_func = fetch_lfw_people
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'lfw_pairs':
dataset_loading_func = fetch_lfw_pairs
task = Task.MULTICLASSIFICATION
elif name == 'covtype':
dataset_loading_func = fetch_covtype
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'rcv1':
dataset_loading_func = fetch_rcv1
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'kddcup99':
dataset_loading_func = fetch_kddcup99
task = Task.CLASSIFICATION
task = Task.MULTICLASSIFICATION
elif name == 'california_housing':
dataset_loading_func = fetch_california_housing
task = Task.REGRESSION
@@ -87,5 +90,5 @@ class DatasetLoader(object):
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)
return Dataset(task, dataset_parameters, X_train,
return Dataset(task, X_train,
X_dev, X_test, y_train, y_dev, y_test)
@@ -15,11 +15,11 @@ class DatasetParameters(object):
@property
def name(self):
return self._name
@property
def test_size(self):
return self._test_size
@property
def dev_size(self):
return self._dev_size
@@ -2,5 +2,6 @@ from enum import Enum
class Task(Enum):
CLASSIFICATION = 1
BINARYCLASSIFICATION = 1
REGRESSION = 2
MULTICLASSIFICATION = 3
'''
This module is used to find the best hyperparameters for a given dataset.
'''
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from . import LOG_PATH
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from skopt import BayesSearchCV
class HyperparameterSearcher(object):
def __init__(self):
self._logger = LoggerFactory.create(LOG_PATH, __name__)
def search(self, dataset, hyperparameter_space, n_iter, cv,
random_seed, scorer, verbose=False):
'''
For a given dataset and hyperparameter space, performs a
Bayesian hyperparameter search.
:input dataset: a Dataset object
:input hyperparameter_space: a dictionary; keys are hyperparameters,
values their spaces defined with skopt
:input n_iter: the number of iterations of the Bayesian search
:input cv: the number of cross-validation folds
:input random_seed: int, the seed for the Bayesian search
:input scorer: str, the name of the scorer
:input verbose: bool, print the state of the search
:return: a skopt.searchcv.BayesSearchCV object
'''
if dataset.task == Task.REGRESSION:
estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed)
else:
estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter,
cv=cv, n_jobs=-1, random_state=random_seed,
scoring=scorer, verbose=verbose)
opt.fit(dataset.X_train, dataset.y_train)
return opt
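A minimal usage sketch (illustrative, not part of this commit; it assumes a `dataset` already loaded through `DatasetLoader` and mirrors the search-space style of the script further below):

from skopt.space import Categorical, Integer

space = {'n_estimators': Integer(10, 1000),
         'max_depth': Integer(1, 20),
         'max_features': Categorical(['auto', 'sqrt', 'log2'])}
searcher = HyperparameterSearcher()
opt = searcher.search(dataset, space, n_iter=50, cv=3,
                      random_seed=42, scorer='neg_mean_squared_error')
print(opt.best_params_, opt.best_score_)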
import os
LOG_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'log'))
from bolsonaro.models.omp_forest_classifier import OmpForestClassifier
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters
@@ -11,18 +11,22 @@ class ModelFactory(object):
@staticmethod
def build(task, model_parameters):
if task == Task.CLASSIFICATION:
model_func = OmpForestClassifier
if task == Task.BINARYCLASSIFICATION:
model_func = OmpForestBinaryClassifier
elif task == Task.REGRESSION:
model_func = OmpForestRegressor
elif task == Task.MULTICLASSIFICATION:
model_func = OmpForestMulticlassClassifier
else:
raise ValueError("Unsupported task '{}'".format(task))
return model_func(model_parameters)
@staticmethod
def load(task, directory_path, experiment_id, model_raw_results):
raise NotImplementedError
model_parameters = ModelParameters.load(directory_path, experiment_id)
model = ModelFactory.build(task, model_parameters)
model.set_forest(model_raw_results.forest)
model.set_weights(model_raw_results.weights)
# TODO: do what is needed here to restore the model correctly
model.set_forest(model_raw_results.model_object.forest)
model.set_weights(model_raw_results.model_object.weights)
return model
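A hedged sketch of how the factory is meant to be driven (the values are illustrative; the constructor signature follows ModelParameters below):

params = ModelParameters(extracted_forest_size=10, normalize_D=True,
                         subsets_used='train,dev', normalize_weights=False,
                         seed=42, hyperparameters={'n_estimators': 100})
model = ModelFactory.build(Task.REGRESSION, params)  # returns an OmpForestRegressor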
@@ -5,16 +5,13 @@ import os
class ModelParameters(object):
def __init__(self, forest_size, extracted_forest_size, normalize_D, use_dev_subset, seed=None):
self._forest_size = forest_size
def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters):
self._extracted_forest_size = extracted_forest_size
self._normalize_D = normalize_D
self._use_dev_subset = use_dev_subset
self._subsets_used = subsets_used
self._normalize_weights = normalize_weights
self._seed = seed
self._hyperparameters = hyperparameters
@property
def forest_size(self):
return self._forest_size
@property
def extracted_forest_size(self):
@@ -25,13 +22,21 @@ class ModelParameters(object):
return self._normalize_D
@property
def use_dev_subset(self):
return self._use_dev_subset
def subsets_used(self):
return self._subsets_used
@property
def normalize_weights(self):
return self._normalize_weights
@property
def seed(self):
return self._seed
@property
def hyperparameters(self):
return self._hyperparameters
def save(self, directory_path, experiment_id):
save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
self.__dict__)
@@ -6,13 +6,12 @@ import datetime
class ModelRawResults(object):
def __init__(self, forest, weights, training_time,
def __init__(self, model_object, training_time,
datetime, train_score, dev_score, test_score,
score_metric, train_score_regressor, dev_score_regressor,
test_score_regressor):
self._forest = forest
self._weights = weights
self._model_object = model_object
self._training_time = training_time
self._datetime = datetime
self._train_score = train_score
@@ -24,12 +23,8 @@ class ModelRawResults(object):
self._test_score_regressor = test_score_regressor
@property
def forest(self):
return self._forest
@property
def weights(self):
return self._weights
def model_object(self):
return self._model_object
@property
def training_time(self):
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from abc import abstractmethod, ABCMeta
import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator
class OmpForest(BaseEstimator, metaclass=ABCMeta):
def __init__(self, models_parameters, base_forest_estimator):
self._base_forest_estimator = base_forest_estimator
self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__)
@property
def models_parameters(self):
return self._models_parameters
def score_base_estimator(self, X, y):
return self._base_forest_estimator.score(X, y)
def _base_estimator_predictions(self, X):
return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
@property
def forest(self):
return self._base_forest_estimator.estimators_
# sklearn baseestimator api methods
def fit(self, X_forest, y_forest, X_omp, y_omp):
self._base_forest_estimator.fit(X_forest, y_forest)
self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit
return self
def _extract_subforest(self, X, y):
"""
Given an already fitted forest, apply OMP to get the weight of each tree.
The X data is used to query every tree in the forest; the y data
is used to find the weights in OMP.
:param X: (n_sample, n_features) array
:param y: (n_sample,) array
:return:
"""
self._logger.debug("Forest make prediction on X")
D = self._base_estimator_predictions(X)
if self._models_parameters.normalize_D:
# question: maybe consider other kinds of normalization.. centering?
self._logger.debug("Compute norm of predicted vectors on X")
self._forest_norms = np.linalg.norm(D, axis=0)
D /= self._forest_norms
self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
.format(self._models_parameters.extracted_forest_size))
self.fit_omp(D, y)
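# Self-contained sketch of the idea above (illustrative, not repo code):
# each tree's predictions form one column ("atom") of the dictionary D,
# and OMP selects `k` columns, i.e. `k` trees, to approximate y.
#
# import numpy as np
# from sklearn.datasets import make_regression
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import OrthogonalMatchingPursuit
#
# X, y = make_regression(n_samples=200, n_features=10, random_state=0)
# forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)
# D = np.array([t.predict(X) for t in forest.estimators_]).T  # (n_samples, n_trees)
# omp = OrthogonalMatchingPursuit(n_nonzero_coefs=10).fit(D, y)
# selected_trees = np.flatnonzero(omp.coef_)  # indices of the 10 kept trees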
@staticmethod
def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
if normalize_weights:
# we can normalize the weights (by their sum) so that they sum to 1
# and can be interpreted as impact percentages for interpretability.
# this requires removing the (-) from the weights, e.g. moving it to the predictions (use unsigned_coef)
# question: I don't understand the nonzero trick?
# predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :] # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square)
unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
intercept = omp_obj.intercept_
adjusted_forest_predictions = base_predictions * coef_signs
predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
else:
predictions = omp_obj.predict(base_predictions)
return predictions
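# Quick numeric check of the sign-folding above (illustrative): w . p equals
# |w| . (sign(w) * p), so moving the signs onto the predictions leaves the
# weighted prediction unchanged while making the weights non-negative.
#
# import numpy as np
# w = np.array([0.5, -0.2, 0.0, 0.3])
# p = np.array([1.0, 2.0, 3.0, 4.0])
# signs = np.sign(w)
# assert np.isclose(p.dot(w), (p * signs).dot(signs * w))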
@abstractmethod
def fit_omp(self, atoms, objective):
pass
@abstractmethod
def predict(self, X):
pass
@abstractmethod
def score(self, X, y):
pass
class SingleOmpForest(OmpForest):
def __init__(self, models_parameters, base_forest_estimator):
# fit_intercept shouldn't be set to False, as the data isn't necessarily centered here
# normalization is handled outside OMP
self._omp = OrthogonalMatchingPursuit(
n_nonzero_coefs=models_parameters.extracted_forest_size,
fit_intercept=True, normalize=False)
super().__init__(models_parameters, base_forest_estimator)
def fit_omp(self, atoms, objective):
self._omp.fit(atoms, objective)
def predict(self, X):
"""
Apply the SingleOmpForest to X.
Make all the base tree predictions then apply the OMP weights for pruning.
:param X:
:return:
"""
forest_predictions = self._base_estimator_predictions(X)
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
from sklearn.base import BaseEstimator
from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
from bolsonaro.utils import binarize_class_data
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit
class OmpForestBinaryClassifier(SingleOmpForest):
DEFAULT_SCORE_METRIC = 'indicator'
def __init__(self, models_parameters):
estimator = RandomForestClassifier(**models_parameters.hyperparameters,
random_state=models_parameters.seed, n_jobs=-1)
super().__init__(models_parameters, estimator)
def _check_classes(self, y):
assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier must be {-1, +1}"
def fit(self, X_forest, y_forest, X_omp, y_omp):
self._check_classes(y_forest)
self._check_classes(y_omp)
return super().fit(X_forest, y_forest, X_omp, y_omp)
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
"""
Evaluate the OmpForestBinaryClassifier on (`X`, `y`) using `metric`
:param X:
:param y:
:param metric: might be "indicator"
:return:
"""
predictions = self.predict(X)
if metric == 'indicator':
evaluation = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1))
else:
raise ValueError("Unsupported metric '{}'.".format(metric))
return evaluation
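# Numeric check of the 'indicator' metric (illustrative): with labels in
# {-1, +1}, |sign(pred) - y| is 0 when correct and 2 when wrong, so the
# expression above evaluates to |2 * error_rate - 1| (1.0 for a perfect model).
#
# import numpy as np
# y = np.array([1, -1, 1, 1])
# pred = np.array([0.9, -0.3, 0.2, 0.8])  # all correctly signed
# assert np.isclose(np.abs(np.mean(np.abs(np.sign(pred) - y) - 1)), 1.0)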
class OmpForestMulticlassClassifier(OmpForest):
DEFAULT_SCORE_METRIC = 'indicator'
def __init__(self, models_parameters):
estimator = RandomForestClassifier(**models_parameters.hyperparameters,
random_state=models_parameters.seed, n_jobs=-1)
super().__init__(models_parameters, estimator)
# question: maybe initialize the OMPs in __init__? as for SingleOmpForest
self._dct_class_omp = {}
def fit_omp(self, atoms, objective):
assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__)
possible_classes = sorted(set(objective))
for class_label in possible_classes:
atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
objective_binary = binarize_class_data(objective, class_label, inplace=False)
# TODO: maybe consider the forest size as global, so that only a fraction is available for each OMP...
omp_class = OrthogonalMatchingPursuit(
n_nonzero_coefs=self.models_parameters.extracted_forest_size,
fit_intercept=True, normalize=False)
omp_class.fit(atoms_binary, objective_binary)
self._dct_class_omp[class_label] = omp_class
return self._dct_class_omp
def predict(self, X):
forest_predictions = self._base_estimator_predictions(X)
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
label_names = []
preds = []
for class_label, omp_class in self._dct_class_omp.items():
label_names.append(class_label)
atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False)
preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))
# TODO: check that this is not buggy here
preds = np.array(preds).T
max_preds = np.argmax(preds, axis=1)
return np.array(label_names)[max_preds]
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
predictions = self.predict(X)
if metric == 'indicator':
evaluation = np.sum(np.ones_like(predictions)[predictions == y]) / X.shape[0]
else:
raise ValueError("Unsupported metric '{}'.".format(metric))
return evaluation
class OmpForestClassifier(BaseEstimator):
def __init__(self):
raise ValueError('Classification tasks are not supported for now')
def fit(self, X, y):
pass
if __name__ == "__main__":
forest = RandomForestClassifier(n_estimators=10)
X = np.random.rand(10, 5)
y = np.random.choice([-1, +1], 10)
forest.fit(X, y)
print(forest.predict(np.random.rand(10, 5)))
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.models.omp_forest import SingleOmpForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator
import numpy as np
class OmpForestRegressor(BaseEstimator):
class OmpForestRegressor(SingleOmpForest):
DEFAULT_SCORE_METRIC = 'mse'
def __init__(self, models_parameters):
self._regressor = RandomForestRegressor(n_estimators=models_parameters.forest_size,
random_state=models_parameters.seed)
self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__)
estimator = RandomForestRegressor(**models_parameters.hyperparameters,
random_state=models_parameters.seed, n_jobs=-1)
super().__init__(models_parameters, estimator)
@property
def forest(self):
return self._forest
def set_forest(self, forest):
self._forest = forest
self._regressor.estimators_ = forest
@property
def weights(self):
return self._weights
def set_weights(self, weights):
self._weights = weights
@property
def models_parameters(self):
return self._models_parameters
def fit(self, X_forest, y_forest, X_omp, y_omp):
self._forest = self._train_forest(X_forest, y_forest)
self._weights = self._extract_subforest(X_omp, y_omp)
return self
def score_regressor(self, X, y):
return self._regressor.score(X, y)
def predict(self, X):
"""
Apply the OMPForestRegressor to X.
:param X:
:return:
"""
D = self._forest_prediction(X)
if self._models_parameters.normalize_D:
D /= self._forest_norms
predictions = D @ self.weights
return predictions
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
"""
@@ -77,41 +31,3 @@ class OmpForestRegressor(BaseEstimator):
raise ValueError("Unsupported metric '{}'.".format(metric))
return evaluation
def _train_forest(self, X, y):
self._regressor.fit(X, y)
forest = self._regressor.estimators_
return forest
def _extract_subforest(self, X, y):
"""
Given an already estimated regressor: apply OMP to get the weight of each tree.
The X data is used for interrogation of every tree in the forest. The y data
is used for finding the weights in OMP.
:param X: (n_sample, n_features) array
:param y: (n_sample,) array
:return:
"""
self._logger.debug("Forest make prediction on X")
D = self._forest_prediction(X)
if self._models_parameters.normalize_D:
# question: maybe consider other kinds of normalization
self._logger.debug("Compute norm of predicted vectors on X")
self._forest_norms = np.linalg.norm(D, axis=0)
D /= self._forest_norms
omp = OrthogonalMatchingPursuit(
n_nonzero_coefs=self._models_parameters.extracted_forest_size,
fit_intercept=False, normalize=False)
self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
.format(self._models_parameters.extracted_forest_size))
omp.fit(D, y)
weights = omp.coef_
# question: why not use the omp estimator directly instead of bypassing it via the coefficients?
return weights
def _forest_prediction(self, X):
return np.array([tree.predict(X) for tree in self._forest]).T
@@ -8,46 +8,78 @@ import numpy as np
class Trainer(object):
"""
Class capable of fitting any model object to prepared data, then evaluating and saving the results through the `train` method.
"""
def __init__(self, dataset):
"""
:param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and y_test attributes
"""
self._dataset = dataset
self._logger = LoggerFactory.create(LOG_PATH, __name__)
def train(self, model, models_dir):
self._logger.debug('Training model using train set...')
begin_time = time.time()
if model.models_parameters.use_dev_subset:
X_forest = self._dataset.X_train
y_forest = self._dataset.y_train
X_omp = self._dataset.X_dev
y_omp = self._dataset.y_dev
else:
X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
X_omp = X_forest
y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
y_omp = y_forest
model.fit(
X_forest=X_forest,
y_forest=y_forest,
X_omp=X_omp,
y_omp=y_omp
)
end_time = time.time()
ModelRawResults(
forest=model.forest,
weights=model.weights,
training_time=end_time - begin_time,
datetime=datetime.datetime.now(),
train_score=model.score(self._dataset.X_train, self._dataset.y_train),
dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
test_score=model.score(self._dataset.X_test, self._dataset.y_test),
score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
train_score_regressor=model.score_regressor(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=model.score_regressor(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=model.score_regressor(self._dataset.X_test, self._dataset.y_test)
).save(models_dir)
def init(self, model):
if model.models_parameters.subsets_used == 'train,dev':
self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train
self._X_omp = self._dataset.X_dev
self._y_omp = self._dataset.y_dev
self._logger.debug('Fitting the forest on the train subset and OMP on the dev subset.')
elif model.models_parameters.subsets_used == 'train+dev,train+dev':
self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
self._X_omp = self._X_forest
self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
self._y_omp = self._y_forest
self._logger.debug('Fitting both the forest and OMP on the train+dev subsets.')
elif model.models_parameters.subsets_used == 'train,train+dev':
self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train
self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
else:
raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
def train(self, model):
"""
:param model: object exposing a `fit` method
:return:
"""
self._logger.debug('Training model using train set...')
self._begin_time = time.time()
model.fit(
X_forest=self._X_forest,
y_forest=self._y_forest,
X_omp=self._X_omp,
y_omp=self._y_omp
)
self._end_time = time.time()
def compute_results(self, model, models_dir):
"""
:param model: the fitted model to evaluate
:param models_dir: where the results will be saved
"""
results = ModelRawResults(
model_object=model,
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=model.score(self._dataset.X_train, self._dataset.y_train),
dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
test_score=model.score(self._dataset.X_test, self._dataset.y_test),
score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test)
)
results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_regressor))
self._logger.info("Performance on test: {}".format(results.test_score))
self._logger.info("Base performance on train: {}".format(results.train_score_regressor))
self._logger.info("Performance on train: {}".format(results.train_score))
self._logger.info("Base performance on dev: {}".format(results.dev_score_regressor))
self._logger.info("Performance on dev: {}".format(results.dev_score))
import os
import json
import pickle
from copy import deepcopy
def resolve_experiment_id(models_dir):
@@ -45,3 +46,30 @@ def load_obj_from_pickle(file_path, constructor):
with open(file_path, 'rb') as input_file:
parameters = pickle.load(input_file)
return constructor(**parameters)
def binarize_class_data(data, class_pos, inplace=True):
"""
Replace class_pos by +1 and ~class_pos by -1.
:param data: an array of classes
:param class_pos: the positive class to be replaced by +1
:param inplace: if True, modify data in place (it is still returned as well)
:return:
"""
if not inplace:
data = deepcopy(data)
position_class_labels = (data == class_pos)
data[~(position_class_labels)] = -1
data[(position_class_labels)] = +1
return data
def change_binary_func_load(base_load_function):
def func_load(return_X_y):
X, y = base_load_function(return_X_y=return_X_y)
possible_classes = sorted(set(y))
assert len(possible_classes) == 2, "change_binary_func_load only works for binary classification"
y = binarize_class_data(y, possible_classes[-1])
return X, y
return func_load
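An illustrative check of the two helpers above (the breast_cancer wrapping mirrors DatasetLoader):

import numpy as np
from sklearn.datasets import load_breast_cancer

y = np.array([0, 1, 1, 0])
print(binarize_class_data(y, class_pos=1, inplace=False))  # [-1  1  1 -1]
load_binary = change_binary_func_load(load_breast_cancer)
X, y = load_binary(return_X_y=True)  # labels now in {-1, +1}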
import matplotlib.pyplot as plt
import numpy as np
from sklearn.neighbors.kde import KernelDensity
import pandas as pd
class Plotter(object):
@staticmethod
def weight_density(weights, X, file_path):
X_plot = [np.exp(elem) for elem in weights]
fig, ax = plt.subplots()
for kernel in ['gaussian', 'tophat', 'epanechnikov']:
kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X_plot)
log_dens = kde.score_samples(X_plot)
ax.plot(X_plot, np.exp(log_dens), '-',
label="kernel = '{0}'".format(kernel))
ax.legend(loc='upper left')
ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')
def weight_density(all_experiment_weights, file_path):
"""
Creates the figure with the density of the weights.
:param all_experiment_weights: the weights for the different experiments
:param file_path: str, path where the figure will be saved
TODO: colored by seed number or not?
TODO: represent both the seed AND the extracted tree information in the legend
"""
"""
Convert a dictionary of structure
{seed_1: [M x W], ..., seed_k: [M x W]}
to a numpy.ndarray with dim [K x M x W],
where K is the number of seeds, M the
number of extracted trees and W the
number of weights.
"""
all_experiment_weights = np.array(list(all_experiment_weights.values()))
n = len(all_experiment_weights)
"""
Get as many different colors from the specified cmap (here nipy_spectral)
as there are seeds used.
"""
colors = Plotter.get_colors_from_cmap(n)
fig, ax = plt.subplots()
# For each seed
for i in range(n):
# For each weight set of a given extracted tree number
for weights in all_experiment_weights[i]:
"""
Plot the series of weights that aren't zero,
colored by seed number.
"""
pd.Series(weights[np.nonzero(weights)]).plot.kde(
figsize=(15, 10), ax=ax, color=colors[i])
ax.set_xlim(-4, 9)
ax.set_ylim(-0.02, 0.4)
ax.set_title('Density weights of the OMP')
fig.savefig(file_path, dpi=fig.dpi)
plt.close(fig)
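An illustrative call (the weight values are made up; it assumes the rest of the Plotter class, e.g. get_colors_from_cmap, from the full file):

import numpy as np
all_weights = {1: np.random.rand(3, 100), 2: np.random.rand(3, 100)}  # {seed: [M x W]}
Plotter.weight_density(all_weights, 'reports/weight_density.png')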
from bolsonaro import LOG_PATH
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
from bolsonaro.utils import save_obj_to_json
import argparse
import os
import pathlib
import pickle
import random
from dotenv import find_dotenv, load_dotenv
"""
I had to install skopt from this repository
https://github.com/darenr/scikit-optimize that handles
the issue described here https://github.com/scikit-optimize/scikit-optimize/issues/762.
"""
from skopt.space import Categorical, Integer, Real
def clean_numpy_int_dict(dictionary):
return dict([a, int(x)] if type(x) == Integer else
[a, clean_numpy_int_dict(x)] if type(x) == dict else
[a, clean_numpy_int_list(x)] if type(x) == list else [a, (x)]
for a, x in dictionary.items())
def clean_numpy_int_list(list_n):
return [int(elem) if type(elem) == Integer else
clean_numpy_int_dict(elem) if type(elem) == dict else
clean_numpy_int_list(elem) if type(elem) == list else elem
for elem in list_n]
if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv('.env'))
DEFAULT_CV = 3
DEFAULT_N_ITER = 50
DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
'min_samples_leaf': Integer(1, 1000),
'max_depth': Integer(1, 20),
'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}
DATASET_LIST = ['boston', 'iris', 'diabetes']
# , 'digits', 'linnerud', 'wine']
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the number of cross-validation folds.')
parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the Bayesian search.')
parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generating one randomly.')
parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the datasets used by the estimator.')
parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the Bayesian search.')
args = parser.parse_args()
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
begin_random_seed_range = 1
end_random_seed_range = 2000
if args.seed is None:
random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
else:
random_seed = args.seed
for dataset_name in args.datasets:
dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
logger.info('Bayesian search on dataset {}'.format(dataset_name))
dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
dataset = DatasetLoader.load(dataset_parameters)
if dataset.task == Task.REGRESSION:
scorer = 'neg_mean_squared_error'
else:
scorer = 'accuracy'
bayesian_searcher = HyperparameterSearcher()
opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
args.cv, random_seed, scorer, args.verbose)
dict_results = {'_scorer': scorer,
'_best_score_train': opt.best_score_,
'_best_score_test': opt.score(dataset.X_test, dataset.y_test),
'_best_parameters': clean_numpy_int_dict(opt.best_params_),
'_random_seed': random_seed
}
save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
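An invocation sketch (the script filename is hypothetical; the flags follow the argparse definitions above):

python compute_hyperparameters.py --datasets boston diabetes --n_iter 50 --cv 3 --seed 42 --verbose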