Skip to content
Snippets Groups Projects
Commit 94d1388e authored by Charly Lamothe's avatar Charly Lamothe
Browse files

- Fix merge conflicts;

- Update wrong load_dotenv in compute_results and compute_hyperparams;
- Replace old remaining task.CLASSIFICATION enum.
parents 3e8f934b 7bb11288
Branches
Tags
1 merge request!3clean scripts
Showing
with 349 additions and 241 deletions
# Environment variables go here, can be read by `python-dotenv` package:
#
# `src/script.py`
# ----------------------------------------------------------------
# import dotenv
#
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
# dotenv_path = os.path.join(project_dir, '.env')
# dotenv.load_dotenv(dotenv_path)
# ----------------------------------------------------------------
project_dir = "."
\ No newline at end of file
...@@ -9,6 +9,17 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ ...@@ -9,6 +9,17 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn import preprocessing from sklearn import preprocessing
from bolsonaro.utils import binarize_class_data
def change_binary_func_load(base_load_function):
    """Wrap a sklearn dataset loader so its two class labels become {-1, +1}.

    :param base_load_function: a loader accepting a `return_X_y` keyword
        (e.g. `load_breast_cancer`)
    :return: a function with the same `(return_X_y)` signature whose returned
        `y` is remapped to -1/+1 (the greatest original label becomes +1)
    :raises AssertionError: if the dataset does not have exactly two classes
    """
    def func_load(return_X_y):
        X, y = base_load_function(return_X_y=return_X_y)
        possible_classes = sorted(set(y))
        # Fixed message typo/grammar ("only work ... classfication").
        assert len(possible_classes) == 2, \
            "change_binary_func_load only works for binary classification"
        # The greatest label is taken as the positive class.
        y = binarize_class_data(y, possible_classes[-1])
        return X, y
    return func_load
class DatasetLoader(object): class DatasetLoader(object):
...@@ -20,45 +31,46 @@ class DatasetLoader(object): ...@@ -20,45 +31,46 @@ class DatasetLoader(object):
task = Task.REGRESSION task = Task.REGRESSION
elif name == 'iris': elif name == 'iris':
dataset_loading_func = load_iris dataset_loading_func = load_iris
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'diabetes': elif name == 'diabetes':
dataset_loading_func = load_diabetes dataset_loading_func = load_diabetes
task = Task.REGRESSION task = Task.REGRESSION
elif name == 'digits': elif name == 'digits':
dataset_loading_func = load_digits dataset_loading_func = load_digits
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'linnerud': elif name == 'linnerud':
dataset_loading_func = load_linnerud dataset_loading_func = load_linnerud
task = Task.REGRESSION task = Task.REGRESSION
elif name == 'wine': elif name == 'wine':
dataset_loading_func = load_wine dataset_loading_func = load_wine
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'breast_cancer': elif name == 'breast_cancer':
dataset_loading_func = load_breast_cancer dataset_loading_func = change_binary_func_load(load_breast_cancer)
task = Task.CLASSIFICATION task = Task.BINARYCLASSIFICATION
elif name == 'olivetti_faces': # bug (no return X_y) elif name == 'olivetti_faces': # bug (no return X_y)
dataset_loading_func = fetch_olivetti_faces dataset_loading_func = fetch_olivetti_faces
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == '20newsgroups': # bug (no return X_y) elif name == '20newsgroups': # bug (no return X_y)
dataset_loading_func = fetch_20newsgroups dataset_loading_func = fetch_20newsgroups
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == '20newsgroups_vectorized': elif name == '20newsgroups_vectorized':
dataset_loading_func = fetch_20newsgroups_vectorized dataset_loading_func = fetch_20newsgroups_vectorized
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'lfw_people': # needs PIL (image dataset) elif name == 'lfw_people': # needs PIL (image dataset)
dataset_loading_func = fetch_lfw_people dataset_loading_func = fetch_lfw_people
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'lfw_pairs': elif name == 'lfw_pairs':
dataset_loading_func = fetch_lfw_pairs dataset_loading_func = fetch_lfw_pairs
task = Task.MULTICLASSIFICATION
elif name == 'covtype': elif name == 'covtype':
dataset_loading_func = fetch_covtype dataset_loading_func = fetch_covtype
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'rcv1': elif name == 'rcv1':
dataset_loading_func = fetch_rcv1 dataset_loading_func = fetch_rcv1
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'kddcup99': elif name == 'kddcup99':
dataset_loading_func = fetch_kddcup99 dataset_loading_func = fetch_kddcup99
task = Task.CLASSIFICATION task = Task.MULTICLASSIFICATION
elif name == 'california_housing': elif name == 'california_housing':
dataset_loading_func = fetch_california_housing dataset_loading_func = fetch_california_housing
task = Task.REGRESSION task = Task.REGRESSION
......
...@@ -2,5 +2,6 @@ from enum import Enum ...@@ -2,5 +2,6 @@ from enum import Enum
class Task(Enum): class Task(Enum):
CLASSIFICATION = 1 BINARYCLASSIFICATION = 1
REGRESSION = 2 REGRESSION = 2
MULTICLASSIFICATION = 3
...@@ -33,11 +33,10 @@ class HyperparameterSearcher(object): ...@@ -33,11 +33,10 @@ class HyperparameterSearcher(object):
:return: a skopt.searchcv.BayesSearchCV object :return: a skopt.searchcv.BayesSearchCV object
''' '''
if dataset.task == Task.CLASSIFICATION:
estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
if dataset.task == Task.REGRESSION: if dataset.task == Task.REGRESSION:
estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed) estimator = RandomForestRegressor(n_jobs=-1, random_state=random_seed)
else:
estimator = RandomForestClassifier(n_jobs=-1, random_state=random_seed)
opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter, opt = BayesSearchCV(estimator, hyperparameter_space, n_iter=n_iter,
cv=cv, n_jobs=-1, random_state=random_seed, cv=cv, n_jobs=-1, random_state=random_seed,
......
from bolsonaro.models.omp_forest_classifier import OmpForestClassifier from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.model_parameters import ModelParameters
...@@ -11,18 +11,22 @@ class ModelFactory(object): ...@@ -11,18 +11,22 @@ class ModelFactory(object):
@staticmethod @staticmethod
def build(task, model_parameters): def build(task, model_parameters):
if task == Task.CLASSIFICATION: if task == Task.BINARYCLASSIFICATION:
model_func = OmpForestClassifier model_func = OmpForestBinaryClassifier
elif task == Task.REGRESSION: elif task == Task.REGRESSION:
model_func = OmpForestRegressor model_func = OmpForestRegressor
elif task == Task.MULTICLASSIFICATION:
model_func = OmpForestMulticlassClassifier
else: else:
raise ValueError("Unsupported task '{}'".format(task)) raise ValueError("Unsupported task '{}'".format(task))
return model_func(model_parameters) return model_func(model_parameters)
@staticmethod @staticmethod
def load(task, directory_path, experiment_id, model_raw_results): def load(task, directory_path, experiment_id, model_raw_results):
raise NotImplementedError
model_parameters = ModelParameters.load(directory_path, experiment_id) model_parameters = ModelParameters.load(directory_path, experiment_id)
model = ModelFactory.build(task, model_parameters) model = ModelFactory.build(task, model_parameters)
model.set_forest(model_raw_results.forest) # todo faire ce qu'il faut ici pour rétablir correctement le modèle
model.set_weights(model_raw_results.weights) # model.set_forest(model_raw_results.forest)
# model.set_weights(model_raw_results.weights)
return model return model
...@@ -6,13 +6,12 @@ import datetime ...@@ -6,13 +6,12 @@ import datetime
class ModelRawResults(object): class ModelRawResults(object):
def __init__(self, forest, weights, training_time, def __init__(self, model_object, training_time,
datetime, train_score, dev_score, test_score, datetime, train_score, dev_score, test_score,
score_metric, train_score_regressor, dev_score_regressor, score_metric, train_score_regressor, dev_score_regressor,
test_score_regressor): test_score_regressor):
self._forest = forest self._model_object = model_object
self._weights = weights
self._training_time = training_time self._training_time = training_time
self._datetime = datetime self._datetime = datetime
self._train_score = train_score self._train_score = train_score
...@@ -24,12 +23,8 @@ class ModelRawResults(object): ...@@ -24,12 +23,8 @@ class ModelRawResults(object):
self._test_score_regressor = test_score_regressor self._test_score_regressor = test_score_regressor
@property @property
def forest(self): def model_object(self):
return self._forest return self.model_object
@property
def weights(self):
return self._weights
@property @property
def training_time(self): def training_time(self):
......
from abc import abstractmethod, ABCMeta
import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from sklearn.base import BaseEstimator
class OmpForest(BaseEstimator, metaclass=ABCMeta):
    """Abstract base class for a forest pruned by Orthogonal Matching Pursuit.

    The per-tree predictions of a fitted base forest form the dictionary
    (one atom per tree) of an OMP problem. Subclasses define how the OMP
    weights are fitted (`fit_omp`) and how they are applied (`predict`,
    `score`).
    """

    def __init__(self, models_parameters, base_forest_estimator):
        # base_forest_estimator: a sklearn forest estimator (regressor or
        # classifier), fitted later in `fit`.
        self._base_forest_estimator = base_forest_estimator
        self._models_parameters = models_parameters
        self._logger = LoggerFactory.create(LOG_PATH, __name__)

    @property
    def models_parameters(self):
        """Parameters this model was built with."""
        return self._models_parameters

    def score_base_estimator(self, X, y):
        """Score of the full (unpruned) base forest on (X, y)."""
        return self._base_forest_estimator.score(X, y)

    def _base_estimator_predictions(self, X):
        # (n_samples, n_trees) matrix: one column of predictions per tree.
        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T

    @property
    def forest(self):
        """The list of trees of the underlying base forest."""
        return self._base_forest_estimator.estimators_

    # sklearn BaseEstimator API methods
    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """Fit the base forest on (X_forest, y_forest), then fit the OMP
        weights on (X_omp, y_omp).

        :return: self
        """
        self._base_forest_estimator.fit(X_forest, y_forest)
        self._extract_subforest(X_omp, y_omp)
        return self

    def _extract_subforest(self, X, y):
        """
        Given an already estimated regressor: apply OMP to get the weight of each tree.

        The X data is used for interrogation of every tree in the forest. The y data
        is used for finding the weights in OMP.

        :param X: (n_sample, n_features) array
        :param y: (n_sample,) array
        :return: None (the fitted OMP state lives in the subclass)
        """
        self._logger.debug("Forest make prediction on X")
        D = self._base_estimator_predictions(X)
        if self._models_parameters.normalize_D:
            # question: maybe consider other kinds of normalization.. centering?
            self._logger.debug("Compute norm of predicted vectors on X")
            self._forest_norms = np.linalg.norm(D, axis=0)
            D /= self._forest_norms
        self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
            .format(self._models_parameters.extracted_forest_size))
        self.fit_omp(D, y)

    @staticmethod
    def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
        """Combine the per-tree predictions with the OMP coefficients.

        :param base_predictions: (n_samples, n_trees) matrix of tree outputs
        :param omp_obj: a fitted OrthogonalMatchingPursuit object
        :param normalize_weights: if True, use unsigned coefficients so the
            weights can be read as positive importance shares
        :return: (n_samples,) array of weighted predictions
        """
        if normalize_weights:
            # we can normalize weights (by their sum) so that they sum to 1
            # and they can be interpreted as impact percentages for interpretability.
            # this necessitates removing the (-) sign from the weights, e.g. moving
            # it to the predictions (use unsigned_coef)
            # question (translated from French): the `nonzero` trick below is unclear
            # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
            coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]  # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square)
            unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
            intercept = omp_obj.intercept_
            adjusted_forest_predictions = base_predictions * coef_signs
            predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
        else:
            predictions = omp_obj.predict(base_predictions)
        return predictions

    @abstractmethod
    def fit_omp(self, atoms, objective):
        """Fit the OMP weight(s) from the atoms (tree predictions) to the objective."""
        pass

    @abstractmethod
    def predict(self, X):
        """Predict using only the OMP-weighted subset of trees."""
        pass

    @abstractmethod
    def score(self, X, y):
        """Evaluate the pruned forest on (X, y)."""
        pass
class SingleOmpForest(OmpForest):
    """OMP-pruned forest driven by a single OMP problem over all trees.

    One OrthogonalMatchingPursuit instance selects at most
    `extracted_forest_size` trees and assigns them real-valued weights.
    """

    def __init__(self, models_parameters, base_forest_estimator):
        # fit_intercept must stay True: the data is not guaranteed to be
        # centered here. Normalization of the dictionary D is handled
        # outside of OMP (see OmpForest._extract_subforest).
        self._omp = OrthogonalMatchingPursuit(
            n_nonzero_coefs=models_parameters.extracted_forest_size,
            fit_intercept=True, normalize=False)
        super().__init__(models_parameters, base_forest_estimator)

    def fit_omp(self, atoms, objective):
        """Fit the single OMP problem on the tree-prediction dictionary."""
        self._omp.fit(atoms, objective)

    def predict(self, X):
        """
        Apply the SingleOmpForest to X.

        Collect every base tree's prediction, then combine them with the
        fitted OMP weights (optionally renormalized) for the pruned output.

        :param X: (n_samples, n_features) array
        :return: (n_samples,) array of weighted predictions
        """
        tree_outputs = self._base_estimator_predictions(X)
        if self._models_parameters.normalize_D:
            tree_outputs /= self._forest_norms
        return self._make_omp_weighted_prediction(
            tree_outputs, self._omp, self._models_parameters.normalize_weights)
\ No newline at end of file
from collections import namedtuple
from copy import deepcopy
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
import numpy as np
from bolsonaro.utils import binarize_class_data
class OmpForestBinaryClassifier(SingleOmpForest):
    """OMP-pruned random forest for binary classification.

    Labels must already be encoded as {-1, +1} (see `binarize_class_data`).
    """

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        estimator = RandomForestClassifier(n_estimators=models_parameters.forest_size,
            random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)

    def _check_classes(self, y):
        # Refuse any label set other than {-1, +1}.
        assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier should be {-1, +1}"

    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """Fit forest and OMP after validating that labels are in {-1, +1}."""
        self._check_classes(y_forest)
        self._check_classes(y_omp)
        return super().fit(X_forest, y_forest, X_omp, y_omp)

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate OMPForestClassifer on (`X`, `y`) using `metric`

        :param X: (n_samples, n_features) array
        :param y: (n_samples,) array of {-1, +1} labels
        :param metric: might be "indicator"
        :return: the evaluation value
        :raises ValueError: for an unsupported metric
        """
        predictions = self.predict(X)
        if metric == 'indicator':
            # NOTE(review): this computes |fraction_wrong - fraction_correct|,
            # which maps both perfect and fully-wrong predictions to 1.0 and
            # chance level to 0.0 — confirm this is the intended "indicator"
            # metric (plain accuracy would be np.mean(np.sign(predictions) == y)).
            evaluation = np.abs(np.mean(np.abs(np.sign(predictions) - y) - 1))
        else:
            raise ValueError("Unsupported metric '{}'.".format(metric))
        return evaluation
class OmpForestMulticlassClassifier(OmpForest):
    """OMP-pruned random forest for multiclass classification.

    One independent OMP problem is fitted per class (one-vs-rest); prediction
    picks the class whose OMP gives the highest score.
    """

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        estimator = RandomForestClassifier(n_estimators=models_parameters.forest_size,
            random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)
        # question (translated from French): maybe initialize the per-class OMPs
        # in __init__, as is done for SingleOmpForest?
        self._dct_class_omp = {}

    def fit_omp(self, atoms, objective):
        """Fit one OMP per class, one-vs-rest style.

        For each class label, both the atoms (tree predictions, which are
        class labels for classifier trees) and the objective are binarized
        to {-1, +1} before fitting an independent OMP.

        :param atoms: (n_samples, n_trees) matrix of tree predictions
        :param objective: (n_samples,) array of class labels
        :return: dict mapping class label -> fitted OrthogonalMatchingPursuit
        """
        assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__)
        possible_classes = sorted(set(objective))
        for class_label in possible_classes:
            atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
            objective_binary = binarize_class_data(objective, class_label, inplace=False)
            # TODO (translated from French): maybe consider the forest size as
            # global, so that only a fraction of it is available to each
            # per-class OMP...
            omp_class = OrthogonalMatchingPursuit(
                n_nonzero_coefs=self.models_parameters.extracted_forest_size,
                fit_intercept=True, normalize=False)
            omp_class.fit(atoms_binary, objective_binary)
            self._dct_class_omp[class_label] = omp_class
        return self._dct_class_omp

    def predict(self, X):
        """Predict the class whose per-class OMP yields the highest score.

        :param X: (n_samples, n_features) array
        :return: (n_samples,) array of predicted class labels
        """
        forest_predictions = self._base_estimator_predictions(X)
        if self._models_parameters.normalize_D:
            forest_predictions /= self._forest_norms
        label_names = []
        preds = []
        for class_label, omp_class in self._dct_class_omp.items():
            label_names.append(class_label)
            atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False)
            preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))
        # TODO (translated from French): check that this is not buggy here
        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """Fraction of correctly predicted labels ('indicator' metric).

        :raises ValueError: for an unsupported metric
        """
        predictions = self.predict(X)
        if metric == 'indicator':
            # Counts the matching entries; equivalent to an accuracy ratio.
            evaluation = np.sum(np.ones_like(predictions)[predictions == y]) / X.shape[0]
        else:
            raise ValueError("Unsupported metric '{}'.".format(metric))
        return evaluation
class OmpForestClassifier(BaseEstimator):
def __init__(self):
raise ValueError('Classification tasks are not supported for now')
def fit(self, X, y): if __name__ == "__main__":
pass forest = RandomForestClassifier(n_estimators=10)
X = np.random.rand(10, 5)
y = np.random.choice([-1, +1], 10)
forest.fit(X, y)
print(forest.predict(np.random.rand(10, 5)))
\ No newline at end of file
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator
import numpy as np import numpy as np
from bolsonaro.models.omp_forest import SingleOmpForest
class OmpForestRegressor(BaseEstimator): class OmpForestRegressor(SingleOmpForest):
DEFAULT_SCORE_METRIC = 'mse' DEFAULT_SCORE_METRIC = 'mse'
def __init__(self, models_parameters): def __init__(self, models_parameters):
self._regressor = RandomForestRegressor(**models_parameters.hyperparameters, estimator = RandomForestRegressor(**models_parameters.hyperparameters,
random_state=models_parameters.seed, n_jobs=-1) random_state=models_parameters.seed, n_jobs=-1)
self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__)
@property
def forest(self):
return self._forest
def set_forest(self, forest):
self._forest = forest
self._regressor.estimators_ = forest
@property
def weights(self):
return self._weights
def set_weights(self, weights):
self._weights = weights
@property
def models_parameters(self):
return self._models_parameters
def fit(self, X_forest, y_forest, X_omp, y_omp): super().__init__(models_parameters, estimator)
self._forest = self._train_forest(X_forest, y_forest)
self._omp = self._extract_subforest(X_omp, y_omp)
self._weights = self._omp.coef_
return self
def score_regressor(self, X, y):
return self._regressor.score(X, y)
def predict(self, X):
"""
Apply the OMPForestRegressor to X.
:param X:
:return:
"""
forest_predictions = self._forest_prediction(X)
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_)))) \
if self._models_parameters.normalize_weights \
else self._omp.predict(forest_predictions)
return predictions
def score(self, X, y, metric=DEFAULT_SCORE_METRIC): def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
""" """
...@@ -80,38 +33,3 @@ class OmpForestRegressor(BaseEstimator): ...@@ -80,38 +33,3 @@ class OmpForestRegressor(BaseEstimator):
raise ValueError("Unsupported metric '{}'.".format(metric)) raise ValueError("Unsupported metric '{}'.".format(metric))
return evaluation return evaluation
def _train_forest(self, X, y):
self._regressor.fit(X, y)
forest = self._regressor.estimators_
return forest
def _extract_subforest(self, X, y):
"""
Given an already estimated regressor: apply OMP to get the weight of each tree.
The X data is used for interrogation of every tree in the forest. The y data
is used for finding the weights in OMP.
:param X: (n_sample, n_features) array
:param y: (n_sample,) array
:return:
"""
self._logger.debug("Forest make prediction on X")
D = self._forest_prediction(X)
if self._models_parameters.normalize_D:
# question: maybe consider other kinds of normalization
self._logger.debug("Compute norm of predicted vectors on X")
self._forest_norms = np.linalg.norm(D, axis=0)
D /= self._forest_norms
omp = OrthogonalMatchingPursuit(
n_nonzero_coefs=self._models_parameters.extracted_forest_size,
fit_intercept=False, normalize=False)
self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
.format(self._models_parameters.extracted_forest_size))
return omp.fit(D, y)
def _forest_prediction(self, X):
return np.array([tree.predict(X) for tree in self._forest]).T
...@@ -8,12 +8,26 @@ import numpy as np ...@@ -8,12 +8,26 @@ import numpy as np
class Trainer(object): class Trainer(object):
"""
Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method.
"""
def __init__(self, dataset): def __init__(self, dataset):
"""
:param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes
"""
self._dataset = dataset self._dataset = dataset
self._logger = LoggerFactory.create(LOG_PATH, __name__) self._logger = LoggerFactory.create(LOG_PATH, __name__)
def train(self, model, models_dir): def train(self, model, models_dir):
"""
:param model: Object with
:param models_dir: Where the results will be saved
:return:
"""
# todo cette fonction ne fait pas que "train", elle choisit le jeu de données, train et evalue le modèle -> nom à changer
self._logger.debug('Training model using train set...') self._logger.debug('Training model using train set...')
begin_time = time.time() begin_time = time.time()
...@@ -45,16 +59,24 @@ class Trainer(object): ...@@ -45,16 +59,24 @@ class Trainer(object):
) )
end_time = time.time() end_time = time.time()
ModelRawResults( results = ModelRawResults(
forest=model.forest, model_object=model,
weights=model.weights,
training_time=end_time - begin_time, training_time=end_time - begin_time,
datetime=datetime.datetime.now(), datetime=datetime.datetime.now(),
train_score=model.score(self._dataset.X_train, self._dataset.y_train), train_score=model.score(self._dataset.X_train, self._dataset.y_train),
dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
test_score=model.score(self._dataset.X_test, self._dataset.y_test), test_score=model.score(self._dataset.X_test, self._dataset.y_test),
score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
train_score_regressor=model.score_regressor(self._dataset.X_train, self._dataset.y_train), train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=model.score_regressor(self._dataset.X_dev, self._dataset.y_dev), dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=model.score_regressor(self._dataset.X_test, self._dataset.y_test) test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test)
).save(models_dir) )
results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_regressor))
self._logger.info("Performance on test: {}".format(results.test_score))
self._logger.info("Base performance on train: {}".format(results.train_score_regressor))
self._logger.info("Performance on train: {}".format(results.train_score))
self._logger.info("Base performance on dev: {}".format(results.dev_score_regressor))
self._logger.info("Performance on dev: {}".format(results.dev_score))
import os import os
import json import json
import pickle import pickle
from copy import deepcopy
def resolve_experiment_id(models_dir): def resolve_experiment_id(models_dir):
...@@ -45,3 +46,21 @@ def load_obj_from_pickle(file_path, constructor): ...@@ -45,3 +46,21 @@ def load_obj_from_pickle(file_path, constructor):
with open(file_path, 'rb') as input_file: with open(file_path, 'rb') as input_file:
parameters = pickle.load(input_file) parameters = pickle.load(input_file)
return constructor(**parameters) return constructor(**parameters)
def binarize_class_data(data, class_pos, inplace=True):
    """
    Replace class_pos by +1 and every other class by -1.

    :param data: an array of classes
    :param class_pos: the positive class, to be replaced by +1
    :param inplace: if True, modify data in place (it is returned either way)
    :return: the binarized array (the same object as data when inplace)
    """
    target = data if inplace else deepcopy(data)
    positives = (target == class_pos)
    # Mask is computed once up front, so the two assignments cannot interfere.
    target[positives] = +1
    target[~positives] = -1
    return target
\ No newline at end of file
...@@ -38,7 +38,7 @@ def clean_numpy_int_list(list_n): ...@@ -38,7 +38,7 @@ def clean_numpy_int_list(list_n):
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env # get environment variables in .env
load_dotenv(find_dotenv('.env.example')) load_dotenv(find_dotenv('.env'))
DEFAULT_CV = 3 DEFAULT_CV = 3
DEFAULT_N_ITER = 50 DEFAULT_N_ITER = 50
...@@ -79,11 +79,10 @@ if __name__ == "__main__": ...@@ -79,11 +79,10 @@ if __name__ == "__main__":
dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None) dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
dataset = DatasetLoader.load(dataset_parameters) dataset = DatasetLoader.load(dataset_parameters)
if dataset.task == Task.CLASSIFICATION:
scorer = 'accuracy'
if dataset.task == Task.REGRESSION: if dataset.task == Task.REGRESSION:
scorer = 'neg_mean_squared_error' scorer = 'neg_mean_squared_error'
else:
scorer = 'accuracy'
bayesian_searcher = HyperparameterSearcher() bayesian_searcher = HyperparameterSearcher()
opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter, opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
......
...@@ -12,7 +12,7 @@ import os ...@@ -12,7 +12,7 @@ import os
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env # get environment variables in .env
load_dotenv(find_dotenv('.env.example')) load_dotenv(find_dotenv('.env'))
DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results' DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
......
...@@ -19,9 +19,20 @@ import json ...@@ -19,9 +19,20 @@ import json
def process_job(seed, parameters, experiment_id, hyperparameters): def process_job(seed, parameters, experiment_id, hyperparameters):
"""
Experiment function.
Will be used as base function for worker in multithreaded application.
:param seed:
:param parameters:
:param experiment_id:
:return:
"""
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format( logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
seed, threading.get_ident())) seed, threading.get_ident()))
logger.info('seed={}'.format(seed)) logger.info('seed={}'.format(seed))
seed_str = str(seed) seed_str = str(seed)
experiment_id_str = str(experiment_id) experiment_id_str = str(experiment_id)
models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \ models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
...@@ -36,12 +47,12 @@ def process_job(seed, parameters, experiment_id, hyperparameters): ...@@ -36,12 +47,12 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
dataset_normalizer=parameters['dataset_normalizer'] dataset_normalizer=parameters['dataset_normalizer']
) )
dataset_parameters.save(models_dir, experiment_id_str) dataset_parameters.save(models_dir, experiment_id_str)
dataset = DatasetLoader.load(dataset_parameters) dataset = DatasetLoader.load(dataset_parameters)
trainer = Trainer(dataset) trainer = Trainer(dataset)
for extracted_forest_size in parameters['extracted_forest_size']: for extracted_forest_size in parameters['extracted_forest_size']:
# question if training is too long, one may also split experiments for different forest sizes into different workers
logger.info('extracted_forest_size={}'.format(extracted_forest_size)) logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
...@@ -62,8 +73,7 @@ def process_job(seed, parameters, experiment_id, hyperparameters): ...@@ -62,8 +73,7 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
logger.info('Training done') logger.info('Training done')
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env load_dotenv(find_dotenv('.env'))
load_dotenv(find_dotenv('.env.example'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments' DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
DEFAULT_DATASET_NAME = 'boston' DEFAULT_DATASET_NAME = 'boston'
...@@ -110,6 +120,7 @@ if __name__ == "__main__": ...@@ -110,6 +120,7 @@ if __name__ == "__main__":
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
# The number of tree to extract from forest (K)
parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \ parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
if type(parameters['extracted_forest_size']) == list \ if type(parameters['extracted_forest_size']) == list \
else [parameters['extracted_forest_size']] else [parameters['extracted_forest_size']]
...@@ -128,6 +139,7 @@ if __name__ == "__main__": ...@@ -128,6 +139,7 @@ if __name__ == "__main__":
if parameters['seeds'] != None and parameters['random_seed_number'] > 1: if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
# Seeds are either provided as parameters or generated at random
seeds = parameters['seeds'] if parameters['seeds'] is not None \ seeds = parameters['seeds'] if parameters['seeds'] is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \ else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(parameters['random_seed_number'])] for i in range(parameters['random_seed_number'])]
......
{
"dataset_name": "boston",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": 100,
"extracted_forest_size": [
10,
20,
30
],
"models_dir": ".\\models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 3,
"seeds": null,
"subsets_used": "train+dev,train+dev",
"normalize_weights": false
}
\ No newline at end of file
{
"dataset_name": "boston",
"normalize_D": true,
"dataset_normalizer": "standard",
"forest_size": 100,
"extracted_forest_size": [
10,
20,
30
],
"models_dir": ".\\models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 3,
"seeds": null,
"subsets_used": "train+dev,train+dev",
"normalize_weights": false
}
\ No newline at end of file
{
"dataset_name": "boston",
"normalize_D": true,
"dataset_normalizer": "standard",
"forest_size": 100,
"extracted_forest_size": [
10,
20,
30
],
"models_dir": ".\\models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 3,
"seeds": null,
"subsets_used": "train+dev,train+dev",
"normalize_weights": true
}
\ No newline at end of file
{
"dataset_name": "boston",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": 100,
"extracted_forest_size": [
10,
20,
30
],
"models_dir": ".\\models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 3,
"seeds": null,
"subsets_used": "train+dev,train+dev",
"normalize_weights": true
}
\ No newline at end of file
{
"dataset_name": "boston",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": 100,
"extracted_forest_size": [
10,
20,
30
],
"models_dir": ".\\models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 3,
"seeds": null,
"subsets_used": "train,dev",
"normalize_weights": false
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment