Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Project: luc.giffon/bolsonaro
Commits on Source (39)
Showing 55222 additions and 314 deletions
models/*
results/*
experiments/unnamed/
*/.kile/*
*.kilepr
@@ -80,9 +80,6 @@ target/
# Jupyter NB Checkpoints
.ipynb_checkpoints/
# exclude data from source control by default
/data/
# Mac OS-specific storage files
.DS_Store
@@ -371,6 +368,3 @@ TSWLatexianTemp*
*.lpz
reports/*.pdf
# Image
*.png
* Fix pickle loading of ModelRawResults, because saving the model_object leads to import issues.
* Fix ModelFactory.load function.
* Fix model results loading in compute_results.py.
* Check that the omp multiclass classifier is working as expected.
* In the bayesian search computation, output a different file name depending on the task of the trained model.
* Check the best params scores of the regressors (neg_mean_squared_error leads to huge negative values).
* Prepare the json experiment files to run.
\ No newline at end of file
* Fix the dataset fetcher error when job_number > 1.
\ No newline at end of file
@@ -14,10 +14,6 @@ class Dataset(object):
def task(self):
return self._task
@property
def dataset_parameters(self):
return self._dataset_parameters
@property
def X_train(self):
return self._X_train
......
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load
@@ -9,13 +10,38 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random
import pandas as pd
class DatasetLoader(object):
DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = 'standard'
DEFAULT_FOREST_SIZE = 100
DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 5
DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.1
DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_NORMALIZE_WEIGHTS = False
dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',
'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',
'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds']
dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,
'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,
'20newsgroups_vectorized':3, 'lfw_people':3,
'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3,
'diamonds': 15}
@staticmethod
def load(dataset_parameters):
name = dataset_parameters.name
X, y = None, None
if name == 'boston':
dataset_loading_func = load_boston
task = Task.REGRESSION
@@ -37,37 +63,52 @@ class DatasetLoader(object):
elif name == 'breast_cancer':
dataset_loading_func = change_binary_func_load(load_breast_cancer)
task = Task.BINARYCLASSIFICATION
elif name == 'olivetti_faces': # bug (no return X_y)
dataset_loading_func = fetch_olivetti_faces
task = Task.MULTICLASSIFICATION
elif name == '20newsgroups': # bug (no return X_y)
dataset_loading_func = fetch_20newsgroups
elif name == 'olivetti_faces':
dataset = fetch_olivetti_faces(random_state=dataset_parameters.random_state, shuffle=True)
task = Task.MULTICLASSIFICATION
X, y = dataset.data, dataset.target
elif name == '20newsgroups_vectorized':
dataset_loading_func = fetch_20newsgroups_vectorized
dataset = fetch_20newsgroups_vectorized()
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
elif name == 'lfw_people': # needs PIL (image dataset)
dataset_loading_func = fetch_lfw_people
elif name == 'lfw_people':
dataset = fetch_lfw_people()
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
elif name == 'lfw_pairs':
dataset_loading_func = fetch_lfw_pairs
dataset = fetch_lfw_pairs()
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
elif name == 'covtype':
dataset_loading_func = fetch_covtype
X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION
elif name == 'rcv1':
dataset_loading_func = fetch_rcv1
task = Task.MULTICLASSIFICATION
elif name == 'kddcup99':
dataset_loading_func = fetch_kddcup99
X, y = fetch_rcv1(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION
elif name == 'california_housing':
dataset_loading_func = fetch_california_housing
X, y = fetch_california_housing(return_X_y=True)
task = Task.REGRESSION
elif name == 'diamonds':
# TODO: make a proper fetcher instead of the following code
from sklearn.preprocessing import LabelEncoder
df = pd.read_csv('data/diamonds.csv')  # assumes the diamonds CSV is available under data/
df.drop(['Unnamed: 0'], axis=1, inplace=True)  # drop the CSV index column
df = df[(df[['x', 'y', 'z']] != 0).all(axis=1)]  # discard rows with a zero dimension (invalid entries)
df.drop(['x', 'y', 'z'], axis=1, inplace=True)
label_cut = LabelEncoder()
label_color = LabelEncoder()
label_clarity = LabelEncoder()
df['cut'] = label_cut.fit_transform(df['cut'])
df['color'] = label_color.fit_transform(df['color'])
df['clarity'] = label_clarity.fit_transform(df['clarity'])
X, y = df.drop(['price'], axis=1), df['price']
task = Task.REGRESSION
else:
raise ValueError("Unsupported dataset '{}'".format(name))
if X is None:
X, y = dataset_loading_func(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=dataset_parameters.test_size,
random_state=dataset_parameters.random_state)
@@ -92,3 +133,20 @@ class DatasetLoader(object):
return Dataset(task, X_train,
X_dev, X_test, y_train, y_dev, y_test)
@staticmethod
def load_default(dataset_name, seed):
begin_random_seed_range = 1
end_random_seed_range = 2000
seed = seed if seed is not None else random.randint(begin_random_seed_range, end_random_seed_range)
dataset_parameters = DatasetParameters(
name=dataset_name,
test_size=DatasetLoader.DEFAULT_TEST_SIZE,
dev_size=DatasetLoader.DEFAULT_DEV_SIZE,
random_state=seed,
dataset_normalizer=DatasetLoader.DEFAULT_DATASET_NORMALIZER
)
return DatasetLoader.load(dataset_parameters)
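# Usage sketch (dataset name and seed are illustrative):
#   dataset = DatasetLoader.load_default('boston', seed=42)
#   print(dataset.task, dataset.X_train.shape)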
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.data.task import Task
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import os
import pickle
@@ -11,22 +13,35 @@ class ModelFactory(object):
@staticmethod
def build(task, model_parameters):
if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
raise ValueError("Unsupported task '{}'".format(task))
if task == Task.BINARYCLASSIFICATION:
model_func = OmpForestBinaryClassifier
if model_parameters.extraction_strategy == 'omp':
return OmpForestBinaryClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
elif task == Task.REGRESSION:
model_func = OmpForestRegressor
if model_parameters.extraction_strategy == 'omp':
return OmpForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity':
return SimilarityForestRegressor(model_parameters)
else:
return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
elif task == Task.MULTICLASSIFICATION:
model_func = OmpForestMulticlassClassifier
if model_parameters.extraction_strategy == 'omp':
return OmpForestMulticlassClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
raise ValueError("Unsupported task '{}'".format(task))
return model_func(model_parameters)
@staticmethod
def load(task, directory_path, experiment_id, model_raw_results):
raise NotImplementedError
model_parameters = ModelParameters.load(directory_path, experiment_id)
model = ModelFactory.build(task, model_parameters)
# TODO: do what is needed here to properly restore the model
model.set_forest(model_raw_results.model_object.forest)
model.set_weights(model_raw_results.model_object.weights)
return model
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
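# Usage sketch (illustrative): for a regression task with extraction_strategy='omp',
# build() returns an OmpForestRegressor wrapping the given parameters:
#   model = ModelFactory.build(Task.REGRESSION, model_parameters)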
@@ -5,13 +5,15 @@ import os
class ModelParameters(object):
def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters):
def __init__(self, extracted_forest_size, normalize_D, subsets_used,
normalize_weights, seed, hyperparameters, extraction_strategy):
self._extracted_forest_size = extracted_forest_size
self._normalize_D = normalize_D
self._subsets_used = subsets_used
self._normalize_weights = normalize_weights
self._seed = seed
self._hyperparameters = hyperparameters
self._extraction_strategy = extraction_strategy
@property
def extracted_forest_size(self):
@@ -37,6 +39,10 @@ class ModelParameters(object):
def hyperparameters(self):
return self._hyperparameters
@property
def extraction_strategy(self):
return self._extraction_strategy
def save(self, directory_path, experiment_id):
save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
self.__dict__)
......
@@ -8,8 +8,8 @@ class ModelRawResults(object):
def __init__(self, model_object, training_time,
datetime, train_score, dev_score, test_score,
score_metric, train_score_regressor, dev_score_regressor,
test_score_regressor):
train_score_base, dev_score_base,
test_score_base, score_metric, base_score_metric):
self._model_object = model_object
self._training_time = training_time
@@ -17,10 +17,11 @@ class ModelRawResults(object):
self._train_score = train_score
self._dev_score = dev_score
self._test_score = test_score
self._train_score_base = train_score_base
self._dev_score_base = dev_score_base
self._test_score_base = test_score_base
self._score_metric = score_metric
self._train_score_regressor = train_score_regressor
self._dev_score_regressor = dev_score_regressor
self._test_score_regressor = test_score_regressor
self._base_score_metric = base_score_metric
@property
def model_object(self):
@@ -47,20 +48,24 @@
return self._test_score
@property
def score_metric(self):
return self._score_metric
def train_score_base(self):
return self._train_score_base
@property
def dev_score_base(self):
return self._dev_score_base
@property
def train_score_regressor(self):
return self._train_score_regressor
def test_score_base(self):
return self._test_score_base
@property
def dev_score_regressor(self):
return self._dev_score_regressor
def score_metric(self):
return self._score_metric
@property
def test_score_regressor(self):
return self._test_score_regressor
def base_score_metric(self):
return self._base_score_metric
def save(self, models_dir):
save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle',
......
@@ -17,10 +17,14 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
def models_parameters(self):
return self._models_parameters
def predict_base_estimator(self, X):
return self._base_forest_estimator.predict(X)
def score_base_estimator(self, X, y):
return self._base_forest_estimator.score(X, y)
def _base_estimator_predictions(self, X):
# Stack each tree's predictions as columns: shape (n_samples, n_trees).
# (For classifiers, predict_proba could be used here to get class probabilities instead.)
return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
@property
@@ -63,7 +67,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
if normalize_weights:
# we can normalize weights (by their sum) so that they sum to 1
# and they can be interpreted as impact percentages for interpretability.
# this requires removing the (-) sign from the weights, i.e. moving it to the predictions (use unsigned_coef)
# this requires removing the (-) sign from the weights, i.e. moving it to the predictions (use unsigned_coef) --> I don't see why
# question: I don't understand the nonzero trick?
# predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
......
@@ -60,7 +60,7 @@ class OmpForestMulticlassClassifier(OmpForest):
for class_label in possible_classes:
atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
objective_binary = binarize_class_data(objective, class_label, inplace=False)
# todo maybe consider that the forest size is global, so that only a fraction is available to each per-class OMP...
# TODO: maybe consider that the forest size is global, so that only a fraction is available to each per-class OMP...
omp_class = OrthogonalMatchingPursuit(
n_nonzero_coefs=self.models_parameters.extracted_forest_size,
fit_intercept=True, normalize=False)
@@ -69,7 +69,9 @@ class OmpForestMulticlassClassifier(OmpForest):
return self._dct_class_omp
def predict(self, X):
forest_predictions = self._base_estimator_predictions(X)
'''forest_predictions = self._base_estimator_predictions(X)
print(forest_predictions.shape)
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
@@ -79,9 +81,26 @@
for class_label, omp_class in self._dct_class_omp.items():
label_names.append(class_label)
atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False)
print(atoms_binary.shape)
preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))
# todo check that it is not buggy here
# TODO: check that it is not buggy here
preds = np.array(preds).T'''
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T  # shape: (n_classes, n_samples, n_trees)
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
label_names = []
preds = []
num_class = 0
for class_label, omp_class in self._dct_class_omp.items():
label_names.append(class_label)
atoms_binary = (forest_predictions[num_class] - 0.5) * 2  # rescale the 0/1 class probabilities to -1/+1
preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))
num_class += 1
preds = np.array(preds).T
max_preds = np.argmax(preds, axis=1)
@@ -97,6 +116,27 @@
return evaluation
@staticmethod
def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
if normalize_weights:
# we can normalize weights (by their sum) so that they sum to 1
# and they can be interpreted as impact percentages for interpretability.
# this requires removing the (-) sign from the weights, i.e. moving it to the predictions (use unsigned_coef) --> I don't see why
# question: I don't understand the nonzero trick?
# predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]  # add an axis so broadcasting is row-wise (avoids ambiguity when base_predictions is square)
unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
intercept = omp_obj.intercept_
adjusted_forest_predictions = base_predictions * coef_signs
predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
else:
predictions = omp_obj.predict(base_predictions)
return predictions
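# Numeric sketch of the sign shuffling above (hypothetical values): moving the signs onto
# the predictions leaves the dot product unchanged, since each sign squared equals 1:
#   coef_         = [ 0.6, -0.4],  intercept_ = 0.1
#   coef_signs    = [ 1., -1.]
#   unsigned_coef = [ 0.6,  0.4]   (non-negative, so normalized weights read as percentages)
#   (x * coef_signs) . unsigned_coef + 0.1  ==  x . coef_ + 0.1   for any row x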
if __name__ == "__main__":
forest = RandomForestClassifier(n_estimators=10)
......
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
"""
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
"""
def __init__(self, models_parameters):
self._models_parameters = models_parameters
self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
random_state=models_parameters.seed)
self._extracted_forest_size = self._models_parameters.extracted_forest_size
@property
def models_parameters(self):
return self._models_parameters
def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
self._regressor.fit(X_train, y_train)
y_val_pred = self._regressor.predict(X_val)
forest_pred = score_metric(y_val, y_val_pred)  # reference score of the full forest on the validation set
forest = self._regressor.estimators_
selected_trees = list()
tree_list = list(self._regressor.estimators_)
for _ in range(self._extracted_forest_size):
best_similarity = float('inf')  # track the smallest score change seen so far
found_index = 0
for i in range(len(tree_list)):
lonely_tree = tree_list[i]
del tree_list[i]
val_list = list()
for tree in tree_list:
val_pred = tree.predict(X_val)
val_list.append(val_pred)
val_list = np.array(val_list)
val_mean = np.mean(val_list, axis=0)
val_score = score_metric(val_mean, y_val)
temp_similarity = abs(forest_pred - val_score)  # change in validation score caused by removing tree i
if (temp_similarity < best_similarity):
found_index = i
best_similarity = temp_similarity
tree_list.insert(i, lonely_tree)
selected_trees.append(tree_list[found_index])
del tree_list[found_index]
# drop the selected trees: as written, they are the ones whose removal changes the full-forest validation score the least
pruned_forest = list(set(forest) - set(selected_trees))
self._regressor.estimators_ = pruned_forest
def score(self, X, y):
test_list = list()
for mod in self._regressor.estimators_:
test_pred = mod.predict(X)
test_list.append(test_pred)
test_list = np.array(test_list)
test_mean = np.mean(test_list, axis=0)
score = mean_squared_error(test_mean, y)
return score
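# Minimal usage sketch (parameter values and the X_*/y_* names are illustrative, not from the original):
#   params = ModelParameters(extracted_forest_size=10, normalize_D=False,
#                            subsets_used='train,dev', normalize_weights=False, seed=42,
#                            hyperparameters={'n_estimators': 100}, extraction_strategy='similarity')
#   model = SimilarityForestRegressor(params)
#   model.fit(X_train, y_train, X_dev, y_dev)  # note: as written above, fit() removes extracted_forest_size trees
#   mse = model.score(X_test, y_test)          # MSE of the pruned forest's mean prediction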
from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
from . import LOG_PATH
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import time
import datetime
import numpy as np
@@ -12,16 +18,41 @@ class Trainer(object):
Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method.
"""
def __init__(self, dataset):
def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score,
base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score):
"""
:param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and y_test attributes
"""
self._dataset = dataset
self._logger = LoggerFactory.create(LOG_PATH, __name__)
self._regression_score_metric = regression_score_metric
self._classification_score_metric = classification_score_metric
self._base_regression_score_metric = base_regression_score_metric
self._base_classification_score_metric = base_classification_score_metric
self._score_metric_name = regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
else classification_score_metric.__name__
self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
else base_classification_score_metric.__name__
def init(self, model):
if model.models_parameters.subsets_used == 'train,dev':
@property
def score_metric_name(self):
return self._score_metric_name
@property
def base_score_metric_name(self):
return self._base_score_metric_name
def init(self, model, subsets_used='train,dev'):
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
if subsets_used == 'train,dev':
self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train
else:
self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
self._logger.debug('Fitting the forest on train subset')
elif model.models_parameters.subsets_used == 'train,dev':
self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train
self._X_omp = self._dataset.X_dev
@@ -43,43 +74,77 @@ class Trainer(object):
def train(self, model):
"""
:param model: Object with
:param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
:return:
"""
self._logger.debug('Training model using train set...')
self._begin_time = time.time()
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
model.fit(
X=self._X_forest,
y=self._y_forest
)
else:
model.fit(
X_forest=self._X_forest,
y_forest=self._y_forest,
X_omp=self._X_omp,
y_omp=self._y_omp
self._X_forest,
self._y_forest,
self._X_omp,
self._y_omp
)
self._end_time = time.time()
def __score_func(self, model, X, y_true):
if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
y_pred = model.predict(X)
result = self._regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
y_pred = model.predict(X)
if type(model) is OmpForestBinaryClassifier:
y_pred = y_pred.round()
result = self._classification_score_metric(y_true, y_pred)
return result
def __score_func_base(self, model, X, y_true):
if type(model) == OmpForestRegressor:
y_pred = model.predict_base_estimator(X)
result = self._base_regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
y_pred = model.predict_base_estimator(X)
result = self._base_classification_score_metric(y_true, y_pred)
elif type(model) == RandomForestClassifier:
y_pred = model.predict(X)
result = self._base_classification_score_metric(y_true, y_pred)
elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
y_pred = model.predict(X)
result = self._base_regression_score_metric(y_true, y_pred)
return result
def compute_results(self, model, models_dir):
"""
:param model: The trained model to evaluate
:param models_dir: Where the results will be saved
"""
results = ModelRawResults(
model_object=model,
model_object='',  # the model object itself is no longer pickled: saving it led to import issues (see TODO.md)
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=model.score(self._dataset.X_train, self._dataset.y_train),
dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
test_score=model.score(self._dataset.X_test, self._dataset.y_test),
score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test)
train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev),
test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test),
train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
score_metric=self._score_metric_name,
base_score_metric=self._base_score_metric_name
)
results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_regressor))
self._logger.info("Base performance on test: {}".format(results.test_score_base))
self._logger.info("Performance on test: {}".format(results.test_score))
self._logger.info("Base performance on train: {}".format(results.train_score_regressor))
self._logger.info("Base performance on train: {}".format(results.train_score_base))
self._logger.info("Performance on train: {}".format(results.train_score))
self._logger.info("Base performance on dev: {}".format(results.dev_score_regressor))
self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
@@ -2,6 +2,8 @@ import os
import json
import pickle
from copy import deepcopy
import contextlib
import joblib
def resolve_experiment_id(models_dir):
@@ -58,7 +60,6 @@ def binarize_class_data(data, class_pos, inplace=True):
"""
if not inplace:
data = deepcopy(data)
position_class_labels = (data == class_pos)
data[~(position_class_labels)] = -1
data[(position_class_labels)] = +1
@@ -66,10 +67,48 @@ def binarize_class_data(data, class_pos, inplace=True):
return data
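# Example of the transformation above:
#   binarize_class_data(np.array([0, 1, 2, 1]), class_pos=1)  ->  array([-1, 1, -1, 1])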
def change_binary_func_load(base_load_function):
def func_load(return_X_y):
def func_load(return_X_y, random_state=None):
if random_state is not None:
X, y = base_load_function(return_X_y=return_X_y, random_state=random_state)
else:
X, y = base_load_function(return_X_y=return_X_y)
possible_classes = sorted(set(y))
assert len(possible_classes) == 2, "Function change_binary_func_load only works for binary classification"
y = binarize_class_data(y, possible_classes[-1])
return X, y
return func_load
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
"""Context manager to patch joblib to report into tqdm progress bar given as argument"""
class TqdmBatchCompletionCallback:
def __init__(self, time, index, parallel):
self.index = index
self.parallel = parallel
def __call__(self, index):
tqdm_object.update()
if self.parallel._original_iterator is not None:
self.parallel.dispatch_next()
old_batch_callback = joblib.parallel.BatchCompletionCallBack
joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
try:
yield tqdm_object
finally:
joblib.parallel.BatchCompletionCallBack = old_batch_callback
tqdm_object.close()
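# Usage sketch (mirrors how train.py and the hyperparameter search call it):
#   with tqdm_joblib(tqdm(total=len(seeds))) as progress_bar:
#       results = Parallel(n_jobs=-1)(delayed(process_job)(seed) for seed in seeds)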
def is_int(value):
try:
int(value)
return True
except ValueError:
return False
def is_float(value):
try:
float(value)
return True
except ValueError:
return False
@@ -57,7 +57,56 @@ class Plotter(object):
ax.plot(x_value, mean, c=color_mean, label=label)
@staticmethod
def plot_losses(file_path, all_experiment_scores, x_value, xlabel, ylabel, all_labels, title):
def plot_stage1_losses(file_path, all_experiment_scores_with_params,
all_experiment_scores_wo_params, x_value, xlabel, ylabel, all_labels, title):
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True)
n = len(all_experiment_scores_with_params)
if n != len(all_experiment_scores_wo_params):
raise ValueError('all_experiment_scores_with_params and all_experiment_scores_wo_params must have the same length to be compared.')
"""
Get as many different colors from the specified cmap (here nipy_spectral)
as there are curves to plot.
"""
colors = Plotter.get_colors_from_cmap(n)
for j, all_experiment_scores in enumerate([all_experiment_scores_with_params,
all_experiment_scores_wo_params]):
# For each curve to plot
for i in range(n):
# Retrieve the scores in a list for each seed
experiment_scores = list(all_experiment_scores[i].values())
# Compute the mean and the std for the CI
mean_experiment_scores = np.average(experiment_scores, axis=0)
std_experiment_scores = np.std(experiment_scores, axis=0)
# Plot the score curve with the CI
Plotter.plot_mean_and_CI(
ax=axes[j],
mean=mean_experiment_scores,
lb=mean_experiment_scores + std_experiment_scores,
ub=mean_experiment_scores - std_experiment_scores,
x_value=x_value,
color_mean=colors[i],
facecolor=colors[i],
label=all_labels[i]
)
axes[0].set_xlabel(xlabel)
axes[1].set_xlabel(xlabel)
axes[0].set_ylabel(ylabel)
axes[1].set_title(title)
handles, labels = axes[0].get_legend_handles_labels()
legend = axes[0].legend(handles, labels, loc='upper center', bbox_to_anchor=(1.1, -0.15))
fig.savefig(file_path, dpi=fig.dpi, bbox_extra_artists=(legend,), bbox_inches='tight')
plt.close(fig)
@staticmethod
def plot_stage2_losses(file_path, all_experiment_scores, x_value,
xlabel, ylabel, all_labels, title):
fig, ax = plt.subplots()
n = len(all_experiment_scores)
@@ -91,7 +140,7 @@ class Plotter(object):
plt.ylabel(ylabel)
plt.title(title)
plt.legend(loc='upper right')
fig.savefig(file_path, dpi=fig.dpi)
fig.savefig(file_path, dpi=fig.dpi, bbox_inches='tight')
plt.close(fig)
@staticmethod
......
@@ -4,7 +4,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
from bolsonaro.utils import save_obj_to_json
from bolsonaro.utils import save_obj_to_json, tqdm_joblib, is_int, is_float
import argparse
import os
@@ -12,13 +12,20 @@ import pathlib
import pickle
import random
from dotenv import find_dotenv, load_dotenv
from joblib import Parallel, delayed
from tqdm import tqdm
import threading
import numpy as np
import math
from collections import Counter
from itertools import chain, combinations
"""
I had to install skopt from this repository
https://github.com/darenr/scikit-optimize that handles
the issue described here https://github.com/scikit-optimize/scikit-optimize/issues/762.
"""
from skopt.space import Categorical, Integer, Real
from skopt.space import Categorical, Integer
def clean_numpy_int_dict(dictionary):
@@ -34,6 +41,89 @@ def clean_numpy_int_list(list_n):
clean_numpy_int_list(elem) if type(elem) == list else elem
for elem in list_n]
def process_job(dataset_name, seed, param_space, args):
logger = LoggerFactory.create(LOG_PATH, 'hyperparameter-searcher_seed{}_ti{}'.format(
seed, threading.get_ident()))
logger.info('seed={}'.format(seed))
dataset = DatasetLoader.load_default(dataset_name, seed)
if dataset.task == Task.REGRESSION:
scorer = 'neg_mean_squared_error'
else:
scorer = 'accuracy'
bayesian_searcher = HyperparameterSearcher()
opt = bayesian_searcher.search(dataset, param_space, args.n_iter,
args.cv, seed, scorer)
return {
'_scorer': scorer,
'_best_score_train': opt.best_score_,
'_best_score_test': opt.score(dataset.X_test, dataset.y_test),
'_best_parameters': clean_numpy_int_dict(opt.best_params_),
'_random_seed': seed
}
def run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args):
# Run one hyperparameter search job per seed
with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
opt_results = Parallel(n_jobs=args.job_number)(delayed(process_job)(
dataset_name, seeds[i], param_space, args) for i in range(len(seeds)))
return opt_results
def compute_best_params_over_seeds(seeds, dataset_name, param_space, args):
opt_results = run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args)
# Move k best_parameters to a list of dict
all_best_params = [opt_result['_best_parameters'] for opt_result in opt_results]
"""
list of hyperparam dicts -> list of hyperparam list
where each element of form 'key:value' becomes 'key_value'
to afterwards count most common pairs.
"""
stringify_best_params = list()
for current_best_params in all_best_params:
new_best_params = list()
for key, value in current_best_params.items():
new_best_params.append(key + '_' + str(value))
stringify_best_params.append(new_best_params)
# Compute pair combinations
pair_combinations = chain.from_iterable(combinations(line, 2) for line in stringify_best_params)
# Count most common pair combinations in ascent order
most_common_pair_combinations = Counter(pair_combinations).most_common()
"""
Select the most frequent hyperparameter values
until all different hyperparameter variables are
filled.
"""
all_param_names = all_best_params[0].keys()
best_params = dict()
for pair, _ in most_common_pair_combinations:
for element in pair:
split = element.split('_')
param, value = '_'.join(split[:-1]), split[-1]
if param not in best_params:
if is_int(value):
value = int(value)
elif is_float(value):
value = float(value)
best_params[param] = value
if len(best_params) == len(all_param_names):
break
return {
'_scorer': opt_results[0]['_scorer'],
'_best_score_train': np.mean([opt_result['_best_score_train'] for opt_result in opt_results]),
'_best_score_test': np.mean([opt_result['_best_score_test'] for opt_result in opt_results]),
'_best_parameters': best_params,
'_random_seed': [opt_result['_random_seed'] for opt_result in opt_results]
}
if __name__ == "__main__":
# get environment variables in .env
@@ -41,57 +131,54 @@ if __name__ == "__main__":
DEFAULT_CV = 3
DEFAULT_N_ITER = 50
DEFAULT_VERBOSE = False
DEFAULT_JOB_NUMBER = -1
DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
'min_samples_leaf': Integer(1, 1000),
'max_depth': Integer(1, 20),
'max_features': Categorical(['auto', 'sqrt', 'log2'], [0.5, 0.25, 0.25])}
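# In skopt's Categorical, the second positional argument is the prior:
# 'auto' is sampled with probability 0.5, 'sqrt' and 'log2' with 0.25 each.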
DATASET_LIST = ['boston', 'iris', 'diabetes']
# , 'digits', 'linnerud', 'wine']
begin_random_seed_range = 1
end_random_seed_range = 2000
DEFAULT_USE_VARIABLE_SEED_NUMBER = False
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--cv', nargs='?', type=int, default=DEFAULT_CV, help='Specify the number of cross-validation folds.')
parser.add_argument('--n_iter', nargs='?', type=int, default=DEFAULT_N_ITER, help='Specify the number of iterations for the bayesian search.')
parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.')
parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the dataset used by the estimator.')
parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly')
parser.add_argument('--use_variable_seed_number', action='store_true', default=DEFAULT_USE_VARIABLE_SEED_NUMBER, help='Compute the number of random seeds depending on the dataset.')
parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the dataset used by the estimator.')
parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of jobs used for the parallelisation across seeds.')
args = parser.parse_args()
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
begin_random_seed_range = 1
end_random_seed_range = 2000
if args.seeds is not None and args.random_seed_number > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
if args.seed is None:
random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
else:
random_seed = args.seed
# Seeds are either provided as parameters or generated at random
if not args.use_variable_seed_number:
seeds = args.seeds if args.seeds is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(args.random_seed_number)]
for dataset_name in args.datasets:
dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
logger.info('Bayesian search on dataset {}'.format(dataset_name))
dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None)
dataset = DatasetLoader.load(dataset_parameters)
if dataset.task == Task.REGRESSION:
scorer = 'neg_mean_squared_error'
else:
scorer = 'accuracy'
bayesian_searcher = HyperparameterSearcher()
opt = bayesian_searcher.search(dataset, DICT_PARAM_SPACE, args.n_iter,
args.cv, random_seed, scorer, args.verbose)
"""
Compute the number of random seeds as specified in
DatasetLoader.dataset_seed_numbers dictionary, depending on
the dataset.
"""
if args.use_variable_seed_number:
seeds = [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(DatasetLoader.dataset_seed_numbers[dataset_name])]
dict_results = {'_scorer': scorer,
'_best_score_train': opt.best_score_,
'_best_score_test': opt.score(dataset.X_test, dataset.y_test),
'_best_parameters': clean_numpy_int_dict(opt.best_params_),
'_random_seed': random_seed
}
dict_results = compute_best_params_over_seeds(seeds, dataset_name,
DICT_PARAM_SPACE, args)
save_obj_to_json(os.path.join(dataset_dir, 'params.json'), dict_results)
This diff is collapsed.
@@ -3,7 +3,7 @@ from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
@@ -13,9 +13,12 @@ import json
import pathlib
import random
import os
from concurrent import futures
from joblib import Parallel, delayed
import threading
import json
from tqdm import tqdm
import numpy as np
import shutil
def process_job(seed, parameters, experiment_id, hyperparameters):
@@ -51,10 +54,10 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer = Trainer(dataset)
if parameters['extraction_strategy'] != 'none':
for extracted_forest_size in parameters['extracted_forest_size']:
# question if training is too long, one may also split experiments for different forest sizes into different workers
logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
@@ -63,52 +66,104 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.init(model)
trainer.init(model, subsets_used=parameters['subsets_used'])
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
else:
forest_size = hyperparameters['n_estimators']
logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
extracted_forest_size=forest_size,
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.init(model, subsets_used=parameters['subsets_used'])
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
logger.info('Training done')
"""
Command-line examples for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing
Command-line examples for stage 2:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing
Command-line examples for stage 3:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing
Command-line examples for stage 4:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 4 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 4 omp_with_params --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/compute_results.py --stage 4 --experiment_ids 1 2 3 --dataset_name=california_housing
"""
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = None
DEFAULT_FOREST_SIZE = 100
DEFAULT_EXTRACTED_FOREST_SIZE = 10
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_NORMALIZE_WEIGHTS = False
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_sizes/{extracted_forest_size}
DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
DEFAULT_VERBOSE = False
DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1
DEFAULT_EXTRACTION_STRATEGY = 'omp'
begin_random_seed_range = 1
end_random_seed_range = 2000
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--experiment_id', nargs='?', type=int, default=None, help='Specify an experiment id. Remove already existing model with this specified experiment id.')
parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.')
parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly')
parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
parser.add_argument('--subsets_used', nargs='?', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of jobs used for the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
args = parser.parse_args()
if args.experiment_configuration:
@@ -118,26 +173,43 @@ if __name__ == "__main__":
else:
parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
# The number of trees to extract from the forest (K)
parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
if type(parameters['extracted_forest_size']) == list \
else [parameters['extracted_forest_size']]
hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
if os.path.exists(hyperparameters_path):
logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
with open(hyperparameters_path, 'r+') as file_hyperparameter:
hyperparameters = json.load(file_hyperparameter)['best_parameters']
loaded_hyperparameters = json.load(file_hyperparameter)['best_parameters']
if args.skip_best_hyperparams:
hyperparameters = {'n_estimators': loaded_hyperparameters['n_estimators']}
else:
hyperparameters = loaded_hyperparameters
else:
hyperparameters = {}
if parameters['forest_size'] is not None:
"""
First case: no best hyperparameters are specified and no forest_size parameter
specified in argument, so use the DEFAULT_FOREST_SIZE.
Second case: no matter if hyperparameters are specified, the forest_size parameter
will override it.
Third implicit case: use the number of estimators found in specified hyperparameters.
"""
if len(hyperparameters) == 0 and parameters['forest_size'] is None:
hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
elif parameters['forest_size'] is not None:
hyperparameters['n_estimators'] = parameters['forest_size']
# The number of trees to extract from the forest (K)
parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
np.linspace(0, args.extracted_forest_size_stop,
parameters['extracted_forest_size_samples'] + 1,
endpoint=False)[1:]).astype(np.int)).tolist()
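# Worked example (values assumed for illustration): with n_estimators=100,
# extracted_forest_size_stop=0.1 and extracted_forest_size_samples=5:
#   np.linspace(0, 0.1, 6, endpoint=False)[1:] -> [0.0167, 0.0333, 0.05, 0.0667, 0.0833]
#   extracted_forest_size -> [2, 3, 5, 7, 8]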
if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
@@ -146,6 +218,10 @@ if __name__ == "__main__":
else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(parameters['random_seed_number'])]
if args.experiment_id:
experiment_id = args.experiment_id
shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
else:
# Resolve the next experiment id number (last id + 1)
experiment_id = resolve_experiment_id(parameters['models_dir'])
logger.info('Experiment id: {}'.format(experiment_id))
@@ -153,18 +229,32 @@
"""
If the experiment configuration isn't coming from
an already existing file, save it to a json file to
keep trace of it.
keep trace of it (either a specified path, either in 'unnamed' dir.).
"""
if args.experiment_configuration is None:
with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format(
experiment_id), 'w') as output_file:
if args.save_experiment_configuration:
if len(args.save_experiment_configuration) != 2:
raise ValueError('save_experiment_configuration must have two parameters.')
elif int(args.save_experiment_configuration[0]) not in list(range(1, 6)):
raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 5]).')
output_experiment_stage_path = os.path.join(args.experiment_configuration_path,
args.dataset_name, 'stage' + args.save_experiment_configuration[0])
pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True)
output_experiment_configuration_path = os.path.join(output_experiment_stage_path,
args.save_experiment_configuration[1] + '.json')
else:
pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
output_experiment_configuration_path = os.path.join(
args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format(
experiment_id))
with open(output_experiment_configuration_path, 'w') as output_file:
json.dump(
parameters,
output_file,
indent=4
)
# Run one training job per seed
with futures.ProcessPoolExecutor(len(seeds)) as executor:
list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
parameters, experiment_id, hyperparameters) for seed in seeds))
# Run one job per seed
with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i],
parameters, experiment_id, hyperparameters) for i in range(len(seeds)))
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: linux-64
_libgcc_mutex=0.1=main
alabaster=0.7.12=pypi_0
attrs=19.3.0=pypi_0
autopep8=1.4.4=py_0
awscli=1.16.273=pypi_0
babel=2.7.0=pypi_0
backcall=0.1.0=pypi_0
blas=1.0=mkl
bleach=3.1.0=pypi_0
botocore=1.13.9=pypi_0
ca-certificates=2019.10.16=0
certifi=2019.9.11=py37_0
chardet=3.0.4=pypi_0
click=7.0=pypi_0
colorama=0.4.1=pypi_0
coverage=4.5.4=pypi_0
cycler=0.10.0=py_2
dbus=1.13.6=he372182_0
decorator=4.4.1=pypi_0
defusedxml=0.6.0=pypi_0
docutils=0.15.2=pypi_0
entrypoints=0.3=pypi_0
expat=2.2.5=he1b5a44_1004
flake8=3.7.9=pypi_0
fontconfig=2.13.1=he4413a7_1000
freetype=2.10.0=he983fc9_1
gettext=0.19.8.1=hc5be6a0_1002
glib=2.58.3=h6f030ca_1002
gst-plugins-base=1.14.5=h0935bb2_0
gstreamer=1.14.5=h36ae1b5_0
icu=58.2=hf484d3e_1000
idna=2.8=pypi_0
imagesize=1.1.0=pypi_0
importlib-metadata=0.23=pypi_0
intel-openmp=2019.4=243
ipykernel=5.1.3=pypi_0
ipython=7.9.0=pypi_0
ipython-genutils=0.2.0=pypi_0
ipywidgets=7.5.1=pypi_0
jedi=0.15.1=pypi_0
jinja2=2.10.3=pypi_0
jmespath=0.9.4=pypi_0
joblib=0.14.0=py_0
jpeg=9c=h14c3975_1001
jsonschema=3.1.1=pypi_0
jupyter=1.0.0=pypi_0
jupyter-client=5.3.4=pypi_0
jupyter-console=6.0.0=pypi_0
jupyter-core=4.6.1=pypi_0
kiwisolver=1.1.0=py37hc9558a2_0
libedit=3.1.20181209=hc058e9b_0
libffi=3.2.1=hd88cf55_4
libgcc-ng=9.1.0=hdf63c60_0
libgfortran-ng=7.3.0=hdf63c60_0
libiconv=1.15=h516909a_1005
libpng=1.6.37=hed695b0_0
libstdcxx-ng=9.1.0=hdf63c60_0
libuuid=2.32.1=h14c3975_1000
libxcb=1.13=h14c3975_1002
libxml2=2.9.9=h13577e0_2
markupsafe=1.1.1=pypi_0
matplotlib=3.1.1=pypi_0
mccabe=0.6.1=pypi_0
mistune=0.8.4=pypi_0
mkl=2019.4=243
mkl-service=2.3.0=py37he904b0f_0
mkl_fft=1.0.14=py37ha843d7b_0
mkl_random=1.1.0=py37hd6b4f25_0
more-itertools=7.2.0=pypi_0
nbconvert=5.6.1=pypi_0
nbformat=4.4.0=pypi_0
ncurses=6.1=he6710b0_1
notebook=6.0.2=pypi_0
numpy=1.17.2=py37haad9e8e_0
numpy-base=1.17.2=py37hde5b4d6_0
openssl=1.1.1d=h7b6447c_3
packaging=19.2=pypi_0
pandas=0.25.2=py37he6710b0_0
pandocfilters=1.4.2=pypi_0
parso=0.5.1=pypi_0
pcre=8.43=he1b5a44_0
pexpect=4.7.0=pypi_0
pickleshare=0.7.5=pypi_0
pip=19.3.1=py37_0
prometheus-client=0.7.1=pypi_0
prompt-toolkit=2.0.10=pypi_0
pthread-stubs=0.4=h14c3975_1001
ptyprocess=0.6.0=pypi_0
pyaml=19.4.1=pypi_0
pyasn1=0.4.7=pypi_0
pycodestyle=2.5.0=pypi_0
pyflakes=2.1.1=pypi_0
pygments=2.4.2=pypi_0
pyparsing=2.4.3=pypi_0
pyqt=5.9.2=py37hcca6a23_4
pyrsistent=0.15.5=pypi_0
python=3.7.4=h265db76_1
python-dateutil=2.8.0=py37_0
python-dotenv=0.10.3=pypi_0
pytz=2019.3=py_0
pyyaml=5.1.2=pypi_0
pyzmq=18.1.0=pypi_0
qt=5.9.7=h52cfd70_2
qtconsole=4.5.5=pypi_0
readline=7.0=h7b6447c_5
requests=2.22.0=pypi_0
rsa=3.4.2=pypi_0
s3transfer=0.2.1=pypi_0
scikit-learn=0.21.3=py37hd81dba3_0
scikit-optimize=0.6+19.g180d6be=pypi_0
scipy=1.3.1=py37h7c811a0_0
send2trash=1.5.0=pypi_0
setuptools=41.6.0=py37_0
sip=4.19.8=py37hf484d3e_1000
six=1.12.0=py37_0
snowballstemmer=2.0.0=pypi_0
sphinx=2.2.1=pypi_0
sphinxcontrib-applehelp=1.0.1=pypi_0
sphinxcontrib-devhelp=1.0.1=pypi_0
sphinxcontrib-htmlhelp=1.0.2=pypi_0
sphinxcontrib-jsmath=1.0.1=pypi_0
sphinxcontrib-qthelp=1.0.2=pypi_0
sphinxcontrib-serializinghtml=1.1.3=pypi_0
sqlite=3.30.1=h7b6447c_0
terminado=0.8.2=pypi_0
testpath=0.4.4=pypi_0
tk=8.6.8=hbc83047_0
tornado=6.0.3=py37h516909a_0
tqdm=4.37.0=pypi_0
traitlets=4.3.3=pypi_0
urllib3=1.25.6=pypi_0
wcwidth=0.1.7=pypi_0
webencodings=0.5.1=pypi_0
wheel=0.33.6=py37_0
widgetsnbextension=3.5.1=pypi_0
xorg-libxau=1.0.9=h14c3975_0
xorg-libxdmcp=1.1.3=h516909a_0
xz=5.2.4=h14c3975_4
zipp=0.6.0=pypi_0
zlib=1.2.11=h7b6447c_3
This diff is collapsed.
{
"experiment_id": 1,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"none_with_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 4,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"none_wo_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file