Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision

Target

Select target project
  • luc.giffon/bolsonaro
1 result
Select Git revision
Show changes
Commits on Source (61)
Showing
with 607 additions and 503 deletions
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load
from bolsonaro.utils import change_binary_func_load, change_binary_func_openml
from sklearn.datasets import load_boston, load_iris, load_diabetes, \
load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing, \
fetch_openml
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random
......@@ -30,13 +31,15 @@ class DatasetLoader(object):
dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',
'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',
'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds']
'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds', 'steel-plates',
'kr-vs-kp', 'kin8nm', 'spambase', 'musk', 'gamma']
dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,
'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,
'20newsgroups_vectorized':3, 'lfw_people':3,
'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3,
'diamonds': 15}
'diamonds': 15, 'steel-plates': 15, 'kr-vs-kp': 15, 'kin8nm': 15,
'spambase': 15, 'musk': 15, 'gamma': 15}
@staticmethod
def load(dataset_parameters):
......@@ -103,6 +106,24 @@ class DatasetLoader(object):
df['clarity'] = label_clarity.fit_transform(df['clarity'])
X, y = df.drop(['price'], axis=1), df['price']
task = Task.REGRESSION
elif name == 'steel-plates':
dataset_loading_func = change_binary_func_openml('steel-plates-fault')
task = Task.BINARYCLASSIFICATION
elif name == 'kr-vs-kp':
dataset_loading_func = change_binary_func_openml('kr-vs-kp')
task = Task.BINARYCLASSIFICATION
elif name == 'kin8nm':
X, y = fetch_openml('kin8nm', return_X_y=True)
task = Task.REGRESSION
elif name == 'spambase':
dataset_loading_func = change_binary_func_openml('spambase')
task = Task.BINARYCLASSIFICATION
elif name == 'musk':
dataset_loading_func = change_binary_func_openml('musk')
task = Task.BINARYCLASSIFICATION
elif name == 'gamma':
dataset_loading_func = change_binary_func_openml('MagicTelescope')
task = Task.BINARYCLASSIFICATION
else:
raise ValueError("Unsupported dataset '{}'".format(name))
......
from bolsonaro.utils import tqdm_joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm
class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Random forest regressor pruned by k-means clustering of its trees.

    Follows 'On extreme pruning of random forest ensembles for real-time
    predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and
    Eyad Elyan: the trees are clustered according to their predictions on the
    training set and a single tree is kept per cluster.
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        """
        :param models_parameters: object providing `hyperparameters` (keyword
            arguments of the RandomForestRegressor), `seed` and
            `extracted_forest_size` (the number of clusters, hence of kept trees)
        :param score_metric: callable `(y_true, y_pred) -> float` used to rank
            the trees; it is treated as a loss (lower is better), like the
            default `mean_squared_error`
        """
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val):
        """
        Fit the full forest on the training data, then prune it.

        :param X_train: training inputs used to fit the forest and to cluster the trees
        :param y_train: training targets
        :param X_val: validation inputs used to pick one tree per cluster
        :param y_val: validation targets
        :return: self
        """
        self._estimator.fit(X_train, y_train)

        # One row per tree: its predictions on the training set are the
        # features used to cluster the trees.
        predictions = np.array([tree.predict(X_train) for tree in self._estimator.estimators_])

        kmeans = KMeans(n_clusters=self._extracted_forest_size,
            random_state=self._models_parameters.seed).fit(predictions)
        labels = np.array(kmeans.labels_)

        # For each cluster select the best tree on the validation set
        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
            pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
                cluster_label, labels, X_val, y_val, self._score_metric)
                for cluster_label in range(self._extracted_forest_size))

        # From here on the underlying forest only holds the selected trees.
        self._estimator.estimators_ = pruned_forest
        return self

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        """
        Return the tree of cluster `c` with the lowest validation loss.

        :param prune_forest_job_pb: progress bar updated once the cluster is processed
        :param c: label of the cluster to process
        :param labels: cluster label of every tree of the forest
        :param X_val: validation inputs
        :param y_val: validation targets
        :param score_metric: loss `(y_true, y_pred) -> float`
        """
        index = np.where(labels == c)[0]
        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val,
                y_val, score_metric) for tree_index in index)
        # The metric is a loss, so the best tree is the one minimising it
        # (argmax would keep the worst tree of the cluster).
        best_tree_index = np.argmin(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
        """
        Compute the validation loss of the `i`-th tree of the forest.

        :param cluster_job_pb: progress bar updated once the tree is scored
        :param i: index of the tree in the forest
        :return: `score_metric(y_val, predictions of the tree on X_val)`
        """
        y_val_pred = self._estimator.estimators_[i].predict(X_val)
        tree_pred = score_metric(y_val, y_val_pred)
        cluster_job_pb.update()
        return tree_pred

    def predict(self, X):
        """Predict with the underlying forest (the pruned one once `fit` has run)."""
        return self._estimator.predict(X)

    def score(self, X, y):
        """
        Score the average of the per-tree predictions against `y`.

        :return: `score_metric(y, mean prediction of the current trees)`
        """
        predictions = np.array([tree.predict(X) for tree in self._estimator.estimators_])
        mean_predictions = np.mean(predictions, axis=0)
        # Same (y_true, y_pred) argument order as in `_cluster_job`.
        return self._score_metric(y, mean_predictions)

    def predict_base_estimator(self, X):
        # NOTE(review): `fit` overwrites `estimators_` with the pruned trees, so
        # after fitting this returns the same predictions as `predict`, not
        # those of the full forest -- confirm this is what callers expect.
        return self._estimator.predict(X)
......@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.data.task import Task
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
......@@ -22,9 +23,11 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
elif task == Task.REGRESSION:
if model_parameters.extraction_strategy == 'omp':
return OmpForestRegressor(model_parameters)
......@@ -33,15 +36,21 @@ class ModelFactory(object):
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity':
return SimilarityForestRegressor(model_parameters)
else:
elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'none':
return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
elif task == Task.MULTICLASSIFICATION:
if model_parameters.extraction_strategy == 'omp':
return OmpForestMulticlassClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
......@@ -6,12 +6,12 @@ import datetime
class ModelRawResults(object):
def __init__(self, model_object, training_time,
def __init__(self, model_weights, training_time,
datetime, train_score, dev_score, test_score,
train_score_base, dev_score_base,
test_score_base, score_metric, base_score_metric):
self._model_object = model_object
self._model_weights = model_weights
self._training_time = training_time
self._datetime = datetime
self._train_score = train_score
......@@ -24,8 +24,8 @@ class ModelRawResults(object):
self._base_score_metric = base_score_metric
@property
def model_object(self):
return self.model_object
def model_weights(self):
    """Return the model weights held in the private `_model_weights` attribute."""
    # Read the backing attribute: returning `self.model_weights` would call
    # this accessor again and recurse until RecursionError.
    return self._model_weights
@property
def training_time(self):
......@@ -68,6 +68,8 @@ class ModelRawResults(object):
return self._base_score_metric
def save(self, models_dir):
    """
    Pickle the attributes of this object into `models_dir`.

    The attribute dictionary (`self.__dict__`) is written to
    `<models_dir>/model_raw_results.pickle`; the directory is created if needed.

    :param models_dir: directory receiving the pickle file
    """
    # makedirs(exist_ok=True) also creates missing parent directories and does
    # not fail when the directory appears between the check and the creation.
    os.makedirs(models_dir, exist_ok=True)
    save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle',
        self.__dict__)
......
......@@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator
class OmpForest(BaseEstimator, metaclass=ABCMeta):
def __init__(self, models_parameters, base_forest_estimator):
self._base_forest_estimator = base_forest_estimator
self._models_parameters = models_parameters
......@@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
return self._base_forest_estimator.score(X, y)
def _base_estimator_predictions(self, X):
# We need to use predict_proba to get the probabilities of each class
return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
@property
......@@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
# sklearn baseestimator api methods
def fit(self, X_forest, y_forest, X_omp, y_omp):
# print(y_forest.shape)
# print(set([type(y) for y in y_forest]))
self._base_forest_estimator.fit(X_forest, y_forest)
self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit
return self
......@@ -96,6 +98,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
pass
class SingleOmpForest(OmpForest):
def __init__(self, models_parameters, base_forest_estimator):
# fit_intercept shouldn't be set to False as the data isn't necessarily centered here
# normalization is handled outside OMP
......@@ -123,3 +126,24 @@ class SingleOmpForest(OmpForest):
forest_predictions /= self._forest_norms
return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
def predict_no_weights(self, X):
    """
    Apply the SingleOmpForest to X without using the weights.

    Only the trees with a non-zero OMP coefficient are used, and their
    predictions are averaged uniformly.

    :param X: the samples to predict
    :return: a np.array with, for each sample, the mean prediction of the selected trees
    """
    # `_base_estimator_predictions` returns a transposed array; transposing it
    # back gives one row per tree, so trees can be selected by row index.
    forest_predictions = self._base_estimator_predictions(X).T

    if self._models_parameters.normalize_D:
        # NOTE(review): rows are trees here, whereas `_base_estimator_predictions`
        # puts trees on the columns -- confirm `_forest_norms` broadcasts as intended.
        forest_predictions /= self._forest_norms

    weights = self._omp.coef_
    omp_trees_indices = np.nonzero(weights)[0]

    return np.mean(forest_predictions[omp_trees_indices], axis=0)
......@@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest):
return super().fit(X_forest, y_forest, X_omp, y_omp)
def predict_no_weights(self, X):
    """
    Predict with the trees selected by OMP, ignoring the OMP weights.

    :param X: the samples to predict
    :return: a np.array with one value per sample: the mean, over the trees
        having a non-zero OMP coefficient, of the second `predict_proba`
        column, rescaled from [0, 1] to [-1, 1]
    """
    # One predict_proba output per tree of the base forest.
    per_tree_probas = np.array([
        estimator.predict_proba(X)
        for estimator in self._base_forest_estimator.estimators_
    ])

    if self._models_parameters.normalize_D:
        per_tree_probas /= self._forest_norms

    selected_trees = np.nonzero(self._omp.coef_)

    # After the transpose, index 1 is the second predict_proba column
    # (the probability of class 1) laid out as samples x selected trees.
    class_one_probas = per_tree_probas[selected_trees].T[1]

    averaged = np.mean(class_one_probas, axis=1)
    return (averaged - 0.5) * 2
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
"""
Evaluate OMPForestClassifer on (`X`, `y`) using `metric`
......@@ -106,6 +134,36 @@ class OmpForestMulticlassClassifier(OmpForest):
max_preds = np.argmax(preds, axis=1)
return np.array(label_names)[max_preds]
def predict_no_weights(self, X):
    """
    Apply the OmpForestMulticlassClassifier to X without using the weights.

    For each class, the trees with a non-zero coefficient in that class' OMP
    are averaged uniformly; the predicted label is the class with the highest
    average.

    :param X: the samples to predict
    :return: a np.array of the predicted label of each sample
    """
    # After the transpose the first axis indexes the predict_proba columns (classes).
    forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T

    if self._models_parameters.normalize_D:
        forest_predictions /= self._forest_norms

    label_names = []
    preds = []
    # NOTE(review): assumes `_dct_class_omp` iterates in the same order as the
    # predict_proba columns -- confirm against where the dict is built.
    for num_class, (class_label, omp_class) in enumerate(self._dct_class_omp.items()):
        # np.nonzero returns a tuple with one index array per dimension: keep
        # the array itself so that its length is the number of selected trees.
        omp_trees_indices = np.nonzero(omp_class.coef_)[0]
        label_names.append(class_label)
        atoms_binary = (forest_predictions[num_class].T - 0.5) * 2  # rescaled from 0/1 to -1/1
        # Average over the selected trees; with no selected tree the sum is
        # already 0, so dividing by 1 avoids a 0/0.
        nb_selected_trees = max(len(omp_trees_indices), 1)
        preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0) / nb_selected_trees)

    preds = np.array(preds).T
    max_preds = np.argmax(preds, axis=1)
    return np.array(label_names)[max_preds]
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
predictions = self.predict(X)
......
......@@ -3,6 +3,7 @@ from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm
class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
......@@ -10,56 +11,69 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
"""
def __init__(self, models_parameters):
def __init__(self, models_parameters, score_metric=mean_squared_error):
self._models_parameters = models_parameters
self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
random_state=models_parameters.seed)
self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
random_state=self._models_parameters.seed, n_jobs=-1)
self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._score_metric = score_metric
@property
def models_parameters(self):
return self._models_parameters
def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
def fit(self, X_train, y_train, X_val, y_val):
self._estimator.fit(X_train, y_train)
self._regressor.fit(X_train, y_train)
y_val_pred = self._regressor.predict(X_val)
forest_pred = score_metric(y_val, y_val_pred)
forest = self._regressor.estimators_
y_val_pred = self._estimator.predict(X_val)
forest_pred = self._score_metric(y_val, y_val_pred)
forest = self._estimator.estimators_
selected_trees = list()
tree_list = list(self._regressor.estimators_)
tree_list = list(self._estimator.estimators_)
val_scores = list()
with tqdm(tree_list) as tree_pred_bar:
tree_pred_bar.set_description('[Initial tree predictions]')
for tree in tree_pred_bar:
val_scores.append(tree.predict(X_val))
tree_pred_bar.update(1)
for _ in range(self._extracted_forest_size):
with tqdm(range(self._extracted_forest_size), disable=True) as pruning_forest_bar:
pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]')
for i in pruning_forest_bar:
best_similarity = 100000
found_index = 0
for i in range(len(tree_list)):
lonely_tree = tree_list[i]
del tree_list[i]
val_list = list()
for tree in tree_list:
val_pred = tree.predict(X_val)
val_list.append(val_pred)
val_list = np.array(val_list)
val_mean = np.mean(val_list, axis=0)
val_score = score_metric(val_mean, y_val)
with tqdm(range(len(tree_list)), disable=True) as tree_list_bar:
tree_list_bar.set_description(f'[Tree selection s={self._extracted_forest_size} #{i}]')
for j in tree_list_bar:
lonely_tree = tree_list[j]
del tree_list[j]
val_mean = np.mean(np.asarray(val_scores), axis=0)
val_score = self._score_metric(val_mean, y_val)
temp_similarity = abs(forest_pred - val_score)
if (temp_similarity < best_similarity):
found_index = i
found_index = j
best_similarity = temp_similarity
tree_list.insert(i, lonely_tree)
tree_list.insert(j, lonely_tree)
val_scores.insert(j, lonely_tree.predict(X_val))
tree_list_bar.update(1)
selected_trees.append(tree_list[found_index])
del tree_list[found_index]
del val_scores[found_index]
pruning_forest_bar.update(1)
pruned_forest = list(set(forest) - set(selected_trees))
self._regressor.estimators_ = pruned_forest
self._estimator.estimators_ = pruned_forest
def score(self, X, y):
test_list = list()
for mod in self._regressor.estimators_:
for mod in self._estimator.estimators_:
test_pred = mod.predict(X)
test_list.append(test_pred)
test_list = np.array(test_list)
test_mean = np.mean(test_list, axis=0)
score = mean_squared_error(test_mean, y)
score = self._score_metric(test_mean, y)
return score
def predict_base_estimator(self, X):
return self._estimator.predict(X)
......@@ -95,14 +95,21 @@ class Trainer(object):
)
self._end_time = time.time()
def __score_func(self, model, X, y_true):
def __score_func(self, model, X, y_true, weights=True):
if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
if weights:
y_pred = model.predict(X)
else:
y_pred = model.predict_no_weights(X)
result = self._regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
if weights:
y_pred = model.predict(X)
else:
y_pred = model.predict_no_weights(X)
if type(model) is OmpForestBinaryClassifier:
y_pred = y_pred.round()
y_pred = np.sign(y_pred)
y_pred = np.where(y_pred==0, 1, y_pred)
result = self._classification_score_metric(y_true, y_pred)
return result
......@@ -126,8 +133,17 @@ class Trainer(object):
:param model: Object with
:param models_dir: Where the results will be saved
"""
model_weights = ''
if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]:
model_weights = model._omp.coef_
elif type(model) == OmpForestMulticlassClassifier:
model_weights = model._dct_class_omp
elif type(model) == OmpForestBinaryClassifier:
model_weights = model._omp
results = ModelRawResults(
model_object='',
model_weights=model_weights,
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
......@@ -148,3 +164,27 @@ class Trainer(object):
self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
results = ModelRawResults(
model_weights='',
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False),
dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False),
test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
score_metric=self._score_metric_name,
base_score_metric=self._base_score_metric_name
)
results.save(models_dir+'_no_weights')
self._logger.info("Base performance on test without weights: {}".format(results.test_score_base))
self._logger.info("Performance on test: {}".format(results.test_score))
self._logger.info("Base performance on train without weights: {}".format(results.train_score_base))
self._logger.info("Performance on train: {}".format(results.train_score))
self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
......@@ -5,6 +5,8 @@ from copy import deepcopy
import contextlib
import joblib
from sklearn.datasets import fetch_openml
def resolve_experiment_id(models_dir):
"""
......@@ -78,6 +80,16 @@ def change_binary_func_load(base_load_function):
return X, y
return func_load
def change_binary_func_openml(dataset_name):
    """
    Build a loading function for a binary classification dataset of OpenML.

    :param dataset_name: name of the dataset, as given to `fetch_openml`
    :return: a function `func_load(return_X_y=True, random_state=None)`
        returning `(X, y)`, where `y` has been passed through
        `binarize_class_data` and cast to int
    """
    def func_load(return_X_y=True, random_state=None):
        # `return_X_y` and `random_state` are accepted but not used: the data
        # is always fetched as an (X, y) pair because it is unpacked right away.
        X, y = fetch_openml(dataset_name, return_X_y=True)
        possible_classes = sorted(set(y))
        assert len(possible_classes) == 2, "Function change_binary_func_openml only works for binary classification"
        # The last class in sorted order is the one given to `binarize_class_data`.
        y = binarize_class_data(y, possible_classes[-1])
        y = y.astype('int')
        return X, y
    return func_load
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
"""Context manager to patch joblib to report into tqdm progress bar given as argument"""
......
......@@ -109,16 +109,16 @@ class Plotter(object):
fig, ax = plt.subplots()
n = len(all_experiment_scores)
nb_experiments = len(all_experiment_scores)
"""
Get as many different colors from the specified cmap (here nipy_spectral)
as there are curve to plot.
"""
colors = Plotter.get_colors_from_cmap(n)
colors = Plotter.get_colors_from_cmap(nb_experiments)
# For each curve to plot
for i in range(n):
for i in range(nb_experiments):
# Retrieve the scores in a list for each seed
experiment_scores = list(all_experiment_scores[i].values())
# Compute the mean and the std for the CI
......
......@@ -17,7 +17,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes'
return len(os.listdir(extracted_forest_sizes_root_path))
def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
......@@ -28,6 +28,7 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_train_scores = dict()
experiment_dev_scores = dict()
experiment_test_scores = dict()
experiment_weights = dict()
all_extracted_forest_sizes = list()
# Used to check if all losses were computed using the same metric (it should be the case)
......@@ -44,14 +45,19 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_train_scores[seed] = list()
experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list()
experiment_weights[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ]
extracted_forest_sizes.sort(key=int)
all_extracted_forest_sizes.append(list(map(int, extracted_forest_sizes)))
for extracted_forest_size in extracted_forest_sizes:
# models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
if weights:
extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
else:
extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size + '_no_weights'
# Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
model_raw_results = ModelRawResults.load(extracted_forest_size_path)
# Save the scores
......@@ -60,6 +66,8 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_test_scores[seed].append(model_raw_results.test_score)
# Save the metric
experiment_score_metrics.append(model_raw_results.score_metric)
# Save the weights
#experiment_weights[seed].append(model_raw_results.model_weights)
# Sanity checks
if len(set(experiment_score_metrics)) > 1:
......@@ -67,7 +75,8 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
if len(set([sum(extracted_forest_sizes) for extracted_forest_sizes in all_extracted_forest_sizes])) != 1:
raise ValueError("The extracted forest sizes aren't the sames across seeds.")
return experiment_train_scores, experiment_dev_scores, experiment_test_scores, all_extracted_forest_sizes[0], experiment_score_metrics[0]
return experiment_train_scores, experiment_dev_scores, experiment_test_scores, \
all_extracted_forest_sizes[0], experiment_score_metrics[0]#, experiment_weights
def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experiment_id, extracted_forest_sizes_number):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
......@@ -120,6 +129,7 @@ if __name__ == "__main__":
DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
DEFAULT_PLOT_WEIGHT_DENSITY = False
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
......@@ -130,6 +140,7 @@ if __name__ == "__main__":
parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.')
args = parser.parse_args()
if args.stage not in list(range(1, 6)):
......@@ -347,9 +358,17 @@ if __name__ == "__main__":
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
"""omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric, experiment_weights = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])"""
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
#omp_with_params_without_weights
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
"""# base_with_params
logger.info('Loading base_with_params experiment scores 2...')
......@@ -369,13 +388,58 @@ if __name__ == "__main__":
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4_fix')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores],
all_labels=['base', 'random', 'omp'],
all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
omp_with_params_without_weights_test_scores],
all_labels=['base', 'random', 'omp', 'omp_without_weights'],
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
elif args.stage == 5:
# Retrieve the extracted forest sizes number used in order to have a base forest axis as long as necessary
extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
# base_with_params
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
# kmeans_with_params
logger.info('Loading kmeans_with_params experiment scores...')
kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[3])
# Sanity check on the metrics retrieved
if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
== omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
kmeans_with_params_test_scores],
all_labels=['base', 'random', 'omp', 'kmeans'],
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
......@@ -384,16 +448,3 @@ if __name__ == "__main__":
raise ValueError('This stage number is not supported yet, but it will be!')
logger.info('Done.')
"""
TODO:
For each dataset:
Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
Stage 4) A figure to finally compare the perf of our approach using the previous selected
parameters vs the baseline vs other papers using different extracted forest size
(percentage of the tree size found previously in best hyperparams search) on the abscissa.
IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
"""
......@@ -21,7 +21,7 @@ import numpy as np
import shutil
def process_job(seed, parameters, experiment_id, hyperparameters):
def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
"""
Experiment function.
......@@ -34,7 +34,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
"""
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
seed, threading.get_ident()))
logger.info('seed={}'.format(seed))
seed_str = str(seed)
experiment_id_str = str(experiment_id)
......@@ -55,13 +54,31 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer = Trainer(dataset)
if parameters['extraction_strategy'] != 'none':
for extracted_forest_size in parameters['extracted_forest_size']:
logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer)
for i in range(len(parameters['extracted_forest_size'])))
else:
forest_size = hyperparameters['n_estimators']
logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
# Check if the result file already exists
already_exists = False
if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]:
continue
else:
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break
if already_exists:
logger.info('Base forest result already exists. Skipping...')
else:
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
extracted_forest_size=extracted_forest_size,
extracted_forest_size=forest_size,
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
......@@ -76,14 +93,36 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer.init(model, subsets_used=parameters['subsets_used'])
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
logger.info(f'Training done for seed {seed_str}')
seed_job_pb.update(1)
def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
seed, parameters, dataset, hyperparameters, experiment_id, trainer):
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
seed, extracted_forest_size, threading.get_ident()))
logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
# Check if the result file already exists
already_exists = False
if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]:
return
else:
forest_size = hyperparameters['n_estimators']
logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break
if already_exists:
logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
return
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
extracted_forest_size=forest_size,
extracted_forest_size=extracted_forest_size,
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
......@@ -98,7 +137,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer.init(model, subsets_used=parameters['subsets_used'])
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
logger.info('Training done')
"""
Command lines example for stage 1:
......@@ -138,6 +176,7 @@ if __name__ == "__main__":
DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1
DEFAULT_EXTRACTION_STRATEGY = 'omp'
DEFAULT_OVERWRITE = False
begin_random_seed_range = 1
end_random_seed_range = 2000
......@@ -163,7 +202,8 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.')
parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
args = parser.parse_args()
if args.experiment_configuration:
......@@ -173,7 +213,7 @@ if __name__ == "__main__":
else:
parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
# Reject unknown extraction strategies before any directory or model is created.
# `parameters` is subscripted like a dict on the surrounding lines (and is set
# from `args.__dict__` in one branch), so the offending value must be read by
# key: attribute access on a dict raises AttributeError and would hide the
# intended ValueError from the user.
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']:
    raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
......@@ -208,7 +248,7 @@ if __name__ == "__main__":
parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
np.linspace(0, args.extracted_forest_size_stop,
parameters['extracted_forest_size_samples'] + 1,
endpoint=False)[1:]).astype(np.int)).tolist()
endpoint=True)[1:]).astype(np.int)).tolist()
if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
......@@ -220,6 +260,7 @@ if __name__ == "__main__":
if args.experiment_id:
experiment_id = args.experiment_id
if args.overwrite:
shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
else:
# Resolve the next experiment id number (last id + 1)
......@@ -255,6 +296,6 @@ if __name__ == "__main__":
)
# Run as much job as there are seeds
with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i],
parameters, experiment_id, hyperparameters) for i in range(len(seeds)))
with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb:
Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i],
parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))
{
"experiment_id": 1,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"none_with_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 4,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"none_wo_params"
],
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 6,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"omp_wo_params"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"scorer": "accuracy",
"best_score_train": 0.7953125,
"best_score_test": 0.7909854175872735,
"best_parameters": {
"max_depth": 20,
"max_features": "sqrt",
"min_samples_leaf": 1,
"n_estimators": 809
},
"random_seed": 1763
}
\ No newline at end of file
{
"experiment_id": 2,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"random_with_params"
],
"job_number": -1,
"extraction_strategy": "random",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 5,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"random_wo_params"
],
"job_number": -1,
"extraction_strategy": "random",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file
{
"experiment_id": 1,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "20newsgroups_vectorized",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "models/20newsgroups_vectorized/stage2",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"2",
"no_normalization"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
7,
13,
20,
27,
34
]
}
\ No newline at end of file