Commit 3f1e869d authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Merge branch '15-integration-sota' into 'master'

Resolve "integration-sota"

Closes #15

See merge request !12
parents 00d0f323 41ec448d
from bolsonaro.utils import tqdm_joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm
class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Forest pruning by k-means clustering of the trees' predictions.

    From 'On extreme pruning of random forest ensembles for real-time
    predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber
    and Eyad Elyan.
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        # models_parameters is a project object exposing .hyperparameters (dict),
        # .seed and .extracted_forest_size — TODO confirm against ModelParameters.
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        # score_metric is an error measure (default: MSE) — lower is better.
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val):
        """Fit the full forest, then keep one tree per k-means cluster."""
        self._estimator.fit(X_train, y_train)

        # One row per tree: cluster the trees by their behaviour on X_train.
        predictions = np.array([tree.predict(X_train)
                                for tree in self._estimator.estimators_])

        kmeans = KMeans(n_clusters=self._extracted_forest_size,
                        random_state=self._models_parameters.seed).fit(predictions)
        labels = np.array(kmeans.labels_)

        # For each cluster select the best tree on the validation set.
        extracted_forest_sizes = list(range(self._extracted_forest_size))
        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
            pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
                extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric)
                for i in range(self._extracted_forest_size))
        self._estimator.estimators_ = pruned_forest

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        """Return the best tree (lowest validation error) of cluster c."""
        index = np.where(labels == c)[0]
        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val,
                y_val, score_metric) for i in range(len(index)))
        # score_metric is an error (MSE by default), so the best tree has the
        # LOWEST score: argmin, not argmax (argmax picked the worst tree).
        best_tree_index = np.argmin(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
        """Validation error of tree i of the fitted forest."""
        y_val_pred = self._estimator.estimators_[i].predict(X_val)
        tree_pred = score_metric(y_val, y_val_pred)
        cluster_job_pb.update()
        return tree_pred

    def predict(self, X):
        return self._estimator.predict(X)

    def score(self, X, y):
        """Error of the (pruned) forest's averaged tree predictions on (X, y)."""
        predictions = list()
        for tree in self._estimator.estimators_:
            predictions.append(tree.predict(X))
        predictions = np.array(predictions)
        mean_predictions = np.mean(predictions, axis=0)
        score = self._score_metric(mean_predictions, y)
        return score

    def predict_base_estimator(self, X):
        return self._estimator.predict(X)
......@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.data.task import Task
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
......@@ -22,9 +23,11 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
elif task == Task.REGRESSION:
if model_parameters.extraction_strategy == 'omp':
return OmpForestRegressor(model_parameters)
......@@ -33,15 +36,21 @@ class ModelFactory(object):
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity':
return SimilarityForestRegressor(model_parameters)
else:
elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'none':
return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
elif task == Task.MULTICLASSIFICATION:
if model_parameters.extraction_strategy == 'omp':
return OmpForestMulticlassClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
......@@ -3,6 +3,7 @@ from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm
class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Similarity-based forest pruning: iteratively keep removing the tree whose
    removal leaves the ensemble's validation score closest to the full
    forest's score.

    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        # models_parameters is a project object exposing .hyperparameters (dict),
        # .seed and .extracted_forest_size — TODO confirm against ModelParameters.
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val):
        """Fit the full forest, then prune it down by similarity selection."""
        self._estimator.fit(X_train, y_train)

        y_val_pred = self._estimator.predict(X_val)
        forest_pred = self._score_metric(y_val, y_val_pred)
        forest = self._estimator.estimators_
        selected_trees = list()
        tree_list = list(self._estimator.estimators_)

        # Cache every tree's validation predictions once up front so the
        # selection loops never call predict() again.
        val_scores = list()
        with tqdm(tree_list) as tree_pred_bar:
            tree_pred_bar.set_description('[Initial tree predictions]')
            for tree in tree_pred_bar:
                val_scores.append(tree.predict(X_val))
                tree_pred_bar.update(1)

        with tqdm(range(self._extracted_forest_size), disable=True) as pruning_forest_bar:
            pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]')
            for i in pruning_forest_bar:
                best_similarity = 100000
                found_index = 0
                with tqdm(range(len(tree_list)), disable=True) as tree_list_bar:
                    tree_list_bar.set_description(f'[Tree selection s={self._extracted_forest_size} #{i}]')
                    for j in tree_list_bar:
                        # Hold tree j out (tree AND its cached predictions —
                        # the original left the predictions in, biasing the
                        # mean, and re-predicted on every re-insert).
                        lonely_tree = tree_list[j]
                        lonely_tree_scores = val_scores[j]
                        del tree_list[j]
                        del val_scores[j]
                        val_mean = np.mean(np.asarray(val_scores), axis=0)
                        val_score = self._score_metric(val_mean, y_val)
                        temp_similarity = abs(forest_pred - val_score)
                        if temp_similarity < best_similarity:
                            found_index = j
                            best_similarity = temp_similarity
                        tree_list.insert(j, lonely_tree)
                        val_scores.insert(j, lonely_tree_scores)
                        tree_list_bar.update(1)
                # Permanently drop the tree whose removal changed the score least.
                selected_trees.append(tree_list[found_index])
                del tree_list[found_index]
                del val_scores[found_index]
                pruning_forest_bar.update(1)

        pruned_forest = list(set(forest) - set(selected_trees))
        self._estimator.estimators_ = pruned_forest

    def score(self, X, y):
        """Error of the (pruned) forest's averaged tree predictions on (X, y)."""
        test_list = list()
        for mod in self._estimator.estimators_:
            test_pred = mod.predict(X)
            test_list.append(test_pred)
        test_list = np.array(test_list)
        test_mean = np.mean(test_list, axis=0)
        score = self._score_metric(test_mean, y)
        return score

    def predict_base_estimator(self, X):
        return self._estimator.predict(X)
......@@ -400,23 +400,51 @@ if __name__ == "__main__":
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
elif args.stage == 5:
# Retrieve the number of extracted forest sizes used, so the base-forest axis is as long as necessary
extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
# base_with_params
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
# kmeans_with_params
logger.info('Loading kmeans_with_params experiment scores...')
kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[3])
# Sanity check on the retrieved metrics
if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
== omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
# experiment_weights
#Plotter.weight_density(experiment_weights, output_path + os.sep + 'weight_density.png')
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
kmeans_with_params_test_scores],
all_labels=['base', 'random', 'omp', 'kmeans'],
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
else:
raise ValueError('This stage number is not supported yet, but it will be!')
logger.info('Done.')
"""
TODO:
For each dataset:
Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
Stage 4) A figure to finally compare the perf of our approach using the previous selected
parameters vs the baseline vs other papers using different extracted forest size
(percentage of the tree size found previously in best hyperparams search) on the abscissa.
IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
"""
# Implementation of the paper 'Ensemble selection from libraries of models'
# by Rich Caruana et al. A library of trees is trained, then the ones
# performing best on the dev set are greedily added to the ensemble.
# NOTE(review): the unused `from sklearn.externals import joblib` was dropped —
# that module was removed from scikit-learn (>= 0.23) and broke the import.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

(data, target) = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=10000, random_state=2019)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=3000, random_state=2019)

# Hyperparameter grid used to build the library of candidate trees.
criterion_arr = ["mse"]  # , "friedman_mse", "mae"]
splitter_arr = ["best"]  # , "random"]
depth_arr = [i for i in range(5, 20, 1)]
min_samples_split_arr = [i for i in range(2, 20, 1)]
min_samples_leaf_arr = [i for i in range(2, 20, 1)]
max_features_arr = ["sqrt"]  # ["auto", "sqrt", "log2"]

library = list()
for criterion in criterion_arr:
    for splitter in splitter_arr:
        for depth in depth_arr:
            for min_samples_split in min_samples_split_arr:
                for min_samples_leaf in min_samples_leaf_arr:
                    for max_features in max_features_arr:
                        t = DecisionTreeRegressor(criterion=criterion, splitter=splitter,
                                                  max_depth=depth,
                                                  min_samples_split=min_samples_split,
                                                  min_samples_leaf=min_samples_leaf,
                                                  max_features=max_features,
                                                  random_state=2017)
                        t.fit(X_train, y_train)
                        library.append(t)
print("classifiers", len(library))

# R^2 of each library member on the validation set.
scores_list = list()
for classif in library:
    r2 = classif.score(X_val, y_val)
    scores_list.append(r2)
print("scores", len(scores_list))

np_scores_list = np.array(scores_list)

trees_in_forest = list()
perf_prun_forest = list()
for num_sel_tree in [2, 4, 6, 8, 10, 15, 20, 30, 40, 50]:
    class_list = list(library)
    print("class list", len(class_list))
    # Seed the ensemble with the single best tree on the validation set.
    m = np.argmax(np_scores_list)
    ens_sel = [class_list[m]]
    temp_pred = class_list[m].predict(X_val)
    del class_list[m]
    # Greedily add the tree that maximises the ensemble's validation R^2.
    for k in range(num_sel_tree - 1):
        cand_index = 0
        r2_best = -10000
        for j in range(len(class_list)):
            # Tentatively stack candidate j's predictions, score the mean,
            # then drop the candidate row again.
            temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
            temp_mean = np.mean(temp_pred, axis=0)
            r2_temp = r2_score(y_val, temp_mean)
            if r2_temp > r2_best:
                r2_best = r2_temp
                cand_index = j
            temp_pred = np.delete(temp_pred, -1, 0)
        ens_sel.append(class_list[cand_index])
        temp_pred = np.vstack((temp_pred, class_list[cand_index].predict(X_val)))
        del class_list[cand_index]
    # Evaluate the selected ensemble (mean of tree predictions) on the test set.
    test_list = list()
    for mod in ens_sel:
        test_pred = mod.predict(X_test)
        test_list.append(test_pred)
    test_list = np.array(test_list)
    test_mean = np.mean(test_list, axis=0)
    # r2_score expects (y_true, y_pred); the original passed them reversed,
    # which changes the value since R^2 is not symmetric.
    r2_test = r2_score(y_test, test_mean)
    print(num_sel_tree, r2_test)
    trees_in_forest.append(num_sel_tree)
    perf_prun_forest.append(r2_test)

print(trees_in_forest)
print(perf_prun_forest)

ax = plt.gca()
ax.plot(trees_in_forest, perf_prun_forest, label='ensemble selection')
ax.legend()
plt.xlabel('num trees')
plt.ylabel('r2 score')
plt.savefig("ensemble_selection.pdf")
plt.show()
# Similarity-based forest pruning prototype: repeatedly remove the tree whose
# removal keeps the ensemble's validation R^2 closest to the full forest's.
# NOTE(review): the unused `from sklearn.externals import joblib` was dropped —
# that module was removed from scikit-learn (>= 0.23) and broke the import.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

(data, target) = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=10000, random_state=2019)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=3000, random_state=2019)

num_trees = 100
prun_for_size = [2, 4, 6, 8, 10, 12, 15, 20]

randfor = RandomForestRegressor(num_trees, max_depth=7, random_state=2019)
randfor.fit(X_train, y_train)
# R^2 of the full forest on the validation set (reference score).
randfor_pred = randfor.score(X_val, y_val)
trees_forest = randfor.estimators_

trees_in_forest = list()
perf_prun_forest = list()
for k in range(len(prun_for_size)):
    ens_sel = list()
    trees_list = list(randfor.estimators_)
    # Remove trees one by one until only prun_for_size[k] remain.
    for j in range(num_trees - prun_for_size[k]):
        best_simil = 100000
        cand_ind = 0
        for i in range(len(trees_list)):
            # Hold out tree i and score the remaining ensemble.
            lonely_tree = trees_list[i]
            del trees_list[i]
            val_list = list()
            for tree in trees_list:
                val_pred = tree.predict(X_val)
                val_list.append(val_pred)
            val_list = np.array(val_list)
            val_mean = np.mean(val_list, axis=0)
            # r2_score expects (y_true, y_pred); the original passed them
            # reversed, which changes the value since R^2 is not symmetric.
            r2_val = r2_score(y_val, val_mean)
            temp_simil = abs(randfor_pred - r2_val)
            if temp_simil < best_simil:
                cand_ind = i
                best_simil = temp_simil
            trees_list.insert(i, lonely_tree)
        ens_sel.append(trees_list[cand_ind])
        del trees_list[cand_ind]
    # The pruned forest is everything that was never marked for removal.
    prun_for = list(set(trees_forest) - set(ens_sel))
    print("prun_for", len(prun_for))
    print("trees forest", len(trees_forest))
    print("ens_sel", len(ens_sel))
    test_list = list()
    for mod in prun_for:
        test_pred = mod.predict(X_test)
        test_list.append(test_pred)
    test_list = np.array(test_list)
    test_mean = np.mean(test_list, axis=0)
    r2_test = r2_score(y_test, test_mean)
    print(len(prun_for), r2_test)
    trees_in_forest.append(len(prun_for))
    perf_prun_forest.append(r2_test)

print(trees_in_forest)
# Print the whole score list; the original printed only the last r2_test,
# unlike the companion ensemble-selection script's final summary.
print(perf_prun_forest)

ax = plt.gca()
ax.plot(trees_in_forest, perf_prun_forest, label='pruned forest')
ax.legend()
plt.xlabel('num trees')
plt.ylabel('r2 score')
plt.savefig("pruned_forest.pdf")
plt.show()
......@@ -21,7 +21,7 @@ import numpy as np
import shutil
def process_job(seed, parameters, experiment_id, hyperparameters):
def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
"""
Experiment function.
......@@ -34,7 +34,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
"""
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
seed, threading.get_ident()))
logger.info('seed={}'.format(seed))
seed_str = str(seed)
experiment_id_str = str(experiment_id)
......@@ -55,13 +54,31 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer = Trainer(dataset)
if parameters['extraction_strategy'] != 'none':
for extracted_forest_size in parameters['extracted_forest_size']:
logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer)
for i in range(len(parameters['extracted_forest_size'])))
else:
forest_size = hyperparameters['n_estimators']
logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
# Check if the result file already exists
already_exists = False
if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]:
continue
else: