Skip to content
Snippets Groups Projects
Commit 6992da59 authored by Luc Giffon
Browse files

fix bugs similarity regressor + optimize

parent d6d303ec
No related branches found
No related tags found
1 merge request: !23 Resolve "integration-sota"
This commit is part of merge request !23. Comments created here will be created in the context of that merge request.
import time
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
......@@ -11,13 +13,11 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
"""
def __init__(self, models_parameters):
    """Build the (not yet fitted) base random forest from the model parameters.

    The scoring function is no longer injectable: the class relies on its
    own row-wise ``_score_metric`` method, so no instance attribute of that
    name may be assigned here (it would shadow the method).

    :param models_parameters: object exposing ``hyperparameters`` (a mapping
        forwarded as keyword arguments to ``RandomForestRegressor``),
        ``seed`` (used as ``random_state``) and ``extracted_forest_size``
        (number of trees to keep after pruning).
    """
    self._models_parameters = models_parameters
    self._estimator = RandomForestRegressor(
        **self._models_parameters.hyperparameters,
        random_state=self._models_parameters.seed,
        n_jobs=-1,
    )
    self._extracted_forest_size = self._models_parameters.extracted_forest_size
    # Filled by fit() with the trees kept after pruning.
    self._selected_trees = list()
@property
def models_parameters(self):
......@@ -27,57 +27,96 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
def selected_trees(self):
return self._selected_trees
def _score_metric(self, y_preds, y_true):
if len(y_true.shape) == 1:
y_true = y_true[np.newaxis, :]
if len(y_preds.shape) == 1:
y_preds = y_preds[np.newaxis, :]
assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
diff = y_preds - y_true
squared_diff = diff ** 2
mean_squared_diff = np.mean(squared_diff, axis=1)
return mean_squared_diff
def fit(self, X_train, y_train, X_val, y_val):
    """Fit the base forest, then prune it by backward elimination.

    Trees are removed one at a time until ``extracted_forest_size`` trees
    remain. At each step the tree removed is the one whose removal is rated
    worst by the active strategy, evaluated on the validation set. The
    surviving trees replace ``self._estimator.estimators_`` and are also
    stored in ``self._selected_trees``, in their original forest order.

    :param X_train: training inputs forwarded to the base forest's ``fit``.
    :param y_train: training targets forwarded to the base forest's ``fit``.
    :param X_val: validation inputs; must expose ``shape[0]`` (number of
        validation examples).
    :param y_val: validation targets compared against tree predictions.
    :raises ValueError: if ``extracted_forest_size`` is smaller than 1, or
        if the extraction strategy is unknown.
    """
    if self._extracted_forest_size < 1:
        # Pruning down to zero trees would require averaging over an empty
        # set of "other" trees (a division by zero further down).
        raise ValueError(
            f"extracted_forest_size must be at least 1, got {self._extracted_forest_size}"
        )

    self._estimator.fit(X_train, y_train)

    # NOTE(review): hard-coded for now; presumably meant to come from
    # self._models_parameters.extraction_strategy - confirm before wiring.
    param = "similarity_predictions"

    tree_list = list(self._estimator.estimators_)

    # One row per tree: its predictions on the validation set. Each tree's
    # predict() is expected to fill one row of length X_val.shape[0].
    val_predictions = np.empty((len(tree_list), X_val.shape[0]))
    with tqdm(tree_list) as tree_pred_bar:
        tree_pred_bar.set_description('[Initial tree predictions]')
        # Iterating the bar already advances it; no manual update() needed.
        for idx_tree, tree in enumerate(tree_pred_bar):
            val_predictions[idx_tree, :] = tree.predict(X_val)

    # Boolean mask of the trees still in the forest.
    mask_trees_to_consider = np.ones(val_predictions.shape[0], dtype=bool)
    # Backward selection: trees are removed one after another. A negative
    # count (more trees requested than available) yields an empty range,
    # i.e. the forest is kept whole.
    nb_tree_to_remove = len(tree_list) - self._extracted_forest_size

    with tqdm(range(nb_tree_to_remove), disable=True) as pruning_forest_bar:
        pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]')
        for _ in pruning_forest_bar:
            # Indexes (in the base forest) of the trees still considered.
            idx_trees_to_consider = np.flatnonzero(mask_trees_to_consider)
            val_predictions_to_consider = val_predictions[idx_trees_to_consider]
            nb_trees_to_consider = val_predictions_to_consider.shape[0]

            if param == "similarity_predictions":
                # Zero on the diagonal and 1/(L-1) everywhere else: left
                # multiplying the (L, n) prediction matrix by it gives, on
                # each row, the average of all the OTHER rows, i.e. the
                # prediction of the forest with that one tree left out.
                zero_diag_matrix = np.ones((nb_trees_to_consider, nb_trees_to_consider)) * (1 / (nb_trees_to_consider - 1))
                np.fill_diagonal(zero_diag_matrix, 0)
                leave_one_tree_out_predictions_val = zero_diag_matrix @ val_predictions_to_consider
                leave_one_tree_out_scores_val = self._score_metric(leave_one_tree_out_predictions_val, y_val)
                # Comparing with the full-forest score would only shift all
                # candidates by the same constant, so it is skipped.
                index_worse_tree = int(np.argmax(leave_one_tree_out_scores_val))  # greater MSE is worse
            elif param == "similarity_similarities":
                # Inner products between prediction vectors are used as the
                # similarity measure between trees.
                correlation_matrix = val_predictions_to_consider @ val_predictions_to_consider.T
                average_correlation_by_tree = np.average(correlation_matrix, axis=1)
                index_worse_tree = int(np.argmax(average_correlation_by_tree))  # greater similarity is worse
            else:
                raise ValueError(f"Unknown extraction strategy: {param!r}")

            # Translate the position among remaining trees back to the
            # position in the base forest, and drop that tree.
            index_worse_tree_in_base_forest = idx_trees_to_consider[index_worse_tree]
            mask_trees_to_consider[index_worse_tree_in_base_forest] = False

    # Keep the survivors in their original order so the result is
    # deterministic (a set difference would order trees arbitrarily).
    pruned_forest = [tree for tree, keep in zip(tree_list, mask_trees_to_consider) if keep]
    self._selected_trees = pruned_forest
    self._estimator.estimators_ = pruned_forest
def score(self, X, y):
    """Return the mean squared error of the pruned forest on ``(X, y)``.

    The forest prediction is the plain average of the predictions of the
    trees currently stored in ``self._estimator.estimators_``. Each tree
    predicts exactly once.

    :param X: inputs; must expose ``shape[0]`` (number of examples).
    :param y: targets, one per example.
    :return: the mean squared error as a scalar (lower is better).
    """
    trees = self._estimator.estimators_
    # One row per tree, filled in place to avoid building a list of arrays.
    test_predictions = np.empty((len(trees), X.shape[0]))
    for idx_tree, tree in enumerate(trees):
        test_predictions[idx_tree, :] = tree.predict(X)
    forest_prediction = np.mean(test_predictions, axis=0)
    # _score_metric returns one score per row; a single row was passed.
    return self._score_metric(forest_prediction, y)[0]
def predict_base_estimator(self, X):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment