Commit 6992da59 authored by Luc Giffon

fix bugs similarity regressor + optimize

parent d6d303ec
Part of merge request !23: Resolve "integration-sota"
+import time
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_squared_error
 from sklearn.base import BaseEstimator
@@ -11,13 +13,11 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
     https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
     """
 
-    def __init__(self, models_parameters, score_metric=mean_squared_error):
+    def __init__(self, models_parameters):
         self._models_parameters = models_parameters
         self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
                                                 random_state=self._models_parameters.seed, n_jobs=-1)
         self._extracted_forest_size = self._models_parameters.extracted_forest_size
-        self._score_metric = score_metric
-        self._selected_trees = list()
 
     @property
     def models_parameters(self):
@@ -27,57 +27,96 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
     def selected_trees(self):
         return self._selected_trees
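Note that the constructor no longer takes a score_metric argument: scoring now goes through the _score_metric method added below. For review purposes, a minimal sketch of calling the new constructor; FakeParams is a hypothetical stand-in (not part of this commit) for the project's models_parameters object, assumed only to expose the three attributes the constructor reads.

from collections import namedtuple

# Hypothetical stand-in for models_parameters: the constructor only reads
# .hyperparameters, .seed and .extracted_forest_size.
FakeParams = namedtuple('FakeParams', ['hyperparameters', 'seed', 'extracted_forest_size'])
params = FakeParams(hyperparameters={'n_estimators': 100}, seed=42, extracted_forest_size=10)
regressor = SimilarityForestRegressor(params)  # then regressor.fit(X_train, y_train, X_val, y_val)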
 
+    def _score_metric(self, y_preds, y_true):
+        if len(y_true.shape) == 1:
+            y_true = y_true[np.newaxis, :]
+        if len(y_preds.shape) == 1:
+            y_preds = y_preds[np.newaxis, :]
+        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
+        diff = y_preds - y_true
+        squared_diff = diff ** 2
+        mean_squared_diff = np.mean(squared_diff, axis=1)
+        return mean_squared_diff
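The new _score_metric replaces sklearn's mean_squared_error with a batched variant: each row of y_preds is one candidate ensemble's predictions, and one MSE per row comes back, so many candidates can be scored in a single call. A minimal sketch of that behaviour, using only numpy:

import numpy as np

# Row-wise MSE as implemented by _score_metric above.
y_true = np.array([1.0, 2.0, 3.0])
y_preds = np.array([[1.0, 2.0, 3.0],    # perfect predictions -> MSE 0.0
                    [2.0, 3.0, 4.0]])   # off by one everywhere -> MSE 1.0
diff = y_preds - y_true[np.newaxis, :]
print(np.mean(diff ** 2, axis=1))  # [0. 1.]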
 
     def fit(self, X_train, y_train, X_val, y_val):
         self._estimator.fit(X_train, y_train)
-        y_val_pred = self._estimator.predict(X_val)
-        forest_pred = self._score_metric(y_val, y_val_pred)
-        forest = self._estimator.estimators_
-        tree_list = list(self._estimator.estimators_)
-        val_scores = list()
+
+        # param = self._models_parameters.extraction_strategy
+        param = "similarity_predictions"
+        #
+        # if param == "similarity_similarities":
+        #     pass
+        # elif param == "similarity_predictions":
+        #     pass
+        # else:
+        #     raise ValueError
+
+        # get score of base forest on val
+        tree_list = list(self._estimator.estimators_)
+        trees_to_remove = list()
+
+        # get score of each single tree of the forest on val
+        val_predictions = np.empty((len(tree_list), X_val.shape[0]))
         with tqdm(tree_list) as tree_pred_bar:
             tree_pred_bar.set_description('[Initial tree predictions]')
-            for tree in tree_pred_bar:
-                val_scores.append(tree.predict(X_val))
+            for idx_tree, tree in enumerate(tree_pred_bar):
+                val_predictions[idx_tree, :] = tree.predict(X_val)
                 tree_pred_bar.update(1)
-        with tqdm(range(self._extracted_forest_size), disable=True) as pruning_forest_bar:
+
+        # boolean mask of the trees to take into account for the next evaluation of tree importance
+        mask_trees_to_consider = np.ones(val_predictions.shape[0], dtype=bool)
+        # the technique does backward selection, that is: trees are removed one after another
+        nb_tree_to_remove = len(tree_list) - self._extracted_forest_size
+        with tqdm(range(nb_tree_to_remove), disable=True) as pruning_forest_bar:
             pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]')
-            for i in pruning_forest_bar:
-                best_similarity = 100000
-                found_index = 0
-                with tqdm(range(len(tree_list)), disable=True) as tree_list_bar:
-                    tree_list_bar.set_description(f'[Tree selection s={self._extracted_forest_size} #{i}]')
-                    for j in tree_list_bar:
-                        lonely_tree = tree_list[j]
-                        del tree_list[j]
-                        val_mean = np.mean(np.asarray(val_scores), axis=0)
-                        val_score = self._score_metric(val_mean, y_val)
-                        temp_similarity = abs(forest_pred - val_score)
-                        if (temp_similarity < best_similarity):
-                            found_index = j
-                            best_similarity = temp_similarity
-                        tree_list.insert(j, lonely_tree)
-                        val_scores.insert(j, lonely_tree.predict(X_val))
-                        tree_list_bar.update(1)
-                self._selected_trees.append(tree_list[found_index])
-                del tree_list[found_index]
-                del val_scores[found_index]
+            for _ in pruning_forest_bar:  # for each tree to remove
+                # get the indexes of the trees still to take into account
+                idx_trees_to_consider = np.arange(val_predictions.shape[0])[mask_trees_to_consider]
+                val_predictions_to_consider = val_predictions[idx_trees_to_consider]
+                nb_trees_to_consider = val_predictions_to_consider.shape[0]
+
+                if param == "similarity_predictions":
+                    # this matrix has zero on the diagonal and 1/(L-1) everywhere else.
+                    # When the matrix of predictions (L lines) is left-multiplied by this
+                    # zero_diag_matrix (square, of size L), each line of the result is the
+                    # average of all the other lines of the initial matrix of predictions.
+                    zero_diag_matrix = np.ones((nb_trees_to_consider, nb_trees_to_consider)) * (1 / (nb_trees_to_consider - 1))
+                    np.fill_diagonal(zero_diag_matrix, 0)
+
+                    leave_one_tree_out_predictions_val = zero_diag_matrix @ val_predictions_to_consider
+                    leave_one_tree_out_scores_val = self._score_metric(leave_one_tree_out_predictions_val, y_val)
+                    # the difference with the base forest score is actually useless
+                    # delta_score = forest_score - leave_one_tree_out_scores_val
+
+                    # get the index of the tree to remove
+                    index_worse_tree = int(np.argmax(leave_one_tree_out_scores_val))  # correlation and MSE: for both, greater is worse
+
+                elif param == "similarity_similarities":
+                    correlation_matrix = val_predictions_to_consider @ val_predictions_to_consider.T
+                    average_correlation_by_tree = np.average(correlation_matrix, axis=1)
+
+                    # get the index of the tree to remove
+                    index_worse_tree = int(np.argmax(average_correlation_by_tree))  # correlation and MSE: for both, greater is worse
+
+                index_worse_tree_in_base_forest = idx_trees_to_consider[index_worse_tree]
+                trees_to_remove.append(tree_list[index_worse_tree_in_base_forest])
+                mask_trees_to_consider[index_worse_tree_in_base_forest] = False
                 pruning_forest_bar.update(1)
-        self._selected_trees = set(self._selected_trees)
-        pruned_forest = list(set(forest) - self._selected_trees)
+
+        pruned_forest = list(set(tree_list) - set(trees_to_remove))
+        self._selected_trees = pruned_forest
         self._estimator.estimators_ = pruned_forest
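The zero_diag_matrix comment above is the heart of the optimization: instead of re-averaging the forest once per left-out tree, as the removed inner loop did, a single matrix product computes every leave-one-tree-out ensemble prediction at once. A standalone sketch checking that identity against the naive loop, assuming only numpy (loo_fast and loo_naive are names introduced here):

import numpy as np

# L trees, n validation examples.
L, n = 4, 5
rng = np.random.default_rng(0)
predictions = rng.normal(size=(L, n))

# Zero on the diagonal, 1/(L-1) elsewhere: row i of the product is the
# average prediction of all trees except tree i.
zero_diag = np.full((L, L), 1.0 / (L - 1))
np.fill_diagonal(zero_diag, 0.0)
loo_fast = zero_diag @ predictions

# Naive reference: drop each tree in turn and average the rest.
loo_naive = np.stack([predictions[np.arange(L) != i].mean(axis=0) for i in range(L)])
assert np.allclose(loo_fast, loo_naive)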
 
     def score(self, X, y):
-        test_list = list()
-        for mod in self._estimator.estimators_:
-            test_pred = mod.predict(X)
-            test_list.append(test_pred)
-        test_list = np.array(test_list)
-        test_mean = np.mean(test_list, axis=0)
-        score = self._score_metric(test_mean, y)
+        test_predictions = np.empty((len(self._estimator.estimators_), X.shape[0]))
+        for idx_tree, mod in enumerate(self._estimator.estimators_):
+            test_predictions[idx_tree, :] = mod.predict(X)
+
+        test_mean = np.mean(test_predictions, axis=0)
+        score = self._score_metric(test_mean, y)[0]
         return score
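One consequence of the batched _score_metric shows up in score(): a 1-D prediction vector is promoted to a single-row matrix, so the metric returns a length-1 array that must be unwrapped with [0]. A small sketch of why, again in plain numpy:

import numpy as np

# A single ensemble's predictions become one row, so one MSE comes back in an array.
y_true = np.array([1.0, 2.0])
y_pred = np.array([1.5, 2.5])
mse_per_row = np.mean((y_pred[np.newaxis, :] - y_true[np.newaxis, :]) ** 2, axis=1)
print(mse_per_row)     # [0.25]
print(mse_per_row[0])  # 0.25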
 
     def predict_base_estimator(self, X):
...