Commit bab07f41 authored by Luc Giffon

similarityforest now handles classification

parent 6992da59
1 merge request: !23 Resolve "integration-sota"
 import time
+from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression
 from bolsonaro.utils import tqdm_joblib
 from sklearn.ensemble import RandomForestRegressor

@@ -53,72 +54,85 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
             lst_pruned_forest.append(self._estimator.estimators_[index_trees_cluster[best_tree_index]])

         self._selected_trees = lst_pruned_forest
-        self._estimator.estimators_ = lst_pruned_forest
+        # self._estimator.estimators_ = lst_pruned_forest

     def score(self, X, y):
-        predictions = np.empty((len(self._estimator.estimators_), X.shape[0]))
-        for idx_tree, tree in enumerate(self._estimator.estimators_):
-            predictions[idx_tree, :] = tree.predict(X)
-        final_predictions = self._aggregate(predictions)
+        final_predictions = self.predict(X)
         score = self._score_metric(final_predictions, y)[0]
         return score

     def predict(self, X):
-        return self._estimator.predict(X)
+        predictions = np.empty((len(self._selected_trees), X.shape[0]))
+        for idx_tree, tree in enumerate(self._selected_trees):
+            predictions[idx_tree, :] = tree.predict(X)
+        final_predictions = self._aggregate(predictions)
+        return final_predictions
+
+    def predict_base_estimator(self, X):
+        return self._estimator.predict(X)
+
+    def _get_best_tree_index(self, y_preds, y_true):
+        score = self._score_metric(y_preds, y_true)
+        best_tree_index = self._best(score)  # best-scoring tree: lowest MSE for regression, highest accuracy for classification
+        return best_tree_index

     @abstractmethod
     def _score_metric(self, y_preds, y_true):
         """
         Get the score of each predictor in y_preds.

         y_preds.shape == (nb_trees, nb_samples)
         y_true.shape == (1, nb_samples)

         :param y_preds:
         :param y_true:
         :return:
         """
         pass

+    @staticmethod
     @abstractmethod
-    def _get_best_tree_index(self, y_preds, y_true):
+    def _best(array):
+        """
+        Return the index of the best element in array.
+
+        :param array:
+        :return:
+        """
         pass

+    @abstractmethod
+    def _aggregate(self, predictions):
+        """
+        Aggregate the votes of the predictors in predictions.
+
+        predictions shape: (nb_trees, nb_samples)
+
+        :param predictions:
+        :return:
+        """
+        pass


 class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):

     def _aggregate(self, predictions):
-        return np.mean(predictions, axis=0)
+        return aggregation_regression(predictions)

     def _score_metric(self, y_preds, y_true):
-        if len(y_true.shape) == 1:
-            y_true = y_true[np.newaxis, :]
-        if len(y_preds.shape) == 1:
-            y_preds = y_preds[np.newaxis, :]
-        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        diff = y_preds - y_true
-        squared_diff = diff ** 2
-        mean_squared_diff = np.mean(squared_diff, axis=1)
-        return mean_squared_diff
+        return score_metric_mse(y_preds, y_true)
+
+    @staticmethod
+    def _best(array):
+        return np.argmin(array)

-    def _get_best_tree_index(self, y_preds, y_true):
-        score = self._score_metric(y_preds, y_true)
-        best_tree_index = np.argmin(score)  # get best-scoring tree (the one with the lowest MSE)
-        return best_tree_index


 class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):

     def _aggregate(self, predictions):
-        return np.sign(np.sum(predictions, axis=0))
+        return aggregation_classification(predictions)

     def _score_metric(self, y_preds, y_true):
-        if len(y_true.shape) == 1:
-            y_true = y_true[np.newaxis, :]
-        if len(y_preds.shape) == 1:
-            y_preds = y_preds[np.newaxis, :]
-        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        bool_arr_correct_predictions = y_preds == y_true
-        return np.average(bool_arr_correct_predictions, axis=1)
+        return score_metric_indicator(y_preds, y_true)

-    def _get_best_tree_index(self, y_preds, y_true):
-        score = self._score_metric(y_preds, y_true)
-        best_tree_index = np.argmax(score)  # get best-scoring tree (the one with the highest accuracy)
-        return best_tree_index
\ No newline at end of file
+
+    @staticmethod
+    def _best(array):
+        return np.argmax(array)
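
The refactor above turns tree selection into a template method: KmeansForest owns the selection loop, and each subclass only supplies the metric (_score_metric), the direction of "best" (_best), and the vote aggregation (_aggregate). A minimal standalone sketch of the regression hooks, with made-up numbers (not part of the commit):

import numpy as np

# Validation predictions of 3 trees on 4 samples (hypothetical values).
y_preds = np.array([[1., 2., 3., 4.],
                    [1., 2., 2., 4.],
                    [0., 0., 0., 0.]])
y_true = np.array([1., 2., 3., 4.])

# _score_metric for regression: per-tree MSE (what score_metric_mse computes).
per_tree_mse = np.mean((y_preds - y_true[np.newaxis, :]) ** 2, axis=1)  # [0., 0.25, 7.5]
# _best for regression: argmin, since a lower MSE is better.
best_tree_index = np.argmin(per_tree_mse)  # 0
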
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
-from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
 from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.data.task import Task

@@ -29,6 +29,8 @@ class ModelFactory(object):
                     random_state=model_parameters.seed)
             elif model_parameters.extraction_strategy == 'kmeans':
                 return KMeansForestClassifier(model_parameters)
+            elif model_parameters.extraction_strategy == 'similarity':
+                return SimilarityForestClassifier(model_parameters)
             else:
                 raise ValueError('Invalid extraction strategy')
         elif task == Task.REGRESSION:
@@ -7,17 +7,22 @@ from abc import abstractmethod, ABCMeta
 import numpy as np
 from tqdm import tqdm

+from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator


-class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
+class SimilarityForest(BaseEstimator, metaclass=ABCMeta):
     """
     https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
     """
+    similarity_similarities = "similarity_similarities"
+    similarity_predictions = "similarity_predictions"

     def __init__(self, models_parameters):
         self._models_parameters = models_parameters
         self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
                                                 random_state=self._models_parameters.seed, n_jobs=-1)
         self._extracted_forest_size = self._models_parameters.extracted_forest_size
+        self._selected_trees = list()

     @property
     def models_parameters(self):

@@ -27,32 +32,20 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
     def selected_trees(self):
         return self._selected_trees

-    def _score_metric(self, y_preds, y_true):
-        if len(y_true.shape) == 1:
-            y_true = y_true[np.newaxis, :]
-        if len(y_preds.shape) == 1:
-            y_preds = y_preds[np.newaxis, :]
-        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        diff = y_preds - y_true
-        squared_diff = diff ** 2
-        mean_squared_diff = np.mean(squared_diff, axis=1)
-        return mean_squared_diff

+    def predict(self, X):
+        predictions = np.empty((len(self._selected_trees), X.shape[0]))
+        for idx_tree, tree in enumerate(self._selected_trees):
+            predictions[idx_tree, :] = tree.predict(X)
+        final_predictions = self._aggregate(predictions)
+        return final_predictions
+
+    def predict_base_estimator(self, X):
+        return self._estimator.predict(X)

     def fit(self, X_train, y_train, X_val, y_val):
         self._estimator.fit(X_train, y_train)

-        # param = self._models_parameters.extraction_strategy
-        param = "similarity_predictions"
-        #
-        # if param == "similarity_similarities":
-        #     pass
-        # elif param == "similarity_predictions":
-        #     pass
-        # else:
-        #     raise ValueError
+        param = self._models_parameters.extraction_strategy

         # get score of base forest on val
         tree_list = list(self._estimator.estimators_)

@@ -78,7 +71,7 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
             val_predictions_to_consider = val_predictions[idx_trees_to_consider]
             nb_trees_to_consider = val_predictions_to_consider.shape[0]

-            if param == "similarity_predictions":
+            if param == self.similarity_predictions:
                 # This matrix has zeros on the diagonal and 1/(L-1) everywhere else.
                 # Left-multiplying the matrix of predictions (L lines) by this zero_diag_matrix (square, L x L)
                 # gives, on each line, the average of all the other lines of the initial matrix of predictions.

@@ -86,6 +79,7 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
                 np.fill_diagonal(zero_diag_matrix, 0)
                 leave_one_tree_out_predictions_val = zero_diag_matrix @ val_predictions_to_consider
+                leave_one_tree_out_predictions_val = self._activation(leave_one_tree_out_predictions_val)  # identity for regression; sign for classification
                 leave_one_tree_out_scores_val = self._score_metric(leave_one_tree_out_predictions_val, y_val)

                 # difference with base forest is actually useless
                 # delta_score = forest_score - leave_one_tree_out_scores_val

@@ -93,13 +87,16 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
                 # get index of tree to remove
                 index_worse_tree = int(np.argmax(leave_one_tree_out_scores_val))  # correlation and MSE: both greater is worse

-            elif param == "similarity_similarities":
+            elif param == self.similarity_similarities:
                 correlation_matrix = val_predictions_to_consider @ val_predictions_to_consider.T
                 average_correlation_by_tree = np.average(correlation_matrix, axis=1)

                 # get index of tree to remove
                 index_worse_tree = int(np.argmax(average_correlation_by_tree))  # correlation and MSE: both greater is worse
+            else:
+                raise ValueError("Unknown similarity method {}. Should be {} or {}".format(param, self.similarity_similarities, self.similarity_predictions))

             index_worse_tree_in_base_forest = idx_trees_to_consider[index_worse_tree]
             trees_to_remove.append(tree_list[index_worse_tree_in_base_forest])
             mask_trees_to_consider[index_worse_tree_in_base_forest] = False

@@ -108,16 +105,50 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
         pruned_forest = list(set(tree_list) - set(trees_to_remove))
         self._selected_trees = pruned_forest
         self._estimator.estimators_ = pruned_forest

     def score(self, X, y):
-        test_predictions = np.empty((len(self._estimator.estimators_), X.shape[0]))
-        for idx_tree, mod in enumerate(self._estimator.estimators_):
-            test_predictions[idx_tree, :] = mod.predict(X)
-        test_mean = np.mean(test_predictions, axis=0)
-        score = self._score_metric(test_mean, y)[0]
+        final_predictions = self.predict(X)
+        score = self._score_metric(final_predictions, y)[0]
         return score

-    def predict_base_estimator(self, X):
-        return self._estimator.predict(X)

+    @abstractmethod
+    def _score_metric(self, y_preds, y_true):
+        pass

+    @abstractmethod
+    def _aggregate(self, predictions):
+        """
+        Aggregate the votes of the predictors in predictions.
+
+        predictions shape: (nb_trees, nb_samples)
+
+        :param predictions:
+        :return:
+        """
+        pass

+    @abstractmethod
+    def _activation(self, leave_one_tree_out_predictions_val):
+        pass


+class SimilarityForestRegressor(SimilarityForest, metaclass=ABCMeta):
+
+    def _aggregate(self, predictions):
+        return aggregation_regression(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_mse(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return predictions


+class SimilarityForestClassifier(SimilarityForest, metaclass=ABCMeta):
+
+    def _aggregate(self, predictions):
+        return aggregation_classification(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_indicator(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return np.sign(predictions)
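
The zero-diagonal trick in fit above evaluates every leave-one-tree-out ensemble with a single matrix product. A small numeric check (hypothetical values, not part of the commit):

import numpy as np

# Validation predictions of L = 3 trees on 2 samples (hypothetical values).
val_predictions = np.array([[1., 2.],
                            [3., 4.],
                            [5., 6.]])
L = val_predictions.shape[0]
zero_diag_matrix = np.full((L, L), 1. / (L - 1))  # 1/(L-1) everywhere ...
np.fill_diagonal(zero_diag_matrix, 0)             # ... and 0 on the diagonal

loo_predictions = zero_diag_matrix @ val_predictions
# Row i holds the average prediction of all trees except tree i:
# row 0 == [(3. + 5.) / 2, (4. + 6.) / 2] == [4., 5.]
# SimilarityForestClassifier then applies np.sign (_activation) so these
# averages become +1/-1 votes before scoring.
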
New file: bolsonaro/models/utils.py

+import numpy as np
+
+
+def score_metric_mse(y_preds, y_true):
+    if len(y_true.shape) == 1:
+        y_true = y_true[np.newaxis, :]
+    if len(y_preds.shape) == 1:
+        y_preds = y_preds[np.newaxis, :]
+    assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
+    diff = y_preds - y_true
+    squared_diff = diff ** 2
+    mean_squared_diff = np.mean(squared_diff, axis=1)
+    return mean_squared_diff
+
+
+def score_metric_indicator(y_preds, y_true):
+    if len(y_true.shape) == 1:
+        y_true = y_true[np.newaxis, :]
+    if len(y_preds.shape) == 1:
+        y_preds = y_preds[np.newaxis, :]
+    assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
+    bool_arr_correct_predictions = y_preds == y_true
+    return np.average(bool_arr_correct_predictions, axis=1)
+
+
+def aggregation_classification(predictions):
+    return np.sign(np.sum(predictions, axis=0))
+
+
+def aggregation_regression(predictions):
+    return np.mean(predictions, axis=0)
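
For reference, a short example of how these helpers compose for a binary classifier whose labels are encoded as +1/-1 (hypothetical values, not part of the commit):

import numpy as np
from bolsonaro.models.utils import score_metric_indicator, aggregation_classification

y_true = np.array([1, -1, 1, 1])
tree_preds = np.array([[ 1, -1,  1, -1],   # one row of predictions per tree
                       [ 1,  1,  1,  1],
                       [-1, -1,  1,  1]])

per_tree_accuracy = score_metric_indicator(tree_preds, y_true)  # [0.75, 0.75, 0.75]
majority_vote = aggregation_classification(tree_preds)          # [ 1, -1,  1,  1]
# Caveat: sign-of-sum voting assumes +1/-1 labels, and a tied vote yields 0;
# Trainer remaps such zeros to +1 (np.where(y_pred == 0, 1, y_pred) below).
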
 from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
-from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
 from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.error_handling.logger_factory import LoggerFactory

@@ -122,7 +122,7 @@ class Trainer(object):
             y_pred = np.sign(y_pred)
             y_pred = np.where(y_pred == 0, 1, y_pred)
             result = self._classification_score_metric(y_true, y_pred)
-        elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor, KMeansForestClassifier]:
+        elif type(model) in [SimilarityForestRegressor, SimilarityForestClassifier, KMeansForestRegressor, EnsembleSelectionForestRegressor, KMeansForestClassifier]:
             result = model.score(X, y_true)
         return result
@@ -130,7 +130,7 @@ class Trainer(object):
         if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_regression_score_metric(y_true, y_pred)
-        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
+        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestClassifier]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_classification_score_metric(y_true, y_pred)
         elif type(model) == RandomForestClassifier:

@@ -139,8 +139,6 @@ class Trainer(object):
         elif type(model) is RandomForestRegressor:
             y_pred = model.predict(X)
             result = self._base_regression_score_metric(y_true, y_pred)
-        elif type(model) in [KMeansForestClassifier]:
-            result = model.score(X, y_true)
         return result

     def compute_results(self, model, models_dir):