Commit bab07f41 authored by Luc Giffon

similarityforest now handles classification

parent 6992da59
Merge request !23: Resolve "integration-sota"
In bolsonaro/models/kmeans_forest_regressor.py:

 import time
+from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression
 from bolsonaro.utils import tqdm_joblib
 from sklearn.ensemble import RandomForestRegressor
@@ -53,72 +54,85 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
             lst_pruned_forest.append(self._estimator.estimators_[index_trees_cluster[best_tree_index]])

         self._selected_trees = lst_pruned_forest
-        self._estimator.estimators_ = lst_pruned_forest
+        # self._estimator.estimators_ = lst_pruned_forest

     def score(self, X, y):
-        predictions = np.empty((len(self._estimator.estimators_), X.shape[0]))
-        for idx_tree, tree in enumerate(self._estimator.estimators_):
-            predictions[idx_tree, :] = tree.predict(X)
-        final_predictions = self._aggregate(predictions)
+        final_predictions = self.predict(X)
         score = self._score_metric(final_predictions, y)[0]
         return score

     def predict(self, X):
-        return self._estimator.predict(X)
+        predictions = np.empty((len(self._selected_trees), X.shape[0]))
+        for idx_tree, tree in enumerate(self._selected_trees):
+            predictions[idx_tree, :] = tree.predict(X)
+        final_predictions = self._aggregate(predictions)
+        return final_predictions

     def predict_base_estimator(self, X):
         return self._estimator.predict(X)

+    def _get_best_tree_index(self, y_preds, y_true):
+        score = self._score_metric(y_preds, y_true)
+        best_tree_index = self._best(score)  # index of the best-scoring tree
+        return best_tree_index
+
     @abstractmethod
     def _score_metric(self, y_preds, y_true):
+        """
+        Get the score of each predictor in y_preds.
+
+        y_preds.shape == (nb_trees, nb_samples)
+        y_true.shape == (1, nb_samples)
+        """
         pass

+    @staticmethod
     @abstractmethod
-    def _get_best_tree_index(self, y_preds, y_true):
+    def _best(array):
+        """
+        Return the index of the best element in array.
+        """
         pass

     @abstractmethod
     def _aggregate(self, predictions):
+        """
+        Aggregate the votes of the predictors in predictions.
+
+        predictions.shape == (nb_trees, nb_samples)
+        """
         pass


 class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):

     def _aggregate(self, predictions):
-        return np.mean(predictions, axis=0)
+        return aggregation_regression(predictions)

     def _score_metric(self, y_preds, y_true):
-        if len(y_true.shape) == 1:
-            y_true = y_true[np.newaxis, :]
-        if len(y_preds.shape) == 1:
-            y_preds = y_preds[np.newaxis, :]
-        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        diff = y_preds - y_true
-        squared_diff = diff ** 2
-        mean_squared_diff = np.mean(squared_diff, axis=1)
-        return mean_squared_diff
+        return score_metric_mse(y_preds, y_true)

-    def _get_best_tree_index(self, y_preds, y_true):
-        score = self._score_metric(y_preds, y_true)
-        best_tree_index = np.argmin(score)  # the tree with the lowest mse
-        return best_tree_index
+    @staticmethod
+    def _best(array):
+        return np.argmin(array)  # lowest mse wins


 class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):

     def _aggregate(self, predictions):
-        return np.sign(np.sum(predictions, axis=0))
+        return aggregation_classification(predictions)

     def _score_metric(self, y_preds, y_true):
-        if len(y_true.shape) == 1:
-            y_true = y_true[np.newaxis, :]
-        if len(y_preds.shape) == 1:
-            y_preds = y_preds[np.newaxis, :]
-        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        bool_arr_correct_predictions = y_preds == y_true
-        return np.average(bool_arr_correct_predictions, axis=1)
+        return score_metric_indicator(y_preds, y_true)

-    def _get_best_tree_index(self, y_preds, y_true):
-        score = self._score_metric(y_preds, y_true)
-        best_tree_index = np.argmax(score)  # the tree with the highest accuracy
-        return best_tree_index
+    @staticmethod
+    def _best(array):
+        return np.argmax(array)  # highest accuracy wins
\ No newline at end of file
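
Note on the hunk above: KmeansForest is now a template-method base class. The shared _get_best_tree_index scores every tree of a cluster with _score_metric and delegates the choice to _best, so the only difference between regressor and classifier is the direction of "best". A minimal standalone sketch with toy arrays (not project code) of why _best is np.argmin for MSE but np.argmax for accuracy:

import numpy as np

# Toy predictions of 3 trees on 4 validation samples (regression case).
y_true = np.array([[1.0, 2.0, 3.0, 4.0]])            # shape (1, nb_samples)
y_preds = np.array([[1.1, 2.0, 2.9, 4.2],            # shape (nb_trees, nb_samples)
                    [0.0, 0.0, 0.0, 0.0],
                    [1.0, 2.1, 3.0, 3.9]])

mse_per_tree = np.mean((y_preds - y_true) ** 2, axis=1)       # what score_metric_mse computes
print(np.argmin(mse_per_tree))   # -> 2: KMeansForestRegressor._best, lowest MSE wins

# Same idea for classification with {-1, +1} labels: per-tree accuracy.
y_true_cls = np.array([[1, -1, 1, 1]])
y_preds_cls = np.array([[1, -1, -1, 1],
                        [1, -1, 1, 1]])
acc_per_tree = np.average(y_preds_cls == y_true_cls, axis=1)  # what score_metric_indicator computes
print(np.argmax(acc_per_tree))   # -> 1: KMeansForestClassifier._best, highest accuracy wins
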
In the ModelFactory:

 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
-from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
 from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.data.task import Task
@@ -29,6 +29,8 @@ class ModelFactory(object):
                     random_state=model_parameters.seed)
             elif model_parameters.extraction_strategy == 'kmeans':
                 return KMeansForestClassifier(model_parameters)
+            elif model_parameters.extraction_strategy == 'similarity':
+                return SimilarityForestClassifier(model_parameters)
             else:
                 raise ValueError('Invalid extraction strategy')
         elif task == Task.REGRESSION:
......
In bolsonaro/models/similarity_forest_regressor.py:

@@ -7,17 +7,22 @@ from abc import abstractmethod, ABCMeta
 import numpy as np
 from tqdm import tqdm

+from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator
+

-class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
+class SimilarityForest(BaseEstimator, metaclass=ABCMeta):
     """
     https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
     """
+    similarity_similarities = "similarity_similarities"
+    similarity_predictions = "similarity_predictions"

     def __init__(self, models_parameters):
         self._models_parameters = models_parameters
         self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
                                                 random_state=self._models_parameters.seed, n_jobs=-1)
         self._extracted_forest_size = self._models_parameters.extracted_forest_size
+        self._selected_trees = list()

     @property
     def models_parameters(self):
@@ -27,32 +32,20 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
     def selected_trees(self):
         return self._selected_trees

-    def _score_metric(self, y_preds, y_true):
-        if len(y_true.shape) == 1:
-            y_true = y_true[np.newaxis, :]
-        if len(y_preds.shape) == 1:
-            y_preds = y_preds[np.newaxis, :]
-        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        diff = y_preds - y_true
-        squared_diff = diff ** 2
-        mean_squared_diff = np.mean(squared_diff, axis=1)
-        return mean_squared_diff
+    def predict(self, X):
+        predictions = np.empty((len(self._selected_trees), X.shape[0]))
+        for idx_tree, tree in enumerate(self._selected_trees):
+            predictions[idx_tree, :] = tree.predict(X)
+        final_predictions = self._aggregate(predictions)
+        return final_predictions
+
+    def predict_base_estimator(self, X):
+        return self._estimator.predict(X)

     def fit(self, X_train, y_train, X_val, y_val):
         self._estimator.fit(X_train, y_train)

-        # param = self._models_parameters.extraction_strategy
-        param = "similarity_predictions"
-        #
-        # if param == "similarity_similarities":
-        #     pass
-        # elif param == "similarity_predictions":
-        #     pass
-        # else:
-        #     raise ValueError
+        param = self._models_parameters.extraction_strategy

         # get score of base forest on val
         tree_list = list(self._estimator.estimators_)
@@ -78,7 +71,7 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
             val_predictions_to_consider = val_predictions[idx_trees_to_consider]
             nb_trees_to_consider = val_predictions_to_consider.shape[0]

-            if param == "similarity_predictions":
+            if param == self.similarity_predictions:
                 # this matrix has zeros on the diagonal and 1/(L-1) everywhere else.
                 # Left-multiplying the matrix of predictions (L lines) by this zero_diag_matrix (square, of size L)
                 # gives, on each line, the average of all the other lines of the initial prediction matrix.
@@ -86,6 +79,7 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
                 np.fill_diagonal(zero_diag_matrix, 0)

                 leave_one_tree_out_predictions_val = zero_diag_matrix @ val_predictions_to_consider
+                leave_one_tree_out_predictions_val = self._activation(leave_one_tree_out_predictions_val)  # identity for regression; sign for classification
                 leave_one_tree_out_scores_val = self._score_metric(leave_one_tree_out_predictions_val, y_val)
                 # difference with the base forest is actually useless
                 # delta_score = forest_score - leave_one_tree_out_scores_val
@@ -93,13 +87,16 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
                 # get index of the tree to remove
                 index_worse_tree = int(np.argmax(leave_one_tree_out_scores_val))  # greater is worse for both correlation and MSE
-            elif param == "similarity_similarities":
+            elif param == self.similarity_similarities:
                 correlation_matrix = val_predictions_to_consider @ val_predictions_to_consider.T
                 average_correlation_by_tree = np.average(correlation_matrix, axis=1)

                 # get index of the tree to remove
                 index_worse_tree = int(np.argmax(average_correlation_by_tree))  # greater is worse for both correlation and MSE
+            else:
+                raise ValueError("Unknown similarity method {}. Should be {} or {}".format(param, self.similarity_similarities, self.similarity_predictions))

             index_worse_tree_in_base_forest = idx_trees_to_consider[index_worse_tree]
             trees_to_remove.append(tree_list[index_worse_tree_in_base_forest])
             mask_trees_to_consider[index_worse_tree_in_base_forest] = False
@@ -108,16 +105,50 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
         pruned_forest = list(set(tree_list) - set(trees_to_remove))
         self._selected_trees = pruned_forest
-        self._estimator.estimators_ = pruned_forest

     def score(self, X, y):
-        test_predictions = np.empty((len(self._estimator.estimators_), X.shape[0]))
-        for idx_tree, mod in enumerate(self._estimator.estimators_):
-            test_predictions[idx_tree, :] = mod.predict(X)
-        test_mean = np.mean(test_predictions, axis=0)
-        score = self._score_metric(test_mean, y)[0]
+        final_predictions = self.predict(X)
+        score = self._score_metric(final_predictions, y)[0]
         return score

-    def predict_base_estimator(self, X):
-        return self._estimator.predict(X)
+    @abstractmethod
+    def _score_metric(self, y_preds, y_true):
+        pass
+
+    @abstractmethod
+    def _aggregate(self, predictions):
+        """
+        Aggregate the votes of the predictors in predictions.
+
+        predictions.shape == (nb_trees, nb_samples)
+        """
+        pass
+
+    @abstractmethod
+    def _activation(self, leave_one_tree_out_predictions_val):
+        pass
+
+
+class SimilarityForestRegressor(SimilarityForest, metaclass=ABCMeta):
+
+    def _aggregate(self, predictions):
+        return aggregation_regression(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_mse(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return predictions
+
+
+class SimilarityForestClassifier(SimilarityForest, metaclass=ABCMeta):
+
+    def _aggregate(self, predictions):
+        return aggregation_classification(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_indicator(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return np.sign(predictions)
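
The zero_diag_matrix construction in the hunk above computes all L leave-one-tree-out ensembles in a single matrix product: with zeros on the diagonal and 1/(L-1) everywhere else, row i of zero_diag_matrix @ val_predictions is the average prediction of every tree except tree i. A self-contained check with toy data (the naive loop is only a reference; np.sign plays the role of the classifier's _activation):

import numpy as np

L, n = 4, 5                                    # 4 trees, 5 validation samples
rng = np.random.default_rng(0)
val_predictions = rng.normal(size=(L, n))      # per-tree predictions, shape (L, n)

# zeros on the diagonal, 1/(L-1) everywhere else
zero_diag_matrix = np.ones((L, L)) / (L - 1)
np.fill_diagonal(zero_diag_matrix, 0)

loo = zero_diag_matrix @ val_predictions       # row i = mean of all rows except row i

# naive reference: drop tree i, average the remaining trees
for i in range(L):
    expected = np.delete(val_predictions, i, axis=0).mean(axis=0)
    assert np.allclose(loo[i], expected)

# classification variant: _activation = np.sign maps the averaged votes back to labels
labels = np.sign(loo)
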
New file bolsonaro/models/utils.py:

+import numpy as np
+
+
+def score_metric_mse(y_preds, y_true):
+    if len(y_true.shape) == 1:
+        y_true = y_true[np.newaxis, :]
+    if len(y_preds.shape) == 1:
+        y_preds = y_preds[np.newaxis, :]
+    assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
+    diff = y_preds - y_true
+    squared_diff = diff ** 2
+    mean_squared_diff = np.mean(squared_diff, axis=1)
+    return mean_squared_diff
+
+
+def score_metric_indicator(y_preds, y_true):
+    if len(y_true.shape) == 1:
+        y_true = y_true[np.newaxis, :]
+    if len(y_preds.shape) == 1:
+        y_preds = y_preds[np.newaxis, :]
+    assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
+    bool_arr_correct_predictions = y_preds == y_true
+    return np.average(bool_arr_correct_predictions, axis=1)
+
+
+def aggregation_classification(predictions):
+    return np.sign(np.sum(predictions, axis=0))
+
+
+def aggregation_regression(predictions):
+    return np.mean(predictions, axis=0)
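
The helpers above share one shape convention: predictions are (nb_trees, nb_samples), the score_metric_* functions return one score per tree, and the aggregation_* functions collapse the tree axis. A short usage sketch with toy arrays (assumes the bolsonaro package, and hence the new module, is on the import path):

import numpy as np
from bolsonaro.models.utils import (score_metric_mse, score_metric_indicator,
                                    aggregation_regression, aggregation_classification)

y_true = np.array([1.0, 2.0, 3.0])                  # 1-D input is promoted to shape (1, 3)
preds = np.array([[1.0, 2.5, 3.0],                  # shape (nb_trees=2, nb_samples=3)
                  [0.5, 2.0, 3.5]])

print(score_metric_mse(preds, y_true))              # per-tree MSE, shape (2,)
print(aggregation_regression(preds))                # mean over trees, shape (3,)

votes = np.array([[1, -1, 1],                       # {-1, +1} votes of 3 trees
                  [1, 1, -1],
                  [1, 1, 1]])
print(aggregation_classification(votes))            # sign of the vote sum -> [1 1 1]
print(score_metric_indicator(votes, np.array([1, 1, 1])))  # per-tree accuracy
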
In the Trainer:

 from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
-from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
 from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.error_handling.logger_factory import LoggerFactory
@@ -122,7 +122,7 @@ class Trainer(object):
             y_pred = np.sign(y_pred)
             y_pred = np.where(y_pred == 0, 1, y_pred)
             result = self._classification_score_metric(y_true, y_pred)
-        elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor, KMeansForestClassifier]:
+        elif type(model) in [SimilarityForestRegressor, SimilarityForestClassifier, KMeansForestRegressor, EnsembleSelectionForestRegressor, KMeansForestClassifier]:
             result = model.score(X, y_true)
         return result
@@ -130,7 +130,7 @@ class Trainer(object):
         if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_regression_score_metric(y_true, y_pred)
-        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
+        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestClassifier]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_classification_score_metric(y_true, y_pred)
         elif type(model) == RandomForestClassifier:
@@ -139,8 +139,6 @@ class Trainer(object):
         elif type(model) is RandomForestRegressor:
             y_pred = model.predict(X)
             result = self._base_regression_score_metric(y_true, y_pred)
-        elif type(model) in [KMeansForestClassifier]:
-            result = model.score(X, y_true)
         return result

     def compute_results(self, model, models_dir):
......
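
One detail worth noting in the Trainer hunk above (unchanged lines): averaged {-1, +1} votes can tie at exactly zero, and np.sign(0) is 0, which is not a valid binary label, hence the np.where that maps ties to +1. A minimal illustration with toy values:

import numpy as np

y_raw = np.array([0.6, -0.2, 0.0])          # averaged {-1, +1} votes; the last one is a tie
y_pred = np.sign(y_raw)                     # -> [ 1. -1.  0.]
y_pred = np.where(y_pred == 0, 1, y_pred)   # -> [ 1. -1.  1.]  (ties resolved to +1)
print(y_pred)
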