Commit 96b83f3a authored by Luc Giffon

Master inheritance for state-of-the-art pruning techniques

parent bd349760
1 merge request: !23 Resolve "integration-sota"
New file bolsonaro/models/forest_pruning_sota.py:

import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm

from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator


class ForestPruningSOTA(BaseEstimator, metaclass=ABCMeta):

    def __init__(self, models_parameters):
        self._models_parameters = models_parameters
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._selected_trees = list()
        self._base_estimator = self.init_estimator(models_parameters)

    @staticmethod
    @abstractmethod
    def init_estimator(model_parameters):
        pass

    @abstractmethod
    def _fit(self, X_train, y_train, X_val, y_val):
        pass

    @property
    def models_parameters(self):
        return self._models_parameters

    @property
    def selected_trees(self):
        return self._selected_trees

    def fit(self, X_train, y_train, X_val, y_val):
        # Delegate the actual pruning to the subclass, then check and store the result.
        pruned_forest = self._fit(X_train, y_train, X_val, y_val)
        assert len(pruned_forest) == self._extracted_forest_size, \
            "Pruned forest size does not match the expected size: {} != {}".format(
                len(pruned_forest), self._extracted_forest_size)
        self._selected_trees = pruned_forest

    def _base_estimator_predictions(self, X):
        # One column of predictions per tree of the full forest: shape (nb_samples, nb_trees).
        base_predictions = np.array([tree.predict(X) for tree in self._base_estimator.estimators_]).T
        return base_predictions

    def _selected_tree_predictions(self, X):
        # One column of predictions per selected tree: shape (nb_samples, nb_trees).
        base_predictions = np.array([tree.predict(X) for tree in self.selected_trees]).T
        return base_predictions

    def predict(self, X):
        # Transpose back to (nb_trees, nb_samples) before aggregating the per-tree votes.
        predictions = self._selected_tree_predictions(X).T
        final_predictions = self._aggregate(predictions)
        return final_predictions

    def predict_base_estimator(self, X):
        return self._base_estimator.predict(X)

    def score(self, X, y):
        final_predictions = self.predict(X)
        score = self._score_metric(final_predictions, y)[0]
        return score

    @staticmethod
    @abstractmethod
    def _best_score_idx(array):
        """
        Return the index of the best element in array.

        :param array:
        :return:
        """
        pass

    @staticmethod
    @abstractmethod
    def _worse_score_idx(array):
        """
        Return the index of the worst element in array.

        :param array:
        :return:
        """
        pass

    @abstractmethod
    def _score_metric(self, y_preds, y_true):
        """
        Get the score of each predictor in y_preds.

        y_preds.shape == (nb_trees, nb_sample)
        y_true.shape == (1, nb_sample)

        :param y_preds:
        :param y_true:
        :return:
        """
        pass

    @abstractmethod
    def _aggregate(self, predictions):
        """
        Aggregate the votes of the predictors in predictions.

        predictions shape: (nb_trees, nb_samples)

        :param predictions:
        :return:
        """
        pass
\ No newline at end of file
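To make the contract concrete: a subclass only has to provide init_estimator, _fit and the four scoring/aggregation hooks, and fit() then checks that _fit returned exactly extracted_forest_size trees before storing them as selected_trees. Below is a minimal, hypothetical sketch of such a subclass, assuming the bolsonaro package above is importable; DummyTopKPruningRegressor, its naive "keep the first k trees" rule and the SimpleNamespace stand-in for models_parameters are illustrative only, not part of the repository.

from types import SimpleNamespace

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA


class DummyTopKPruningRegressor(ForestPruningSOTA):
    """Illustrative subclass: keeps the first `extracted_forest_size` trees of the forest."""

    @staticmethod
    def init_estimator(model_parameters):
        # Build the full forest that will later be pruned.
        return RandomForestRegressor(**model_parameters.hyperparameters)

    def _fit(self, X_train, y_train, X_val, y_val):
        self._base_estimator.fit(X_train, y_train)
        # _fit must return exactly `extracted_forest_size` fitted trees;
        # the base class asserts this and stores them as `selected_trees`.
        return self._base_estimator.estimators_[:self._extracted_forest_size]

    @staticmethod
    def _best_score_idx(array):
        return np.argmin(array)   # regression: lower error is better

    @staticmethod
    def _worse_score_idx(array):
        return np.argmax(array)

    def _score_metric(self, y_preds, y_true):
        # MSE of each predictor; accepts (nb_trees, nb_samples) or a single (nb_samples,) row.
        y_preds = np.atleast_2d(y_preds)
        return np.mean((y_preds - np.asarray(y_true).reshape(1, -1)) ** 2, axis=1)

    def _aggregate(self, predictions):
        # Average the per-tree predictions: (nb_trees, nb_samples) -> (nb_samples,).
        return np.mean(predictions, axis=0)


if __name__ == "__main__":
    X, y = make_regression(n_samples=200, n_features=10, random_state=0)
    params = SimpleNamespace(extracted_forest_size=5,
                             hyperparameters={"n_estimators": 20, "random_state": 0})
    model = DummyTopKPruningRegressor(params)
    model.fit(X[:150], y[:150], X[150:], y[150:])
    print(len(model.selected_trees), model.score(X[150:], y[150:]))

Keeping fit(), predict() and score() concrete in the base class and pushing only _fit and the hooks into subclasses is what lets the size assertion and the selected_trees bookkeeping live in one place, which is exactly the duplication removed from the two classes below.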
Diff of KmeansForest and its subclasses (removed lines prefixed with -, added lines with +):

 import time
+from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA
 from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression
 from bolsonaro.utils import tqdm_joblib
@@ -14,38 +15,12 @@ from joblib import Parallel, delayed
 from tqdm import tqdm

-class KmeansForest(BaseEstimator, metaclass=ABCMeta):
+class KmeansForest(ForestPruningSOTA, metaclass=ABCMeta):
     """
     'On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
     """
-    def __init__(self, models_parameters):
-        self._models_parameters = models_parameters
-        self._extracted_forest_size = self._models_parameters.extracted_forest_size
-        self._selected_trees = list()
-        self._base_estimator = self.init_estimator(models_parameters)
-
-    @staticmethod
-    @abstractmethod
-    def init_estimator(model_parameters):
-        pass
-
-    def _base_estimator_predictions(self, X):
-        base_predictions = np.array([tree.predict(X) for tree in self._base_estimator.estimators_]).T
-        return base_predictions
-
-    def _selected_tree_predictions(self, X):
-        base_predictions = np.array([tree.predict(X) for tree in self.selected_trees]).T
-        return base_predictions
-
-    @property
-    def models_parameters(self):
-        return self._models_parameters
-
-    @property
-    def selected_trees(self):
-        return self._selected_trees
-
-    def fit(self, X_train, y_train, X_val, y_val):
+    def _fit(self, X_train, y_train, X_val, y_val):
         self._base_estimator.fit(X_train, y_train)
         predictions_val = self._base_estimator_predictions(X_val).T
@@ -62,65 +37,15 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
             best_tree_index = self._get_best_tree_index(predictions_val_cluster, y_val)
             lst_pruned_forest.append(self._base_estimator.estimators_[index_trees_cluster[best_tree_index]])
-        self._selected_trees = lst_pruned_forest
+        return lst_pruned_forest
-
-    def score(self, X, y):
-        final_predictions = self.predict(X)
-        score = self._score_metric(final_predictions, y)[0]
-        return score
-
-    def predict(self, X):
-        predictions = self._selected_tree_predictions(X).T
-        final_predictions = self._aggregate(predictions)
-        return final_predictions
-
-    def predict_base_estimator(self, X):
-        return self._base_estimator.predict(X)

     def _get_best_tree_index(self, y_preds, y_true):
         score = self._score_metric(y_preds, y_true)
-        best_tree_index = self._best(score)  # get best scoring tree (the one with lowest mse)
+        best_tree_index = self._best_score_idx(score)  # get best scoring tree (the one with lowest mse)
         return best_tree_index
-
-    @abstractmethod
-    def _score_metric(self, y_preds, y_true):
-        """
-        get score of each predictor in y_preds
-        y_preds.shape == (nb_trees, nb_sample)
-        y_true.shape == (1, nb_sample)
-        :param y_preds:
-        :param y_true:
-        :return:
-        """
-        pass
-
-    @staticmethod
-    @abstractmethod
-    def _best(array):
-        """
-        return index of best element in array
-        :param array:
-        :return:
-        """
-        pass
-
-    @abstractmethod
-    def _aggregate(self, predictions):
-        """
-        Aggregates votes of predictors in predictions
-        predictions shape: (nb_trees, nb_samples)
-        :param predictions:
-        :return:
-        """
-        pass


 class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):

     @staticmethod
     def init_estimator(model_parameters):
         return RandomForestRegressor(**model_parameters.hyperparameters,
@@ -133,12 +58,15 @@ class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):
         return score_metric_mse(y_preds, y_true)

     @staticmethod
-    def _best(array):
+    def _best_score_idx(array):
         return np.argmin(array)

+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmax(array)


 class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):

     @staticmethod
     def init_estimator(model_parameters):
         return RandomForestClassifier(**model_parameters.hyperparameters,
@@ -161,5 +89,9 @@ class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):
         return predictions

     @staticmethod
-    def _best(array):
+    def _best_score_idx(array):
         return np.argmax(array)

+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmin(array)
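The clustering step of KmeansForest._fit is collapsed in the hunks above. As a rough standalone sketch of the idea referenced by the Fawagreh et al. docstring (cluster the trees by their validation predictions, then keep the best-scoring tree of each cluster), the snippet below is illustrative only, not the repository's code: the synthetic data, the direct use of sklearn's KMeans and the per-tree mean_squared_error scoring are all assumptions standing in for the project's own utilities.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=300, n_features=10, random_state=0)
X_train, y_train, X_val, y_val = X[:200], y[:200], X[200:], y[200:]
extracted_forest_size = 5

forest = RandomForestRegressor(n_estimators=30, random_state=0).fit(X_train, y_train)

# One row of validation predictions per tree: shape (nb_trees, nb_val_samples).
predictions_val = np.array([tree.predict(X_val) for tree in forest.estimators_])

# Group trees whose validation predictions are similar.
labels = KMeans(n_clusters=extracted_forest_size, random_state=0).fit_predict(predictions_val)

pruned_forest = []
for cluster in range(extracted_forest_size):
    index_trees_cluster = np.where(labels == cluster)[0]
    # Keep the best-scoring tree of the cluster (lowest validation MSE for regression).
    cluster_scores = [mean_squared_error(y_val, predictions_val[i]) for i in index_trees_cluster]
    pruned_forest.append(forest.estimators_[index_trees_cluster[int(np.argmin(cluster_scores))]])

print(len(pruned_forest))  # == extracted_forest_size

With extracted_forest_size clusters and one tree kept per cluster, the pruned forest automatically has the expected size, which is what the base class's assertion in fit() verifies.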
Diff of SimilarityForest and its subclasses:

@@ -7,53 +7,18 @@ from abc import abstractmethod, ABCMeta
 import numpy as np
 from tqdm import tqdm
+from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA
 from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator


-class SimilarityForest(BaseEstimator, metaclass=ABCMeta):
+class SimilarityForest(ForestPruningSOTA, metaclass=ABCMeta):
     """
     https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
     """
     similarity_similarities = "similarity_similarities"
     similarity_predictions = "similarity_predictions"

-    def __init__(self, models_parameters):
-        self._models_parameters = models_parameters
-        self._extracted_forest_size = self._models_parameters.extracted_forest_size
-        self._selected_trees = list()
-        self._base_estimator = self.init_estimator(models_parameters)
-
-    @staticmethod
-    @abstractmethod
-    def init_estimator(model_parameters):
-        pass
-
-    @property
-    def models_parameters(self):
-        return self._models_parameters
-
-    @property
-    def selected_trees(self):
-        return self._selected_trees
-
-    def _base_estimator_predictions(self, X):
-        base_predictions = np.array([tree.predict(X) for tree in self._base_estimator.estimators_]).T
-        return base_predictions
-
-    def _selected_tree_predictions(self, X):
-        base_predictions = np.array([tree.predict(X) for tree in self.selected_trees]).T
-        return base_predictions
-
-    def predict(self, X):
-        predictions = self._selected_tree_predictions(X).T
-        final_predictions = self._aggregate(predictions)
-        return final_predictions
-
-    def predict_base_estimator(self, X):
-        return self._base_estimator.predict(X)
-
-    def fit(self, X_train, y_train, X_val, y_val):
+    def _fit(self, X_train, y_train, X_val, y_val):
         self._base_estimator.fit(X_train, y_train)
         param = self._models_parameters.extraction_strategy
@@ -91,7 +56,7 @@ class SimilarityForest(BaseEstimator, metaclass=ABCMeta):
                 # delta_score = forest_score - leave_one_tree_out_scores_val
                 # get index of tree to remove
-                index_worse_tree = int(np.argmax(leave_one_tree_out_scores_val))  # correlation and MSE: both greater is worse
+                index_worse_tree = int(self._worse_score_idx(leave_one_tree_out_scores_val))
             elif param == self.similarity_similarities:
                 correlation_matrix = val_predictions_to_consider @ val_predictions_to_consider.T
@@ -109,34 +74,14 @@ class SimilarityForest(BaseEstimator, metaclass=ABCMeta):
             pruning_forest_bar.update(1)
         pruned_forest = list(set(tree_list) - set(trees_to_remove))
-        self._selected_trees = pruned_forest
+        return pruned_forest
-
-    def score(self, X, y):
-        final_predictions = self.predict(X)
-        score = self._score_metric(final_predictions, y)[0]
-        return score
-
-    @abstractmethod
-    def _score_metric(self, y_preds, y_true):
-        pass
-
-    @abstractmethod
-    def _aggregate(self, predictions):
-        """
-        Aggregates votes of predictors in predictions
-        predictions shape: (nb_trees, nb_samples)
-        :param predictions:
-        :return:
-        """
-        pass

     @abstractmethod
     def _activation(self, leave_one_tree_out_predictions_val):
         pass


 class SimilarityForestRegressor(SimilarityForest, metaclass=ABCMeta):

     @staticmethod
@@ -153,6 +98,13 @@ class SimilarityForestRegressor(SimilarityForest, metaclass=ABCMeta):
     def _activation(self, predictions):
         return predictions

+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmin(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmax(array)


 class SimilarityForestClassifier(SimilarityForest, metaclass=ABCMeta):
@@ -179,3 +131,11 @@ class SimilarityForestClassifier(SimilarityForest, metaclass=ABCMeta):
         predictions_0_1 = super()._base_estimator_predictions(X)
         predictions = (predictions_0_1 - 0.5) * 2
         return predictions

+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmax(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmin(array)
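Most of SimilarityForest._fit is likewise collapsed, but the visible lines show two extraction strategies (similarity_predictions and similarity_similarities) that appear to accumulate trees_to_remove until only extracted_forest_size trees are left, each step dropping the tree flagged by _worse_score_idx. The snippet below is a loose standalone sketch of the leave-one-tree-out flavour of that idea, i.e. greedy backward elimination of the tree whose removal leaves the best remaining ensemble on the validation set; every name and the exact removal criterion are illustrative assumptions, not the repository's code.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X, y = make_regression(n_samples=300, n_features=10, random_state=0)
X_train, y_train, X_val, y_val = X[:200], y[:200], X[200:], y[200:]
extracted_forest_size = 5

forest = RandomForestRegressor(n_estimators=20, random_state=0).fit(X_train, y_train)
tree_list = list(forest.estimators_)

# Per-tree validation predictions, shape (nb_trees, nb_val_samples).
val_predictions = np.array([tree.predict(X_val) for tree in tree_list])

kept = list(range(len(tree_list)))
while len(kept) > extracted_forest_size:
    # Leave-one-tree-out: score the ensemble obtained by dropping each remaining tree.
    leave_one_tree_out_scores_val = []
    for i in range(len(kept)):
        remaining = [kept[j] for j in range(len(kept)) if j != i]
        ensemble_pred = val_predictions[remaining].mean(axis=0)
        leave_one_tree_out_scores_val.append(mean_squared_error(y_val, ensemble_pred))
    # Remove the tree whose absence hurts the ensemble least (lowest remaining MSE),
    # i.e. the tree that contributes least to the current ensemble.
    kept.pop(int(np.argmin(leave_one_tree_out_scores_val)))

pruned_forest = [tree_list[i] for i in kept]
print(len(pruned_forest))  # == extracted_forest_size

Because the selection now happens inside _fit and the method returns the pruned list, the size check and the storage in selected_trees are handled once, in ForestPruningSOTA.fit, for both the k-means and the similarity variants.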