Skip to content
Snippets Groups Projects
Commit 8d0b3931 authored by Luc Giffon's avatar Luc Giffon
Browse files

implement binary classification with kmeans sota

parent 8ec2871e
No related branches found
No related tags found
1 merge request!23Resolve "integration-sota"
......@@ -13,17 +13,15 @@ from joblib import Parallel, delayed
from tqdm import tqdm
class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
class KmeansForest(BaseEstimator, metaclass=ABCMeta):
"""
'On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
"""
def __init__(self, models_parameters, score_metric=mean_squared_error):
'On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
"""
def __init__(self, models_parameters):
self._models_parameters = models_parameters
self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
random_state=self._models_parameters.seed, n_jobs=2)
random_state=self._models_parameters.seed, n_jobs=2)
self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._score_metric = score_metric
self._selected_trees = list()
@property
......@@ -37,7 +35,6 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
def fit(self, X_train, y_train, X_val, y_val):
self._estimator.fit(X_train, y_train)
predictions_val = np.empty((len(self._estimator.estimators_), X_val.shape[0]))
predictions = np.empty((len(self._estimator.estimators_), X_train.shape[0]))
for i_tree, tree in enumerate(self._estimator.estimators_):
......@@ -48,64 +45,84 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
labels = np.array(kmeans.labels_)
# start_np_version = time.time()
pruned_forest_1 = list()
lst_pruned_forest = list()
for cluster_idx in range(self._extracted_forest_size): # pourrait être parallelise
index_trees_cluster = np.where(labels == cluster_idx)[0]
index_trees_cluster = np.where(labels == cluster_idx)[0]
predictions_val_cluster = predictions_val[index_trees_cluster] # get predictions of trees in cluster
if self._score_metric == mean_squared_error:
# compute the mean squared error of all trees at once usng numpy machinery
diff = predictions_val_cluster - y_val
squared_diff = diff ** 2
mean_squared_diff = np.mean(squared_diff, axis=1)
best_tree_index = np.argmin(mean_squared_diff) # get best scoring tree (the one with lowest mse)
pruned_forest_1.append(self._estimator.estimators_[index_trees_cluster[best_tree_index]])
else:
raise ValueError
# stop_np_version = time.time()
# print("Time np {}".format(stop_np_version - start_np_version))
# start_paralel_version = time.time()
# # For each cluster select the best tree on the validation set
# extracted_forest_sizes = list(range(self._extracted_forest_size))
# with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
# pruned_forest = Parallel(n_jobs=2)(delayed(self._prune_forest_job)(prune_forest_job_pb, extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric) for i in range(self._extracted_forest_size))
# stop_paralel_version = time.time()
# print("Time paralel {}".format(stop_paralel_version - start_paralel_version))
# assert all([t1 is t2 for (t1, t2) in zip(pruned_forest_1, pruned_forest)])
self._selected_trees = pruned_forest_1
self._estimator.estimators_ = pruned_forest_1
def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
"""Select, for k-means cluster `c`, the single best tree on the validation set.

Scores every tree whose k-means label equals `c` (in parallel) and returns
the corresponding fitted estimator.
NOTE(review): `np.argmax` assumes a higher `score_metric` value is better;
for an error metric such as MSE this picks the *worst* tree — confirm
against callers (the numpy path in `fit` uses argmin for MSE).
"""
# Positions (in the full forest) of the trees assigned to cluster `c`.
index = np.where(labels == c)[0]
with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
# One validation score per tree of the cluster, computed with 2 workers.
cluster = Parallel(n_jobs=2)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val, y_val, score_metric) for i in range(len(index)))
best_tree_index = np.argmax(cluster)  # position *within the cluster* of the best-scoring tree
prune_forest_job_pb.update()  # tick the per-cluster progress bar
return self._estimator.estimators_[index[best_tree_index]]
def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
"""Return `score_metric` of the i-th tree of the fitted forest on (X_val, y_val)."""
y_val_pred = self._estimator.estimators_[i].predict(X_val)
tree_pred = score_metric(y_val, y_val_pred)
cluster_job_pb.update()  # tick the per-tree progress bar
return tree_pred
best_tree_index = self._get_best_tree_index(predictions_val_cluster, y_val)
lst_pruned_forest.append(self._estimator.estimators_[index_trees_cluster[best_tree_index]])
def predict(self, X):
return self._estimator.predict(X)
self._selected_trees = lst_pruned_forest
self._estimator.estimators_ = lst_pruned_forest
def score(self, X, y):
predictions = list()
for tree in self._estimator.estimators_:
predictions.append(tree.predict(X))
predictions = np.array(predictions)
mean_predictions = np.mean(predictions, axis=0)
score = self._score_metric(mean_predictions, y)
predictions = np.empty((len(self._estimator.estimators_), X.shape[0]))
for idx_tree, tree in enumerate(self._estimator.estimators_):
predictions[idx_tree, :] = tree.predict(X)
final_predictions = self._aggregate(predictions)
score = self._score_metric(final_predictions, y)[0]
return score
def predict(self, X):
"""Predict with the underlying random forest (pruned in-place by `fit`)."""
return self._estimator.predict(X)
def predict_base_estimator(self, X):
"""Predict with the wrapped estimator (currently identical to `predict`)."""
return self._estimator.predict(X)
@abstractmethod
def _score_metric(self, y_preds, y_true):
"""Return one score per row of `y_preds` compared against `y_true` (subclass-defined)."""
pass
@abstractmethod
def _get_best_tree_index(self, y_preds, y_true):
"""Return the index of the best-scoring row of `y_preds` (subclass-defined)."""
pass
@abstractmethod
def _aggregate(self, predictions):
"""Combine per-tree predictions (axis 0 indexes trees) into one prediction (subclass-defined)."""
pass
class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):
    """Regression flavour of the k-means-pruned forest.

    Trees are compared by mean squared error (lower is better) and the
    pruned forest predicts by averaging the per-tree predictions.
    """

    def _aggregate(self, predictions):
        """Average the per-tree predictions; axis 0 indexes the trees."""
        return np.mean(predictions, axis=0)

    def _score_metric(self, y_preds, y_true):
        """Return the MSE of each row of `y_preds` against `y_true`."""
        # Promote 1-D inputs to a single row so the row-wise mean below works.
        if y_true.ndim == 1:
            y_true = y_true[np.newaxis, :]
        if y_preds.ndim == 1:
            y_preds = y_preds[np.newaxis, :]
        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
        return np.mean((y_preds - y_true) ** 2, axis=1)

    def _get_best_tree_index(self, y_preds, y_true):
        """Index of the row of `y_preds` with the lowest MSE."""
        return np.argmin(self._score_metric(y_preds, y_true))
class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):
    """Binary-classification flavour of the k-means-pruned forest.

    Trees are compared by accuracy (higher is better) and the pruned
    forest predicts by a sign-of-sum majority vote — presumably labels
    are encoded as -1/+1; TODO confirm against callers.
    """

    def _aggregate(self, predictions):
        """Majority vote: sign of the summed per-tree votes (axis 0 indexes trees).

        NOTE(review): an exact tie sums to 0 and `np.sign` returns 0, which is
        not a valid -1/+1 label; the Trainer remaps 0 -> 1 for other
        classifiers — consider doing the same here.
        """
        return np.sign(np.sum(predictions, axis=0))

    def _score_metric(self, y_preds, y_true):
        """Return the accuracy of each row of `y_preds` against `y_true`."""
        # Promote 1-D inputs to a single row so the row-wise average below works.
        if len(y_true.shape) == 1:
            y_true = y_true[np.newaxis, :]
        if len(y_preds.shape) == 1:
            y_preds = y_preds[np.newaxis, :]
        assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
        bool_arr_correct_predictions = y_preds == y_true
        return np.average(bool_arr_correct_predictions, axis=1)

    def _get_best_tree_index(self, y_preds, y_true):
        """Index of the row of `y_preds` with the highest accuracy."""
        score = self._score_metric(y_preds, y_true)
        # Accuracy: higher is better, hence argmax (the "lowest mse" comment
        # copied from the regressor was wrong here).
        best_tree_index = np.argmax(score)
        return best_tree_index
if __name__ == "__main__":
from sklearn import datasets
from bolsonaro.models.model_parameters import ModelParameters
......
......@@ -2,7 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.data.task import Task
......@@ -27,6 +27,8 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestClassifier(model_parameters)
else:
raise ValueError('Invalid extraction strategy')
elif task == Task.REGRESSION:
......
......@@ -2,7 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
......@@ -122,7 +122,7 @@ class Trainer(object):
y_pred = np.sign(y_pred)
y_pred = np.where(y_pred == 0, 1, y_pred)
result = self._classification_score_metric(y_true, y_pred)
elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor, KMeansForestClassifier]:
result = model.score(X, y_true)
return result
......@@ -139,6 +139,8 @@ class Trainer(object):
elif type(model) is RandomForestRegressor:
y_pred = model.predict(X)
result = self._base_regression_score_metric(y_true, y_pred)
elif type(model) in [ KMeansForestClassifier]:
result = model.score(X, y_true)
return result
def compute_results(self, model, models_dir):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment