Skip to content
Snippets Groups Projects
Commit 24cb371b authored by Luc Giffon
Browse files

solve bug 0 1 classification problem in omp

parent 5ee9422b
No related branches found
No related tags found
1 merge request: !23 Resolve "integration-sota"
...@@ -3,7 +3,7 @@ import time ...@@ -3,7 +3,7 @@ import time
from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression
from bolsonaro.utils import tqdm_joblib from bolsonaro.utils import tqdm_joblib
from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans from sklearn.cluster import KMeans
...@@ -20,10 +20,22 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): ...@@ -20,10 +20,22 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
""" """
def __init__(self, models_parameters): def __init__(self, models_parameters):
self._models_parameters = models_parameters self._models_parameters = models_parameters
self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
random_state=self._models_parameters.seed, n_jobs=2)
self._extracted_forest_size = self._models_parameters.extracted_forest_size self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._selected_trees = list() self._selected_trees = list()
self._base_estimator = self.init_estimator(models_parameters)
@staticmethod
@abstractmethod
def init_estimator(model_parameters):
    """
    Build the base forest from which trees are later selected.

    Concrete subclasses must override this hook; the constructor calls it and
    stores the result as ``self._base_estimator``.

    :param model_parameters: the parameter object handed to the constructor
    :return: the estimator to use as base forest
    """
    pass
def _base_estimator_predictions(self, X):
base_predictions = np.array([tree.predict(X) for tree in self._base_estimator.estimators_]).T
return base_predictions
def _selected_tree_predictions(self, X):
base_predictions = np.array([tree.predict(X) for tree in self.selected_trees]).T
return base_predictions
@property @property
def models_parameters(self): def models_parameters(self):
...@@ -34,13 +46,10 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): ...@@ -34,13 +46,10 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
return self._selected_trees return self._selected_trees
def fit(self, X_train, y_train, X_val, y_val): def fit(self, X_train, y_train, X_val, y_val):
self._estimator.fit(X_train, y_train) self._base_estimator.fit(X_train, y_train)
predictions_val = np.empty((len(self._estimator.estimators_), X_val.shape[0])) predictions_val = self._base_estimator_predictions(X_val).T
predictions = np.empty((len(self._estimator.estimators_), X_train.shape[0])) predictions = self._base_estimator_predictions(X_train).T
for i_tree, tree in enumerate(self._estimator.estimators_):
predictions_val[i_tree, :] = tree.predict(X_val)
predictions[i_tree, :] = tree.predict(X_train)
kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions) kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
labels = np.array(kmeans.labels_) labels = np.array(kmeans.labels_)
...@@ -51,10 +60,9 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): ...@@ -51,10 +60,9 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
index_trees_cluster = np.where(labels == cluster_idx)[0] index_trees_cluster = np.where(labels == cluster_idx)[0]
predictions_val_cluster = predictions_val[index_trees_cluster] # get predictions of trees in cluster predictions_val_cluster = predictions_val[index_trees_cluster] # get predictions of trees in cluster
best_tree_index = self._get_best_tree_index(predictions_val_cluster, y_val) best_tree_index = self._get_best_tree_index(predictions_val_cluster, y_val)
lst_pruned_forest.append(self._estimator.estimators_[index_trees_cluster[best_tree_index]]) lst_pruned_forest.append(self._base_estimator.estimators_[index_trees_cluster[best_tree_index]])
self._selected_trees = lst_pruned_forest self._selected_trees = lst_pruned_forest
# self._estimator.estimators_ = lst_pruned_forest
def score(self, X, y): def score(self, X, y):
final_predictions = self.predict(X) final_predictions = self.predict(X)
...@@ -62,14 +70,12 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): ...@@ -62,14 +70,12 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
return score return score
def predict(self, X): def predict(self, X):
predictions = np.empty((len(self._selected_trees), X.shape[0])) predictions = self._selected_tree_predictions(X).T
for idx_tree, tree in enumerate(self._selected_trees):
predictions[idx_tree, :] = tree.predict(X)
final_predictions = self._aggregate(predictions) final_predictions = self._aggregate(predictions)
return final_predictions return final_predictions
def predict_base_estimator(self, X): def predict_base_estimator(self, X):
return self._estimator.predict(X) return self._base_estimator.predict(X)
def _get_best_tree_index(self, y_preds, y_true): def _get_best_tree_index(self, y_preds, y_true):
score = self._score_metric(y_preds, y_true) score = self._score_metric(y_preds, y_true)
...@@ -114,6 +120,12 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): ...@@ -114,6 +120,12 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta): class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):
@staticmethod
def init_estimator(model_parameters):
    """
    Create the base forest used by the regression variant.

    :param model_parameters: object exposing ``hyperparameters`` (unpacked as
        keyword arguments) and ``seed`` (passed as ``random_state``)
    :return: a new RandomForestRegressor built with those settings and
        ``n_jobs=2``
    """
    return RandomForestRegressor(
        **model_parameters.hyperparameters,
        random_state=model_parameters.seed,
        n_jobs=2,
    )
def _aggregate(self, predictions): def _aggregate(self, predictions):
return aggregation_regression(predictions) return aggregation_regression(predictions)
...@@ -127,12 +139,27 @@ class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta): ...@@ -127,12 +139,27 @@ class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):
class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta): class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):
@staticmethod
def init_estimator(model_parameters):
    """
    Create the base forest used by the classification variant.

    :param model_parameters: object exposing ``hyperparameters`` (unpacked as
        keyword arguments) and ``seed`` (passed as ``random_state``)
    :return: a new RandomForestClassifier built with those settings and
        ``n_jobs=2``
    """
    return RandomForestClassifier(
        **model_parameters.hyperparameters,
        random_state=model_parameters.seed,
        n_jobs=2,
    )
def _aggregate(self, predictions): def _aggregate(self, predictions):
return aggregation_classification(predictions) return aggregation_classification(predictions)
def _score_metric(self, y_preds, y_true): def _score_metric(self, y_preds, y_true):
return score_metric_indicator(y_preds, y_true) return score_metric_indicator(y_preds, y_true)
def _selected_tree_predictions(self, X):
    """
    Predictions of the selected trees, rescaled with (p - 0.5) * 2.

    The rescaling sends 0 to -1 and 1 to +1.
    NOTE(review): presumably the parent returns labels in {0, 1} -- confirm.

    :param X: samples forwarded to the parent implementation
    :return: the parent's predictions after rescaling
    """
    predictions_0_1 = super()._selected_tree_predictions(X)
    return (predictions_0_1 - 0.5) * 2
def _base_estimator_predictions(self, X):
    """
    Predictions of all base-forest trees, rescaled with (p - 0.5) * 2.

    The rescaling sends 0 to -1 and 1 to +1.
    NOTE(review): presumably the parent returns labels in {0, 1} -- confirm.

    :param X: samples forwarded to the parent implementation
    :return: the parent's predictions after rescaling
    """
    predictions_0_1 = super()._base_estimator_predictions(X)
    return (predictions_0_1 - 0.5) * 2
@staticmethod @staticmethod
def _best(array): def _best(array):
return np.argmax(array) return np.argmax(array)
...@@ -28,7 +28,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): ...@@ -28,7 +28,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
return self._base_forest_estimator.score(X, y) return self._base_forest_estimator.score(X, y)
def _base_estimator_predictions(self, X): def _base_estimator_predictions(self, X):
return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T base_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
return base_predictions
@property @property
def forest(self): def forest(self):
......
...@@ -25,6 +25,11 @@ class OmpForestBinaryClassifier(SingleOmpForest): ...@@ -25,6 +25,11 @@ class OmpForestBinaryClassifier(SingleOmpForest):
return super().fit(X_forest, y_forest, X_omp, y_omp) return super().fit(X_forest, y_forest, X_omp, y_omp)
def _base_estimator_predictions(self, X):
    """
    Per-tree predictions of the base forest, rescaled with (p - 0.5) * 2.

    The rescaling sends 0 to -1 and 1 to +1.
    NOTE(review): presumably the parent returns labels in {0, 1} -- confirm.

    :param X: samples forwarded to the parent implementation
    :return: the parent's predictions after rescaling
    """
    predictions_0_1 = super()._base_estimator_predictions(X)
    return (predictions_0_1 - 0.5) * 2
def predict_no_weights(self, X): def predict_no_weights(self, X):
""" """
Apply the SingleOmpForest to X without using the weights. Apply the SingleOmpForest to X without using the weights.
...@@ -35,21 +40,18 @@ class OmpForestBinaryClassifier(SingleOmpForest): ...@@ -35,21 +40,18 @@ class OmpForestBinaryClassifier(SingleOmpForest):
:return: a np.array of the predictions of the entire forest :return: a np.array of the predictions of the entire forest
""" """
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]) forest_predictions = self._base_estimator_predictions(X)
if self._models_parameters.normalize_D: if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
weights = self._omp.coef_ weights = self._omp.coef_
omp_trees_predictions = forest_predictions[weights != 0].T[1] omp_trees_predictions = forest_predictions[:, weights != 0]
# Here forest_pred is the probability of being class 1. # Here forest_pred is the probability of being class 1.
result_omp = np.mean(omp_trees_predictions, axis=1) result_omp = np.mean(omp_trees_predictions, axis=1)
result_omp = (result_omp - 0.5) * 2
return result_omp return result_omp
......
...@@ -130,7 +130,7 @@ class Trainer(object): ...@@ -130,7 +130,7 @@ class Trainer(object):
if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]: if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
y_pred = model.predict_base_estimator(X) y_pred = model.predict_base_estimator(X)
result = self._base_regression_score_metric(y_true, y_pred) result = self._base_regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestRegressor]: elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestClassifier]:
y_pred = model.predict_base_estimator(X) y_pred = model.predict_base_estimator(X)
result = self._base_classification_score_metric(y_true, y_pred) result = self._base_classification_score_metric(y_true, y_pred)
elif type(model) == RandomForestClassifier: elif type(model) == RandomForestClassifier:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment