diff --git a/code/bolsonaro/models/kmeans_forest_regressor.py b/code/bolsonaro/models/kmeans_forest_regressor.py index 4f15372186898ba66792d39ac5c2f8b810e682f9..ba1d9b4e56d98fbb2607d12917eb9d75b013044d 100644 --- a/code/bolsonaro/models/kmeans_forest_regressor.py +++ b/code/bolsonaro/models/kmeans_forest_regressor.py @@ -3,7 +3,7 @@ import time from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression from bolsonaro.utils import tqdm_joblib -from sklearn.ensemble import RandomForestRegressor +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.metrics import mean_squared_error from sklearn.base import BaseEstimator from sklearn.cluster import KMeans @@ -20,10 +20,22 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): """ def __init__(self, models_parameters): self._models_parameters = models_parameters - self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters, - random_state=self._models_parameters.seed, n_jobs=2) self._extracted_forest_size = self._models_parameters.extracted_forest_size self._selected_trees = list() + self._base_estimator = self.init_estimator(models_parameters) + + @staticmethod + @abstractmethod + def init_estimator(model_parameters): + pass + + def _base_estimator_predictions(self, X): + base_predictions = np.array([tree.predict(X) for tree in self._base_estimator.estimators_]).T + return base_predictions + + def _selected_tree_predictions(self, X): + base_predictions = np.array([tree.predict(X) for tree in self.selected_trees]).T + return base_predictions @property def models_parameters(self): @@ -34,13 +46,10 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): return self._selected_trees def fit(self, X_train, y_train, X_val, y_val): - self._estimator.fit(X_train, y_train) + self._base_estimator.fit(X_train, y_train) - predictions_val = np.empty((len(self._estimator.estimators_), X_val.shape[0])) - predictions = np.empty((len(self._estimator.estimators_), X_train.shape[0])) - for i_tree, tree in enumerate(self._estimator.estimators_): - predictions_val[i_tree, :] = tree.predict(X_val) - predictions[i_tree, :] = tree.predict(X_train) + predictions_val = self._base_estimator_predictions(X_val).T + predictions = self._base_estimator_predictions(X_train).T kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions) labels = np.array(kmeans.labels_) @@ -51,10 +60,9 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): index_trees_cluster = np.where(labels == cluster_idx)[0] predictions_val_cluster = predictions_val[index_trees_cluster] # get predictions of trees in cluster best_tree_index = self._get_best_tree_index(predictions_val_cluster, y_val) - lst_pruned_forest.append(self._estimator.estimators_[index_trees_cluster[best_tree_index]]) + lst_pruned_forest.append(self._base_estimator.estimators_[index_trees_cluster[best_tree_index]]) self._selected_trees = lst_pruned_forest - # self._estimator.estimators_ = lst_pruned_forest def score(self, X, y): final_predictions = self.predict(X) @@ -62,14 +70,12 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): return score def predict(self, X): - predictions = np.empty((len(self._selected_trees), X.shape[0])) - for idx_tree, tree in enumerate(self._selected_trees): - predictions[idx_tree, :] = tree.predict(X) + predictions = self._selected_tree_predictions(X).T final_predictions = self._aggregate(predictions) return final_predictions def predict_base_estimator(self, X): - return self._estimator.predict(X) + return self._base_estimator.predict(X) def _get_best_tree_index(self, y_preds, y_true): score = self._score_metric(y_preds, y_true) @@ -114,6 +120,12 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta): class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta): + + @staticmethod + def init_estimator(model_parameters): + return RandomForestRegressor(**model_parameters.hyperparameters, + random_state=model_parameters.seed, n_jobs=2) + def _aggregate(self, predictions): return aggregation_regression(predictions) @@ -127,12 +139,27 @@ class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta): class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta): + @staticmethod + def init_estimator(model_parameters): + return RandomForestClassifier(**model_parameters.hyperparameters, + random_state=model_parameters.seed, n_jobs=2) + def _aggregate(self, predictions): return aggregation_classification(predictions) def _score_metric(self, y_preds, y_true): return score_metric_indicator(y_preds, y_true) + def _selected_tree_predictions(self, X): + predictions_0_1 = super()._selected_tree_predictions(X) + predictions = (predictions_0_1 - 0.5) * 2 + return predictions + + def _base_estimator_predictions(self, X): + predictions_0_1 = super()._base_estimator_predictions(X) + predictions = (predictions_0_1 - 0.5) * 2 + return predictions + @staticmethod def _best(array): return np.argmax(array) diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py index d539f45314a244c410453bb84f726502c6ffe082..90a6fc30edac540b23cf907a3fbd440ec60a86cc 100644 --- a/code/bolsonaro/models/omp_forest.py +++ b/code/bolsonaro/models/omp_forest.py @@ -28,7 +28,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): return self._base_forest_estimator.score(X, y) def _base_estimator_predictions(self, X): - return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T + base_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T + return base_predictions @property def forest(self): diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py index 7a22337b2fcf48b5181e4971836470e17d0f4f62..4255eeb129aa04f797c1fd690809f5163b66c5c2 100644 --- a/code/bolsonaro/models/omp_forest_classifier.py +++ b/code/bolsonaro/models/omp_forest_classifier.py @@ -25,6 +25,11 @@ class OmpForestBinaryClassifier(SingleOmpForest): return super().fit(X_forest, y_forest, X_omp, y_omp) + def _base_estimator_predictions(self, X): + predictions_0_1 = super()._base_estimator_predictions(X) + predictions = (predictions_0_1 - 0.5) * 2 + return predictions + def predict_no_weights(self, X): """ Apply the SingleOmpForest to X without using the weights. @@ -35,21 +40,18 @@ class OmpForestBinaryClassifier(SingleOmpForest): :return: a np.array of the predictions of the entire forest """ - forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]) + forest_predictions = self._base_estimator_predictions(X) if self._models_parameters.normalize_D: - forest_predictions = forest_predictions.T forest_predictions /= self._forest_norms - forest_predictions = forest_predictions.T weights = self._omp.coef_ - omp_trees_predictions = forest_predictions[weights != 0].T[1] + omp_trees_predictions = forest_predictions[:, weights != 0] # Here forest_pred is the probability of being class 1. result_omp = np.mean(omp_trees_predictions, axis=1) - result_omp = (result_omp - 0.5) * 2 return result_omp diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 1adb387ca5cf639b8bea16b72d27b46ac190fb14..8d82b3d6872655e6406944f8bb4e7f1be058f20f 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -130,7 +130,7 @@ class Trainer(object): if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]: y_pred = model.predict_base_estimator(X) result = self._base_regression_score_metric(y_true, y_pred) - elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestRegressor]: + elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestClassifier]: y_pred = model.predict_base_estimator(X) result = self._base_classification_score_metric(y_true, y_pred) elif type(model) == RandomForestClassifier: