Resolve "integration-sota"

Merged: Charly Lamothe requested to merge 15-integration-sota into master.
5 files changed, +148 −74
 import time
+from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression
 from bolsonaro.utils import tqdm_joblib
 from sklearn.ensemble import RandomForestRegressor
@@ -53,72 +54,85 @@ class KmeansForest(BaseEstimator, metaclass=ABCMeta):
             lst_pruned_forest.append(self._estimator.estimators_[index_trees_cluster[best_tree_index]])
 
         self._selected_trees = lst_pruned_forest
-        self._estimator.estimators_ = lst_pruned_forest
+        # self._estimator.estimators_ = lst_pruned_forest
 
     def score(self, X, y):
-        predictions = np.empty((len(self._estimator.estimators_), X.shape[0]))
-        for idx_tree, tree in enumerate(self._estimator.estimators_):
-            predictions[idx_tree, :] = tree.predict(X)
-        final_predictions = self._aggregate(predictions)
+        final_predictions = self.predict(X)
         score = self._score_metric(final_predictions, y)[0]
         return score
 
     def predict(self, X):
-        return self._estimator.predict(X)
+        predictions = np.empty((len(self._selected_trees), X.shape[0]))
+        for idx_tree, tree in enumerate(self._selected_trees):
+            predictions[idx_tree, :] = tree.predict(X)
+        final_predictions = self._aggregate(predictions)
+        return final_predictions
+
+    def predict_base_estimator(self, X):
+        return self._estimator.predict(X)
+
+    def _get_best_tree_index(self, y_preds, y_true):
+        score = self._score_metric(y_preds, y_true)
+        best_tree_index = self._best(score)  # best scoring tree (lowest MSE or highest accuracy, depending on the subclass)
+        return best_tree_index
     @abstractmethod
     def _score_metric(self, y_preds, y_true):
         """
         Return the score of each predictor in y_preds.
         y_preds.shape == (nb_trees, nb_samples)
         y_true.shape == (1, nb_samples)
         :param y_preds:
         :param y_true:
         :return:
         """
         pass
 
+    @staticmethod
     @abstractmethod
-    def _get_best_tree_index(self, y_preds, y_true):
+    def _best(array):
+        """
+        Return the index of the best element in array.
+        :param array:
+        :return:
+        """
         pass
 
     @abstractmethod
     def _aggregate(self, predictions):
         """
         Aggregate the votes of the predictors in predictions.
         predictions shape: (nb_trees, nb_samples)
         :param predictions:
         :return:
         """
         pass
 
 class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):
 
     def _aggregate(self, predictions):
-        return np.mean(predictions, axis=0)
+        return aggregation_regression(predictions)
 
     def _score_metric(self, y_preds, y_true):
         if len(y_true.shape) == 1:
             y_true = y_true[np.newaxis, :]
         if len(y_preds.shape) == 1:
             y_preds = y_preds[np.newaxis, :]
         assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        diff = y_preds - y_true
-        squared_diff = diff ** 2
-        mean_squared_diff = np.mean(squared_diff, axis=1)
-        return mean_squared_diff
+        return score_metric_mse(y_preds, y_true)
 
-    def _get_best_tree_index(self, y_preds, y_true):
-        score = self._score_metric(y_preds, y_true)
-        best_tree_index = np.argmin(score)  # get best scoring tree (the one with lowest mse)
-        return best_tree_index
+    @staticmethod
+    def _best(array):
+        return np.argmin(array)
 
 class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):
 
     def _aggregate(self, predictions):
-        return np.sign(np.sum(predictions, axis=0))
+        return aggregation_classification(predictions)
 
     def _score_metric(self, y_preds, y_true):
         if len(y_true.shape) == 1:
             y_true = y_true[np.newaxis, :]
         if len(y_preds.shape) == 1:
             y_preds = y_preds[np.newaxis, :]
         assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"
-        bool_arr_correct_predictions = y_preds == y_true
-        return np.average(bool_arr_correct_predictions, axis=1)
+        return score_metric_indicator(y_preds, y_true)
 
-    def _get_best_tree_index(self, y_preds, y_true):
-        score = self._score_metric(y_preds, y_true)
-        best_tree_index = np.argmax(score)  # get best scoring tree (the one with highest accuracy)
-        return best_tree_index
\ No newline at end of file
+    @staticmethod
+    def _best(array):
+        return np.argmax(array)
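
For context, a standalone sketch of the selection-and-aggregation scheme these abstract methods encode, in the regression setting. It is illustrative only: clustering the trees by their prediction vectors and every name below are assumptions for the sketch (the fitting/clustering code is not part of this hunk, and this is not the bolsonaro API).

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
forest = RandomForestRegressor(n_estimators=30, random_state=0).fit(X, y)

# predictions shape: (nb_trees, nb_samples), matching the _aggregate docstring
predictions = np.array([tree.predict(X) for tree in forest.estimators_])

# Regression _score_metric: one MSE per tree (lower is better)
per_tree_mse = np.mean((predictions - y[np.newaxis, :]) ** 2, axis=1)

# Group the trees; k-means on their prediction vectors is an assumption here
n_clusters = 5
labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(predictions)

# Per cluster, keep the best tree: _best is argmin for MSE
# (the classifier variant uses argmax over per-tree accuracy)
selected_trees = []
for c in range(n_clusters):
    index_trees_cluster = np.flatnonzero(labels == c)
    best_tree_index = np.argmin(per_tree_mse[index_trees_cluster])
    selected_trees.append(forest.estimators_[index_trees_cluster[best_tree_index]])

# Regression _aggregate: average the selected trees' predictions
# (the classifier variant votes with np.sign(np.sum(...)))
pruned_predictions = np.mean([tree.predict(X) for tree in selected_trees], axis=0)
print(pruned_predictions.shape)  # (200,)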