# Project-local imports
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory

# Standard library imports
from abc import abstractmethod, ABCMeta

# Third-party imports
import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator


class OmpForest(BaseEstimator, metaclass=ABCMeta):
    """
    Abstract base class for a forest whose trees are weighted with OMP.

    A base forest estimator is fitted first. The predictions of its individual
    trees then form the columns of a matrix which is handed, together with the
    targets, to fit_omp().

    Subclasses must implement fit_omp(), predict() and score().
    """

    def __init__(self, models_parameters, base_forest_estimator):
        """
        :param models_parameters: parameter object; this class reads its
            normalize_D and extracted_forest_size attributes
        :param base_forest_estimator: forest object; this class calls its fit,
            predict and score methods and reads its estimators_ attribute
        """
        self._base_forest_estimator = base_forest_estimator
        self._models_parameters = models_parameters
        self._logger = LoggerFactory.create(LOG_PATH, __name__)

    @property
    def models_parameters(self):
        """The parameter object given at construction."""
        return self._models_parameters

    def predict_base_estimator(self, X):
        """
        Predict with the whole base forest, without any OMP weighting.

        :param X: data passed unchanged to the base forest's predict
        :return: whatever the base forest's predict returns
        """
        return self._base_forest_estimator.predict(X)

    def score_base_estimator(self, X, y):
        """
        Score the whole base forest, without any OMP weighting.

        :param X: data passed unchanged to the base forest's score
        :param y: targets passed unchanged to the base forest's score
        :return: whatever the base forest's score returns
        """
        return self._base_forest_estimator.score(X, y)

    def _base_estimator_predictions(self, X):
        """
        Collect the prediction of every tree of the base forest on X.

        :param X: data passed unchanged to each tree's predict
        :return: array with one column per tree (the stacked per-tree
            predictions, transposed)
        """
        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T

    @property
    def forest(self):
        """The fitted trees of the base forest (its estimators_ attribute)."""
        return self._base_forest_estimator.estimators_

    # sklearn baseestimator api methods
    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """
        Fit the base forest, then fit OMP on the predictions of its trees.

        :param X_forest: data used to fit the base forest
        :param y_forest: targets used to fit the base forest
        :param X_omp: data on which every tree is queried to build the OMP input
        :param y_omp: targets used to fit OMP
        :return: self
        """
        self._base_forest_estimator.fit(X_forest, y_forest)
        self._extract_subforest(X_omp, y_omp)
        return self

    def _extract_subforest(self, X, y):
        """
        Given an already estimated regressor: apply OMP to get the weight of each tree.

        The X data is used for interrogation of every tree in the forest. The y data
        is used for finding the weights in OMP.

        When models_parameters.normalize_D is set, every column of the tree
        prediction matrix is divided by its norm and the norms are stored in
        self._forest_norms.

        :param X: (n_sample, n_features) array
        :param y: (n_sample,) array
        :return: None
        """
        self._logger.debug("Forest make prediction on X")
        D = self._base_estimator_predictions(X)

        if self._models_parameters.normalize_D:
            # question: maybe consider other kinds of normalization.. centering?
            self._logger.debug("Compute norm of predicted vectors on X")
            forest_norms = np.linalg.norm(D, axis=0)
            # A tree that predicts only zeros on X has a zero norm; dividing by
            # it would turn its column into NaN. Dividing by 1 instead leaves
            # the column all-zero.
            forest_norms[forest_norms == 0] = 1.0
            self._forest_norms = forest_norms
            # Out-of-place division so that integer predictions do not make
            # the division fail.
            D = D / forest_norms

        self._logger.debug("Apply orthogonal maching pursuit on forest for {} extracted trees."
                           .format(self._models_parameters.extracted_forest_size))

        self.fit_omp(D, y)

    @staticmethod
    def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
        """
        Combine per-tree predictions with the coefficients of a fitted OMP object.

        :param base_predictions: array with one column per tree
        :param omp_obj: fitted object exposing coef_, intercept_ and predict
        :param normalize_weights: when False, omp_obj.predict is used directly.
            When True, the sign of each coefficient is moved onto the matching
            column of base_predictions and the absolute coefficients are used
            as weights. NOTE: the weights are not divided by their sum here, so
            this branch computes the same linear combination plus intercept.
        :return: the weighted predictions
        """
        if normalize_weights:
            # we can normalize weights (by their sum) so that they sum to 1
            # and they can be interpreted as impact percentages for interpretability.
            # this requires removing the (-) in weights, e.g. moving it to the
            # predictions (use unsigned_coef) --> I don't see why

            # question: I don't understand the nonzero part in the expression below?
            # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))

            # add an axis to make sure the signs are broadcast line-wise (there
            # might be a confusion when base_predictions is square)
            coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]
            unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
            intercept = omp_obj.intercept_

            adjusted_forest_predictions = base_predictions * coef_signs
            predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept

        else:
            predictions = omp_obj.predict(base_predictions)

        return predictions

    @abstractmethod
    def fit_omp(self, atoms, objective):
        """
        Fit the OMP model(s).

        :param atoms: tree prediction matrix built by _extract_subforest
        :param objective: targets given to _extract_subforest
        """
        pass

    @abstractmethod
    def predict(self, X):
        """Predict for X with the OMP-weighted forest."""
        pass

    @abstractmethod
    def score(self, X, y):
        """Score the OMP-weighted forest on (X, y)."""
        pass

class SingleOmpForest(OmpForest):
    """
    OmpForest that fits a single OrthogonalMatchingPursuit model over all trees.

    This class does not implement score(); it is left to subclasses.
    """

    def __init__(self, models_parameters, base_forest_estimator):
        """
        :param models_parameters: parameter object; extracted_forest_size is
            used as the number of non-zero OMP coefficients
        :param base_forest_estimator: forest object forwarded to OmpForest
        """
        # fit_intercept shouldn't be set to False as the data isn't necessarily centered here
        # normalization is handled outside OMP
        # NOTE(review): the `normalize` keyword appears to have been deprecated
        # in later scikit-learn releases -- confirm against the pinned version.
        self._omp = OrthogonalMatchingPursuit(
            n_nonzero_coefs=models_parameters.extracted_forest_size,
            fit_intercept=True, normalize=False)

        super().__init__(models_parameters, base_forest_estimator)

    def fit_omp(self, atoms, objective):
        """
        Fit the single OMP model.

        :param atoms: tree prediction matrix, one column per tree
        :param objective: targets
        """
        self._omp.fit(atoms, objective)

    def predict(self, X):
        """
        Apply the SingleOmpForest to X.

        Make all the base tree predictions then apply the OMP weights for pruning.

        :param X: data passed to every tree of the base forest
        :return: the OMP-weighted predictions
        """
        forest_predictions = self._base_estimator_predictions(X)

        if self._models_parameters.normalize_D:
            # forest_predictions has one column per tree and _forest_norms one
            # entry per tree, so the division must broadcast along the last
            # axis. Transposing first (as was done before) misaligns the two.
            forest_predictions = forest_predictions / self._forest_norms

        return self._make_omp_weighted_prediction(
            forest_predictions, self._omp, self._models_parameters.normalize_weights)

    def predict_no_weights(self, X):
        """
        Apply the SingleOmpForest to X without using the weights.

        Make all the base tree predictions, keep the trees whose OMP
        coefficient is non-zero and average their predictions.

        :param X: data passed to every tree of the base forest
        :return: a np.array with the mean prediction of the selected trees
        """
        forest_predictions = self._base_estimator_predictions(X)

        if self._models_parameters.normalize_D:
            forest_predictions = forest_predictions / self._forest_norms

        # Indices of the trees that OMP kept (non-zero coefficient).
        omp_trees_indices = np.nonzero(self._omp.coef_)[0]

        # One column per tree: select the kept columns, average across them.
        return np.mean(forest_predictions[:, omp_trees_indices], axis=1)