omp_forest_classifier.py 8.29 KB
Newer Older
Charly Lamothe's avatar
Charly Lamothe committed
1
2
from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
from bolsonaro.utils import binarize_class_data
3

Charly Lamothe's avatar
Charly Lamothe committed
4
import numpy as np
5
from sklearn.ensemble import RandomForestClassifier
6
7
from sklearn.linear_model import OrthogonalMatchingPursuit

8

9
10
11
12
13
class OmpForestBinaryClassifier(SingleOmpForest):
    """OMP-pruned random forest for binary classification with labels in {-1, +1}."""

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        """
        :param models_parameters: object carrying `hyperparameters` (dict forwarded to
            RandomForestClassifier) and `seed` (forest random_state)
        """
        # n_jobs=-1: use all cores when fitting/predicting with the base forest.
        estimator = RandomForestClassifier(**models_parameters.hyperparameters,
                                           random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)

    def _check_classes(self, y):
        # OMP regresses on signed labels, so only -1 and +1 are valid classes.
        assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier must be {-1, +1}"

    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """
        Fit the base forest on (X_forest, y_forest) and the OMP weights on (X_omp, y_omp).

        Both label vectors must contain only -1 and +1.
        """
        self._check_classes(y_forest)
        self._check_classes(y_omp)

        return super().fit(X_forest, y_forest, X_omp, y_omp)

    def predict_no_weights(self, X):
        """
        Predict on X using only the trees selected by OMP, ignoring the OMP weights.

        Each selected tree votes with its predicted probability of class +1; the
        mean probability is then rescaled from [0, 1] to [-1, 1].

        :param X: samples to predict, shape (n_samples, n_features)
        :return: np.array of scores in [-1, 1], shape (n_samples,)
        """
        # (n_trees, n_samples, 2): per-tree class-probability estimates.
        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_])

        if self._models_parameters.normalize_D:
            # After the transpose the tree axis is last, so the per-tree norms
            # broadcast correctly; transpose back afterwards.
            forest_predictions = forest_predictions.T
            forest_predictions /= self._forest_norms
            forest_predictions = forest_predictions.T

        # Trees with a non-zero OMP coefficient form the selected sub-forest.
        # np.nonzero returns a tuple of index arrays; take the first (1-D coef_).
        weights = self._omp.coef_
        omp_trees_indices = np.nonzero(weights)[0]

        # (n_samples, n_selected): probability of class +1 for each selected tree.
        omp_trees_predictions = forest_predictions[omp_trees_indices].T[1]

        # Mean probability of being class +1, mapped from [0, 1] to [-1, 1].
        result_omp = np.mean(omp_trees_predictions, axis=1)
        result_omp = (result_omp - 0.5) * 2

        return result_omp

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate OMPForestClassifer on (`X`, `y`) using `metric`.

        :param X: samples
        :param y: signed labels in {-1, +1}
        :param metric: might be "indicator" (accuracy of the signed predictions)
        :return: accuracy in [0, 1]
        """
        predictions = self.predict(X)

        if metric == 'indicator':
            # Accuracy of the signed predictions. The previous formula,
            # |mean(|sign(p) - y| - 1)| = |1 - 2*accuracy|, returned 1 for both a
            # perfect and a perfectly-wrong classifier; plain accuracy is also
            # consistent with OmpForestMulticlassClassifier.score.
            evaluation = np.mean(np.sign(predictions) == y)
        else:
            raise ValueError("Unsupported metric '{}'.".format(metric))

        return evaluation


76
class OmpForestMulticlassClassifier(OmpForest):
    """OMP-pruned random forest for multiclass classification: one OMP per class (one-vs-all)."""

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        """
        :param models_parameters: object carrying `hyperparameters` (dict forwarded to
            RandomForestClassifier) and `seed` (forest random_state)
        """
        estimator = RandomForestClassifier(**models_parameters.hyperparameters,
                                           random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)
        # One OMP per class label, filled by fit_omp (one-vs-all scheme).
        # question: maybe initialize the OMPs in __init__, as in SingleOmpForest?
        self._dct_class_omp = {}

    def fit_omp(self, atoms, objective):
        """
        Fit one OrthogonalMatchingPursuit per class on one-vs-all binarized data.

        :param atoms: tree-prediction matrix used as the OMP dictionary
        :param objective: class labels
        :return: dict mapping class label -> fitted OrthogonalMatchingPursuit
        """
        assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__)
        possible_classes = sorted(set(objective))
        for class_label in possible_classes:
            atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
            objective_binary = binarize_class_data(objective, class_label, inplace=False)
            # TODO: maybe consider the forest size as global, so that only a fraction
            # of it is available to each per-class OMP...
            omp_class = OrthogonalMatchingPursuit(
                n_nonzero_coefs=self.models_parameters.extracted_forest_size,
                fit_intercept=True, normalize=False)
            omp_class.fit(atoms_binary, objective_binary)
            self._dct_class_omp[class_label] = omp_class
        return self._dct_class_omp

    def predict(self, X):
        """
        Predict class labels for X using the per-class OMP-weighted tree votes.

        For each class, the trees' probabilities of that class are rescaled from
        [0, 1] to [-1, 1] and combined with the class's OMP weights; the class
        with the highest combined score wins.

        :param X: samples to predict, shape (n_samples, n_features)
        :return: np.array of predicted class labels, shape (n_samples,)
        """
        # (n_classes, n_samples, n_trees): transposed per-tree probability estimates.
        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T

        if self._models_parameters.normalize_D:
            # The tree axis is already last here, so the per-tree norms broadcast
            # directly. (The previous transpose-divide-transpose moved the CLASS
            # axis last, so the norms only broadcast when n_classes == n_trees.)
            forest_predictions /= self._forest_norms

        # NOTE(review): this relies on predict_proba's column order (sorted class
        # labels) matching the insertion order of _dct_class_omp, which fit_omp
        # also builds in sorted order — confirm if label types ever change.
        label_names = []
        preds = []
        for class_index, (class_label, omp_class) in enumerate(self._dct_class_omp.items()):
            label_names.append(class_label)
            # Rescale probabilities from [0, 1] to [-1, 1] (centered, as when fitting).
            atoms_binary = (forest_predictions[class_index] - 0.5) * 2
            preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))

        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def predict_no_weights(self, X):
        """
        Predict class labels for X using only the trees selected by each per-class
        OMP, ignoring the OMP weights.

        :param X: samples to predict, shape (n_samples, n_features)
        :return: np.array of predicted class labels, shape (n_samples,)
        """
        # (n_classes, n_samples, n_trees): transposed per-tree probability estimates.
        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T

        if self._models_parameters.normalize_D:
            # Tree axis is last: per-tree norms broadcast directly.
            forest_predictions /= self._forest_norms

        label_names = []
        preds = []
        for class_index, (class_label, omp_class) in enumerate(self._dct_class_omp.items()):
            # Trees selected by this class's OMP (non-zero coefficients).
            # np.nonzero returns a tuple of index arrays; take the first (1-D coef_).
            omp_trees_indices = np.nonzero(omp_class.coef_)[0]
            label_names.append(class_label)
            # (n_trees, n_samples), rescaled from [0, 1] to [-1, 1].
            atoms_binary = (forest_predictions[class_index].T - 0.5) * 2
            # Mean vote of the selected trees. The previous code divided by
            # len(np.nonzero(coef_)) — the length of the returned TUPLE (always 1
            # for a 1-D coef_), so it computed a sum instead of a mean and skewed
            # the argmax whenever classes selected different numbers of trees.
            preds.append(np.mean(atoms_binary[omp_trees_indices], axis=0))

        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate the classifier on (`X`, `y`) using `metric`.

        :param X: samples
        :param y: class labels
        :param metric: might be "indicator" (accuracy)
        :return: accuracy in [0, 1]
        """
        predictions = self.predict(X)

        if metric == 'indicator':
            # Fraction of correct predictions. Equivalent to the previous
            # sum(ones[pred == y]) / n but robust to non-numeric label dtypes.
            evaluation = np.mean(predictions == y)
        else:
            raise ValueError("Unsupported metric '{}'.".format(metric))

        return evaluation

    @staticmethod
    def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
        """
        Combine tree predictions with a fitted OMP's coefficients.

        :param base_predictions: tree predictions, shape (n_samples, n_trees)
        :param omp_obj: fitted OrthogonalMatchingPursuit
        :param normalize_weights: if True, fold the coefficient signs into the
            predictions so the remaining weights are non-negative
        :return: np.array of combined scores, shape (n_samples,)
        """
        if normalize_weights:
            # we can normalize weights (by their sum) so that they sum to 1
            # and they can be interpreted as impact percentages for interpretability.
            # this requires removing the (-) sign from the weights, e.g. moving it
            # into the predictions (use unsigned_coef) --> I don't see why

            # question: I don't understand the thing with nonzero?
            # predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
            coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]  # add axis to make sure it will be broadcasted line-wise (there might be a confusion when forest_prediction is square)
            unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
            intercept = omp_obj.intercept_

            adjusted_forest_predictions = base_predictions * coef_signs
            predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept

        else:
            predictions = omp_obj.predict(base_predictions)

        return predictions

202

203
204
205
206
207
if __name__ == "__main__":
    # Quick manual smoke test: train a tiny forest on random features with
    # labels drawn from {-1, +1}, then print its predictions on fresh inputs.
    train_features = np.random.rand(10, 5)
    train_labels = np.random.choice([-1, +1], 10)

    forest = RandomForestClassifier(n_estimators=10)
    forest.fit(train_features, train_labels)

    print(forest.predict(np.random.rand(10, 5)))