omp_forest_classifier.py 8.02 KB
Newer Older
Charly Lamothe's avatar
Charly Lamothe committed
1
2
from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
from bolsonaro.utils import binarize_class_data
3

Charly Lamothe's avatar
Charly Lamothe committed
4
import numpy as np
5
from sklearn.ensemble import RandomForestClassifier
6
7
from sklearn.linear_model import OrthogonalMatchingPursuit

8

9
10
11
12
13
class OmpForestBinaryClassifier(SingleOmpForest):
    """OMP-pruned random forest for binary classification.

    Labels must be in {-1, +1}: the OMP weights are fitted against signed
    targets, so the sign of the aggregated prediction encodes the class.
    """

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        """
        :param models_parameters: object exposing `hyperparameters` (dict of
            RandomForestClassifier kwargs) and `seed` (random state).
        """
        estimator = RandomForestClassifier(**models_parameters.hyperparameters,
                                           random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)

    def _check_classes(self, y):
        # Reject any label outside {-1, +1}; the signed-prediction logic
        # below (sign(prediction) == class) relies on it.
        assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier must be {-1, +1}"

    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """Fit the base forest on (X_forest, y_forest), then the OMP weights
        on (X_omp, y_omp). Both label vectors must use {-1, +1}."""
        self._check_classes(y_forest)
        self._check_classes(y_omp)

        return super().fit(X_forest, y_forest, X_omp, y_omp)

    def predict_no_weights(self, X):
        """
        Predict with the OMP-selected trees but WITHOUT the OMP weights:
        a plain (unweighted) average of the selected trees' class-1
        probabilities, rescaled to [-1, 1].

        :param X: input samples, shape (n_samples, n_features)
        :return: np.array of signed scores in [-1, 1], one per sample
        """
        # Shape (n_trees, n_samples, 2): each tree's class-probability matrix.
        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_])

        if self._models_parameters.normalize_D:
            # NOTE(review): forest_predictions is 3-D here while _forest_norms
            # presumably holds one norm per tree — confirm broadcasting is intended.
            forest_predictions /= self._forest_norms

        weights = self._omp.coef_
        # Keep only trees with a non-zero OMP coefficient, then extract the
        # probability of class +1: result has shape (n_samples, n_selected_trees).
        omp_trees_predictions = forest_predictions[weights != 0].T[1]

        # Average the selected trees' class-1 probabilities per sample.
        result_omp = np.mean(omp_trees_predictions, axis=1)

        # Rescale from [0, 1] to [-1, 1] so the sign encodes the class.
        result_omp = (result_omp - 0.5) * 2

        return result_omp

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate the classifier on (`X`, `y`) using `metric`.

        :param X: input samples
        :param y: true labels in {-1, +1}
        :param metric: only "indicator" (0/1 accuracy) is supported
        :return: accuracy in [0, 1]
        :raises ValueError: on an unsupported metric
        """
        predictions = self.predict(X)

        if metric == 'indicator':
            # BUG FIX: the previous formula, |mean(|sign(p) - y| - 1)|,
            # equals |1 - 2*accuracy| and therefore scored an all-wrong
            # classifier the same as an all-correct one. Use plain accuracy.
            evaluation = np.mean(np.sign(predictions) == y)
        else:
            raise ValueError("Unsupported metric '{}'.".format(metric))

        return evaluation
class OmpForestMulticlassClassifier(OmpForest):
    """One-vs-all OMP-pruned random forest for multiclass classification.

    One OrthogonalMatchingPursuit model is fitted per class label against
    binarized (+1 for the class, -1 otherwise) targets; prediction takes the
    argmax over the per-class scores.
    """

    DEFAULT_SCORE_METRIC = 'indicator'

    def __init__(self, models_parameters):
        """
        :param models_parameters: object exposing `hyperparameters` (dict of
            RandomForestClassifier kwargs) and `seed` (random state).
        """
        estimator = RandomForestClassifier(**models_parameters.hyperparameters,
                                           random_state=models_parameters.seed, n_jobs=-1)
        super().__init__(models_parameters, estimator)
        # One fitted OMP model per class label, populated by fit_omp.
        # NOTE(review): could perhaps be initialised in __init__, as in SingleOmpForest.
        self._dct_class_omp = {}

    def fit_omp(self, atoms, objective):
        """Fit one binary OMP per class label found in `objective`.

        :param atoms: tree-prediction matrix used as the OMP dictionary
        :param objective: multiclass label vector
        :return: dict mapping class label -> fitted OrthogonalMatchingPursuit
        """
        assert len(self._dct_class_omp) == 0, "fit_omp can be called only once on {}".format(self.__class__.__name__)
        possible_classes = sorted(set(objective))
        for class_label in possible_classes:
            # Binarize data as +1 for `class_label` and -1 for every other class.
            atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
            objective_binary = binarize_class_data(objective, class_label, inplace=False)
            # TODO: maybe treat the forest size as a global budget so that only
            # a fraction of it is available to each per-class OMP.
            omp_class = OrthogonalMatchingPursuit(
                n_nonzero_coefs=self.models_parameters.extracted_forest_size,
                fit_intercept=True, normalize=False)
            omp_class.fit(atoms_binary, objective_binary)
            self._dct_class_omp[class_label] = omp_class
        return self._dct_class_omp

    def predict(self, X):
        """Predict class labels for `X` using the per-class OMP weights.

        :param X: input samples, shape (n_samples, n_features)
        :return: np.array of predicted class labels, one per sample
        """
        # predict_proba returns (n_samples, n_classes) per tree, so the
        # transposed stack has shape (n_classes, n_samples, n_trees).
        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T

        if self._models_parameters.normalize_D:
            forest_predictions /= self._forest_norms

        label_names = []
        preds = []
        # Relies on dict insertion order matching the class-probability column
        # order; both come from sorted class labels.
        for num_class, (class_label, omp_class) in enumerate(self._dct_class_omp.items()):
            label_names.append(class_label)
            # Rescale probabilities from [0, 1] to [-1, 1].
            atoms_binary = (forest_predictions[num_class] - 0.5) * 2
            preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))

        # (n_samples, n_classes): pick the best-scoring class per sample.
        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def predict_no_weights(self, X):
        """
        Predict class labels for `X` with the OMP-selected trees but WITHOUT
        the OMP weights: a plain average of the selected trees' scores per class.

        :param X: input samples, shape (n_samples, n_features)
        :return: np.array of predicted class labels, one per sample
        """
        # (n_classes, n_samples, n_trees) — see predict().
        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T

        if self._models_parameters.normalize_D:
            forest_predictions /= self._forest_norms

        label_names = []
        preds = []
        for num_class, (class_label, omp_class) in enumerate(self._dct_class_omp.items()):
            # BUG FIX: np.nonzero returns a TUPLE of index arrays, so
            # len(np.nonzero(w)) was always 1 (the number of dimensions) and
            # the "average" below was actually a sum. Take the index array and
            # use np.mean for the intended unweighted average.
            omp_trees_indices = np.nonzero(omp_class.coef_)[0]
            label_names.append(class_label)
            # (n_trees, n_samples), rescaled from [0, 1] to [-1, 1].
            atoms_binary = (forest_predictions[num_class].T - 0.5) * 2
            preds.append(np.mean(atoms_binary[omp_trees_indices], axis=0))

        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """Evaluate on (`X`, `y`); only the "indicator" (accuracy) metric is supported.

        :raises ValueError: on an unsupported metric
        """
        predictions = self.predict(X)

        if metric == 'indicator':
            # Fraction of exactly-matching predictions (0/1 accuracy).
            evaluation = np.sum(predictions == y) / X.shape[0]
        else:
            raise ValueError("Unsupported metric '{}'.".format(metric))

        return evaluation

    @staticmethod
    def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
        """Combine tree predictions with the OMP coefficients.

        :param base_predictions: (n_samples, n_trees) tree-score matrix
        :param omp_obj: a fitted OrthogonalMatchingPursuit
        :param normalize_weights: if True, move the coefficients' signs onto
            the predictions so the weights can be read as positive importances
        :return: np.array of per-sample scores
        """
        if normalize_weights:
            # We can normalize the weights (by their sum) so that they sum to 1
            # and can be interpreted as impact percentages for interpretability.
            # This requires removing the sign from the weights, i.e. moving it
            # onto the predictions (use unsigned_coef).
            # NOTE(review): original comment questioned the nonzero-based variant:
            # predictions = omp.predict(p) * (1 / (sum(coef_) / len(nonzero(coef_))))
            # Add an axis so the signs broadcast row-wise (avoids ambiguity when
            # base_predictions happens to be square).
            coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]
            unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
            intercept = omp_obj.intercept_

            adjusted_forest_predictions = base_predictions * coef_signs
            predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept

        else:
            predictions = omp_obj.predict(base_predictions)

        return predictions
if __name__ == "__main__":
    # Smoke test: train a tiny forest on random data and print its predictions.
    forest = RandomForestClassifier(n_estimators=10)
    train_features = np.random.rand(10, 5)
    train_labels = np.random.choice([-1, +1], 10)
    forest.fit(train_features, train_labels)
    print(forest.predict(np.random.rand(10, 5)))