from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
from . import LOG_PATH

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import time
import datetime
import numpy as np


class Trainer(object):
    """
    Fit a model on a prepared dataset, then score it and save the results.

    `train` relies on the subsets selected by `init`, and `compute_results` relies on the
    timings recorded by `train`, so the three methods are meant to be called in that order.
    """

    # Dispatch is done on the exact type of the model (`type(model) in ...`), not with
    # `isinstance`, so subclasses of these classes are deliberately not matched.
    _PLAIN_FOREST_TYPES = (RandomForestRegressor, RandomForestClassifier)
    _REGRESSOR_TYPES = (OmpForestRegressor, RandomForestRegressor)
    _CLASSIFIER_TYPES = (OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier)
    _SELF_SCORING_TYPES = (SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor)
    _BASE_ESTIMATOR_REGRESSOR_TYPES = (OmpForestRegressor,) + _SELF_SCORING_TYPES
    _BASE_ESTIMATOR_CLASSIFIER_TYPES = (OmpForestBinaryClassifier, OmpForestMulticlassClassifier)

    def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score,
        base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score):
        """
        :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test, y_test and task attributes
        :param regression_score_metric: Callable `(y_true, y_pred)` used to score regression models
        :param classification_score_metric: Callable `(y_true, y_pred)` used to score classification models
        :param base_regression_score_metric: Callable `(y_true, y_pred)` used for the base score of regression models
        :param base_classification_score_metric: Callable `(y_true, y_pred)` used for the base score of
            classification models
        """
        self._dataset = dataset
        self._logger = LoggerFactory.create(LOG_PATH, __name__)
        self._regression_score_metric = regression_score_metric
        self._classification_score_metric = classification_score_metric
        self._base_regression_score_metric = base_regression_score_metric
        self._base_classification_score_metric = base_classification_score_metric
        # Only the names of the metrics matching the task of the dataset are kept; they are
        # stored alongside the scores by `compute_results`.
        self._score_metric_name = regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
            else classification_score_metric.__name__
        self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
            else base_classification_score_metric.__name__

    @property
    def score_metric_name(self):
        """Name (`__name__`) of the score metric matching the task of the dataset."""
        return self._score_metric_name

    @property
    def base_score_metric_name(self):
        """Name (`__name__`) of the base score metric matching the task of the dataset."""
        return self._base_score_metric_name

    def _concatenate_train_dev(self):
        """Return the `(X, y)` pair made of the train subset followed by the dev subset."""
        X = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
        y = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
        return X, y

    def init(self, model, subsets_used='train,dev'):
        """
        Select the data the forest (and, for the other models, the OMP step) will be fitted on.

        :param model: The model that will be given to `train`.
        :param subsets_used: Only read when `model` is a RandomForestRegressor or a RandomForestClassifier:
            'train,dev' fits the forest on the train subset, any other value fits it on train+dev.
            For every other model, `model.models_parameters.subsets_used` is read instead and must be
            one of 'train,dev', 'train+dev,train+dev' or 'train,train+dev'.
        :raises ValueError: If `model.models_parameters.subsets_used` is not one of the known values.
        """
        if type(model) in self._PLAIN_FOREST_TYPES:
            if subsets_used == 'train,dev':
                self._X_forest = self._dataset.X_train
                self._y_forest = self._dataset.y_train
                self._logger.debug('Fitting the forest on train subset')
            else:
                self._X_forest, self._y_forest = self._concatenate_train_dev()
                # The message used to claim 'train subset' in this case too.
                self._logger.debug('Fitting the forest on train+dev subsets')
            return

        model_subsets = model.models_parameters.subsets_used
        if model_subsets == 'train,dev':
            self._X_forest = self._dataset.X_train
            self._y_forest = self._dataset.y_train
            self._X_omp = self._dataset.X_dev
            self._y_omp = self._dataset.y_dev
            self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
        elif model_subsets == 'train+dev,train+dev':
            self._X_forest, self._y_forest = self._concatenate_train_dev()
            # The forest and OMP share the very same arrays.
            self._X_omp = self._X_forest
            self._y_omp = self._y_forest
            self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
        elif model_subsets == 'train,train+dev':
            self._X_forest = self._dataset.X_train
            self._y_forest = self._dataset.y_train
            self._X_omp, self._y_omp = self._concatenate_train_dev()
            self._logger.debug('Fitting the forest on train subset and OMP on train+dev subsets.')
        else:
            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model_subsets))

    def train(self, model, extracted_forest_size=None):
        """
        Fit `model` on the subsets selected by `init` and record the begin/end times.

        :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
            OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
        :param extracted_forest_size: Only used for RandomForestRegressor and RandomForestClassifier.
            When not None, the model is not fitted: `model.estimators_` is read (so the forest is expected
            to be fitted already) and replaced by a random selection of at most `extracted_forest_size`
            of its estimators, drawn with the global numpy random state.
        :return:
        """
        self._logger.debug('Training model using train set...')
        self._begin_time = time.time()
        if type(model) in self._PLAIN_FOREST_TYPES:
            if extracted_forest_size is not None:
                estimators_index = np.arange(len(model.estimators_))
                np.random.shuffle(estimators_index)
                chosen_estimators = estimators_index[:extracted_forest_size]
                model.estimators_ = np.array(model.estimators_)[chosen_estimators]
            else:
                model.fit(
                    X=self._X_forest,
                    y=self._y_forest
                )
        else:
            model.fit(
                self._X_forest,
                self._y_forest,
                self._X_omp,
                self._y_omp
            )
        self._end_time = time.time()

    def __score_func(self, model, X, y_true, weights=True):
        """
        Score the predictions of `model` on `X` against `y_true`.

        :param model: The fitted model to evaluate.
        :param X: Samples to predict.
        :param y_true: Expected targets for `X`.
        :param weights: If True the score is computed from `model.predict`, otherwise from
            `model.predict_no_weights`. Ignored for SimilarityForestRegressor, KMeansForestRegressor and
            EnsembleSelectionForestRegressor, which are scored through their own `score` method.
        :return: The value returned by the regression/classification score metric or by `model.score`.
        :raises ValueError: If the type of `model` is not supported.
        """
        model_type = type(model)
        if model_type in self._SELF_SCORING_TYPES:
            return model.score(X, y_true)

        is_regressor = model_type in self._REGRESSOR_TYPES
        if not is_regressor and model_type not in self._CLASSIFIER_TYPES:
            # Used to surface as an UnboundLocalError on the final `return result`.
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))

        y_pred = model.predict(X) if weights else model.predict_no_weights(X)
        if is_regressor:
            return self._regression_score_metric(y_true, y_pred)

        if model_type is OmpForestBinaryClassifier:
            # Turn the predictions into -1/+1 labels; a prediction of exactly 0 is mapped to +1.
            y_pred = np.sign(y_pred)
            y_pred = np.where(y_pred == 0, 1, y_pred)
        return self._classification_score_metric(y_true, y_pred)

    def __score_func_base(self, model, X, y_true):
        """
        Compute the base score of `model` on `X` against `y_true`.

        RandomForestRegressor and RandomForestClassifier are scored from `model.predict`, every other
        supported model from `model.predict_base_estimator`.

        :param model: The fitted model to evaluate.
        :param X: Samples to predict.
        :param y_true: Expected targets for `X`.
        :return: The value returned by the base regression/classification score metric.
        :raises ValueError: If the type of `model` is not supported.
        """
        model_type = type(model)
        if model_type in self._PLAIN_FOREST_TYPES:
            y_pred = model.predict(X)
        elif model_type in self._BASE_ESTIMATOR_REGRESSOR_TYPES + self._BASE_ESTIMATOR_CLASSIFIER_TYPES:
            y_pred = model.predict_base_estimator(X)
        else:
            # Used to surface as an UnboundLocalError on the final `return result`.
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))

        if model_type is RandomForestRegressor or model_type in self._BASE_ESTIMATOR_REGRESSOR_TYPES:
            return self._base_regression_score_metric(y_true, y_pred)
        return self._base_classification_score_metric(y_true, y_pred)

    def _build_results(self, model, model_weights, weights):
        """
        Score `model` on the train, dev and test subsets and wrap the scores in a ModelRawResults.

        :param model: The fitted model to evaluate.
        :param model_weights: Value stored as `model_weights` in the results.
        :param weights: Forwarded to the score function (see `compute_results`).
        :return: The ModelRawResults instance (not saved yet).
        """
        dataset = self._dataset
        return ModelRawResults(
            model_weights=model_weights,
            training_time=self._end_time - self._begin_time,
            datetime=datetime.datetime.now(),
            train_score=self.__score_func(model, dataset.X_train, dataset.y_train, weights),
            dev_score=self.__score_func(model, dataset.X_dev, dataset.y_dev, weights),
            test_score=self.__score_func(model, dataset.X_test, dataset.y_test, weights),
            train_score_base=self.__score_func_base(model, dataset.X_train, dataset.y_train),
            dev_score_base=self.__score_func_base(model, dataset.X_dev, dataset.y_dev),
            test_score_base=self.__score_func_base(model, dataset.X_test, dataset.y_test),
            score_metric=self._score_metric_name,
            base_score_metric=self._base_score_metric_name
        )

    def _log_results(self, results, score_label=''):
        """
        Log the base score and the score of `results` for the test, train and dev subsets.

        :param results: Object with `<subset>_score` and `<subset>_score_base` attributes.
        :param score_label: Text appended to the subset name in the (non-base) score messages.
        """
        for subset in ('test', 'train', 'dev'):
            self._logger.info("Base performance on {}: {}".format(
                subset, getattr(results, subset + '_score_base')))
            self._logger.info("Performance on {}{}: {}".format(
                subset, score_label, getattr(results, subset + '_score')))

    def compute_results(self, model, models_dir):
        """
        Score `model` on the train, dev and test subsets, then save and log the results.

        For every model that is not a RandomForestRegressor or a RandomForestClassifier, a second set
        of results scored without weights is saved under `models_dir + '_no_weights'`.
        `train` must have been called before, since the training time is read from the recorded timings.

        :param model: The fitted model to evaluate.
        :param models_dir: Where the results will be saved
        """
        model_type = type(model)
        if model_type in [OmpForestRegressor, OmpForestBinaryClassifier]:
            model_weights = model._omp.coef_
        elif model_type is OmpForestMulticlassClassifier:
            model_weights = model._dct_class_omp
        else:
            model_weights = ''

        results = self._build_results(model, model_weights, True)
        results.save(models_dir)
        self._log_results(results)

        if model_type not in self._PLAIN_FOREST_TYPES:
            results = self._build_results(model, '', False)
            results.save(models_dir+'_no_weights')
            # The base scores do not depend on the weights: only the score lines are labelled.
            self._log_results(results, ' without weights')