ensemble_selection_forest_regressor.py 3.95 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeRegressor
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm


class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Forward stepwise ensemble selection, after
    'Ensemble selection from libraries of models' by Rich Caruana et al.

    A library (pool) of already-fitted regressors is pruned down to
    ``extracted_forest_size`` members by greedily adding, at each step, the
    estimator that minimizes ``score_metric`` of the averaged prediction on
    the validation set.
    """

    def __init__(self, models_parameters, library, score_metric=mean_squared_error):
        # models_parameters must expose `extracted_forest_size`: the number
        # of estimators to keep from the library.
        self._models_parameters = models_parameters
        self._library = library
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        # Lower is better (defaults to MSE); the selection loop below minimizes it.
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    @property
    def library(self):
        return self._library

    def fit(self, X_train, y_train, X_val, y_val):
        """Greedily select ``extracted_forest_size`` estimators from the library.

        X_train / y_train are accepted for interface compatibility but unused:
        the library members are already fitted, and selection is driven
        entirely by validation performance on (X_val, y_val).
        """
        # Validation score of each library member on its own.
        scores_list = [self._score_metric(estimator.predict(X_val), y_val)
                       for estimator in self._library]

        class_list = list(self._library)
        # BUGFIX: seed the ensemble with the BEST (lowest-score) estimator.
        # The original used np.argmax, which under a minimized metric such as
        # MSE picks the WORST model — contradicting the `<` comparisons below.
        m = np.argmin(np.asarray(scores_list))
        self._ensemble_selected = [class_list[m]]
        # Running stack of per-estimator predictions for the current ensemble.
        temp_pred = class_list[m].predict(X_val)
        del class_list[m]
        for _ in range(self._extracted_forest_size - 1):
            if not class_list:
                # Library exhausted before reaching the requested size.
                break
            candidate_index = 0
            best_score = float('inf')  # was a magic 100000 sentinel
            for j in range(len(class_list)):
                # Tentatively add candidate j and score the averaged prediction.
                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
                temp_mean = np.mean(temp_pred, axis=0)
                temp_score = self._score_metric(temp_mean, y_val)
                if temp_score < best_score:
                    candidate_index = j
                    best_score = temp_score
                # Undo the tentative addition before trying the next candidate.
                temp_pred = np.delete(temp_pred, -1, 0)
            self._ensemble_selected.append(class_list[candidate_index])
            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
            del class_list[candidate_index]

    def score(self, X, y):
        """Return ``score_metric`` of the ensemble's averaged prediction on (X, y)."""
        # BUGFIX: the original called self._predict_base_estimator, which does
        # not exist (the method is named predict_base_estimator) and raised
        # AttributeError on every call.
        predictions = self.predict_base_estimator(X)
        return self._score_metric(predictions, y)

    def predict_base_estimator(self, X):
        """Return the mean prediction of the selected estimators on X."""
        predictions = [tree.predict(X) for tree in self._ensemble_selected]
        return np.mean(np.array(predictions), axis=0)

    @staticmethod
    def generate_library(X_train, y_train, random_state=None):
        """Fit and return a library of DecisionTreeRegressors over a small
        hyper-parameter grid (one fitted tree per grid combination)."""
        criterion_arr = ["mse"]#, "friedman_mse", "mae"]
        splitter_arr = ["best"]#, "random"]
        depth_arr = [i for i in range(5, 20, 1)]
        min_samples_split_arr = [i for i in range(2, 20, 1)]
        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
        max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]

        library = list()
        with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
            len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
            len(max_features_arr)) as bar:
            bar.set_description('Generating library')
            for criterion in criterion_arr:
                for splitter in splitter_arr:
                    for depth in depth_arr:
                        for min_samples_split in min_samples_split_arr:
                            for min_samples_leaf in min_samples_leaf_arr:
                                for max_features in max_features_arr:
                                    t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)
                                    t.fit(X_train, y_train)
                                    library.append(t)
                                    bar.update(1)
        return library