# kmeans_forest_regressor.py
from bolsonaro.utils import tqdm_joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm


class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Random forest regressor pruned by K-Means clustering of its trees.

    Follows 'On extreme pruning of random forest ensembles for real-time
    predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and
    Eyad Elyan: the trees of a fitted forest are clustered according to their
    predictions on the training set, and one tree per cluster is kept.

    `score_metric` is called as `score_metric(y_true, y_pred)` and is treated
    as a loss: within each cluster the tree with the LOWEST value on the
    validation set is retained (the default is `mean_squared_error`).
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        """
        :param models_parameters: object exposing `hyperparameters` (keyword
            arguments forwarded to `RandomForestRegressor`), `seed` and
            `extracted_forest_size` (number of clusters, i.e. of kept trees).
        :param score_metric: callable `(y_true, y_pred) -> float`, lower is
            better.
        """
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric
        self._selected_trees = list()

    @property
    def models_parameters(self):
        """Parameters object given at construction time."""
        return self._models_parameters

    @property
    def selected_trees(self):
        """Trees kept by the last call to `fit` (empty list before fitting)."""
        return self._selected_trees

    def fit(self, X_train, y_train, X_val, y_val):
        """
        Fit the full forest, cluster its trees and keep one tree per cluster.

        :param X_train: training inputs, used to fit the forest and to build
            the per-tree prediction vectors that are clustered.
        :param y_train: training targets.
        :param X_val: validation inputs used to rank the trees of a cluster.
        :param y_val: validation targets.
        :raises ValueError: if a cluster ends up without any tree.
        """
        self._estimator.fit(X_train, y_train)

        # One row per tree: each tree is represented by its predictions on
        # the training set, and K-Means groups trees that predict alike.
        predictions = np.array([tree.predict(X_train) for tree in self._estimator.estimators_])

        kmeans = KMeans(n_clusters=self._extracted_forest_size,
            random_state=self._models_parameters.seed).fit(predictions)
        labels = np.asarray(kmeans.labels_)

        # For each cluster select the best tree on the validation set
        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
            pruned_forest = Parallel(n_jobs=-1)(
                delayed(self._prune_forest_job)(prune_forest_job_pb, cluster_id, labels,
                    X_val, y_val, self._score_metric)
                for cluster_id in range(self._extracted_forest_size))

        self._selected_trees = pruned_forest
        # The wrapped forest now only holds the selected trees.
        self._estimator.estimators_ = pruned_forest

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        """
        Return the tree of cluster `c` with the lowest validation score.

        :param prune_forest_job_pb: progress bar updated once the cluster is done.
        :param c: cluster identifier to process.
        :param labels: array giving the cluster identifier of every tree.
        :param X_val: validation inputs.
        :param y_val: validation targets.
        :param score_metric: callable `(y_true, y_pred) -> float`, lower is better.
        :raises ValueError: if no tree is assigned to cluster `c`.
        """
        index = np.where(labels == c)[0]
        if index.size == 0:
            raise ValueError('No tree was assigned to cluster {}.'.format(c))

        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=-1)(
                delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val, y_val, score_metric)
                for tree_index in index)

        # The metric is a loss (e.g. mean squared error): the best tree is the
        # one with the smallest value, hence argmin and not argmax.
        best_tree_index = np.argmin(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
        """
        Score the `i`-th tree of the forest on the validation set.

        :param cluster_job_pb: progress bar updated once the tree is scored.
        :param i: index of the tree in the forest's `estimators_`.
        :return: `score_metric(y_val, predictions of the tree on X_val)`.
        """
        y_val_pred = self._estimator.estimators_[i].predict(X_val)
        tree_pred = score_metric(y_val, y_val_pred)
        cluster_job_pb.update()
        return tree_pred

    def predict(self, X):
        """Return the predictions of the wrapped forest for `X`."""
        return self._estimator.predict(X)

    def score(self, X, y):
        """
        Evaluate the averaged predictions of the forest's trees with the metric.

        :param X: inputs to predict.
        :param y: expected targets.
        :return: `score_metric(y, mean of the trees' predictions)`.
        """
        predictions = np.array([tree.predict(X) for tree in self._estimator.estimators_])
        mean_predictions = np.mean(predictions, axis=0)
        # Ground truth first, as in `_cluster_job`, so that asymmetric
        # metrics are computed correctly.
        return self._score_metric(y, mean_predictions)

    def predict_base_estimator(self, X):
        """Return the predictions of the wrapped forest for `X` (same call as `predict`)."""
        return self._estimator.predict(X)