WIP: Resolve "Adding new datasets"
from abc import ABCMeta

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm
 
 
 
class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Greedy forward selection of a sub-forest from a library of fitted
    estimators, following 'Ensemble selection from libraries of models'
    by Rich Caruana et al.
    """
 
 
def __init__(self, models_parameters, library, score_metric=mean_squared_error):
 
self._models_parameters = models_parameters
 
self._library = library
 
self._extracted_forest_size = self._models_parameters.extracted_forest_size
 
self._score_metric = score_metric
 
 
    @property
    def models_parameters(self):
        return self._models_parameters

    @property
    def library(self):
        return self._library
 
 
    def fit(self, X_train, y_train, X_val, y_val):
        # Score every estimator of the library on the validation set.
        scores_list = list()
        for estimator in self._library:
            val_score = self._score_metric(estimator.predict(X_val), y_val)
            scores_list.append(val_score)

        # Seed the ensemble with the single best estimator. The score metric
        # is an error measure (mean squared error by default), so the best
        # estimator is the one with the lowest validation score.
        class_list = list(self._library)
        m = np.argmin(np.asarray(scores_list))
        self._ensemble_selected = [class_list[m]]
        temp_pred = class_list[m].predict(X_val)
        del class_list[m]

        # Greedily add, one estimator at a time, the candidate whose inclusion
        # minimizes the validation error of the averaged ensemble prediction.
        for _ in range(self._extracted_forest_size - 1):
            candidate_index = 0
            best_score = float('inf')
            for j in range(len(class_list)):
                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
                temp_mean = np.mean(temp_pred, axis=0)
                temp_score = self._score_metric(temp_mean, y_val)
                if temp_score < best_score:
                    candidate_index = j
                    best_score = temp_score
                temp_pred = np.delete(temp_pred, -1, 0)
            self._ensemble_selected.append(class_list[candidate_index])
            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
            del class_list[candidate_index]
 
 
    def score(self, X, y):
        predictions = self.predict_base_estimator(X)
        return self._score_metric(predictions, y)
 
 
    def predict_base_estimator(self, X):
        # Average the predictions of the selected estimators.
        predictions = list()
        for tree in self._ensemble_selected:
            predictions.append(tree.predict(X))
        mean_predictions = np.mean(np.array(predictions), axis=0)
        return mean_predictions
 
 
    @staticmethod
    def generate_library(X_train, y_train, random_state=None):
        # Hyperparameter grid for the library of decision trees.
        # Note: the "mse" criterion is named "squared_error" in scikit-learn >= 1.0.
        criterion_arr = ["mse"]  # , "friedman_mse", "mae"]
        splitter_arr = ["best"]  # , "random"]
        depth_arr = [i for i in range(5, 20, 1)]
        min_samples_split_arr = [i for i in range(2, 20, 1)]
        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
        max_features_arr = ["sqrt"]  # ["auto", "sqrt", "log2"]

        library = list()
        total = (len(criterion_arr) * len(splitter_arr) * len(depth_arr) *
                 len(min_samples_split_arr) * len(min_samples_leaf_arr) *
                 len(max_features_arr))
        with tqdm(total=total) as bar:
            bar.set_description('Generating library')
            for criterion in criterion_arr:
                for splitter in splitter_arr:
                    for depth in depth_arr:
                        for min_samples_split in min_samples_split_arr:
                            for min_samples_leaf in min_samples_leaf_arr:
                                for max_features in max_features_arr:
                                    t = DecisionTreeRegressor(criterion=criterion, splitter=splitter,
                                                              max_depth=depth,
                                                              min_samples_split=min_samples_split,
                                                              min_samples_leaf=min_samples_leaf,
                                                              max_features=max_features,
                                                              random_state=random_state)
                                    t.fit(X_train, y_train)
                                    library.append(t)
                                    bar.update(1)
        return library
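

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): it illustrates how
# the class above could be wired together end to end on synthetic data. The
# SimpleNamespace stand-in for the project's models_parameters object and the
# data/split sizes are assumptions for illustration only. It also assumes a
# scikit-learn version that still accepts the "mse" criterion (< 1.2).
if __name__ == "__main__":
    from types import SimpleNamespace

    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    X, y = make_regression(n_samples=300, n_features=10, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

    # Build a library of decision trees, then greedily select a small sub-forest.
    library = EnsembleSelectionForestRegressor.generate_library(X_train, y_train, random_state=0)
    models_parameters = SimpleNamespace(extracted_forest_size=10)  # hypothetical stand-in
    model = EnsembleSelectionForestRegressor(models_parameters, library)
    model.fit(X_train, y_train, X_val, y_val)
    print('Validation MSE of the selected ensemble:', model.score(X_val, y_val))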