WIP: Resolve "Adding new datasets"

Merged Charly Lamothe requested to merge 17-adding-new-datasets into master
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeRegressor
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm


class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    'Ensemble selection from libraries of models' by Rich Caruana et al.
    """

    def __init__(self, models_parameters, library, score_metric=mean_squared_error):
        self._models_parameters = models_parameters
        self._library = library
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric
    @property
    def models_parameters(self):
        return self._models_parameters

    @property
    def library(self):
        return self._library
    def fit(self, X_train, y_train, X_val, y_val):
        # Score every tree of the library on the validation set.
        scores_list = list()
        for estimator in self._library:
            val_score = self._score_metric(y_val, estimator.predict(X_val))
            scores_list.append(val_score)

        # Seed the ensemble with the tree that has the lowest validation error.
        class_list = list(self._library)
        m = np.argmin(np.asarray(scores_list))
        self._ensemble_selected = [class_list[m]]
        temp_pred = class_list[m].predict(X_val)
        del class_list[m]

        # Greedy forward selection: at each step, add the tree whose inclusion
        # minimizes the error of the averaged ensemble prediction.
        for _ in range(self._extracted_forest_size - 1):
            candidate_index = 0
            best_score = np.inf
            for j in range(len(class_list)):
                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
                temp_mean = np.mean(temp_pred, axis=0)
                temp_score = self._score_metric(y_val, temp_mean)
                if temp_score < best_score:
                    candidate_index = j
                    best_score = temp_score
                temp_pred = np.delete(temp_pred, -1, 0)
            self._ensemble_selected.append(class_list[candidate_index])
            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
            del class_list[candidate_index]
    def score(self, X, y):
        predictions = self._predict_base_estimator(X)
        return self._score_metric(y, predictions)

    def _predict_base_estimator(self, X):
        # Average the predictions of the selected trees.
        predictions = list()
        for tree in self._ensemble_selected:
            predictions.append(tree.predict(X))
        mean_predictions = np.mean(np.array(predictions), axis=0)
        return mean_predictions
    @staticmethod
    def generate_library(X_train, y_train, random_state=None):
        # Hyperparameter grid for the tree library.
        criterion_arr = ["mse"]  # , "friedman_mse", "mae"]
        splitter_arr = ["best"]  # , "random"]
        depth_arr = [i for i in range(5, 20, 1)]
        min_samples_split_arr = [i for i in range(2, 20, 1)]
        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
        max_features_arr = ["sqrt"]  # ["auto", "sqrt", "log2"]

        library = list()
        with tqdm(total=len(criterion_arr) * len(splitter_arr) *
                  len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) *
                  len(max_features_arr)) as bar:
            bar.set_description('Generating library')
            # Fit one decision tree per hyperparameter combination.
            for criterion in criterion_arr:
                for splitter in splitter_arr:
                    for depth in depth_arr:
                        for min_samples_split in min_samples_split_arr:
                            for min_samples_leaf in min_samples_leaf_arr:
                                for max_features in max_features_arr:
                                    t = DecisionTreeRegressor(
                                        criterion=criterion, splitter=splitter, max_depth=depth,
                                        min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf,
                                        max_features=max_features, random_state=random_state)
                                    t.fit(X_train, y_train)
                                    library.append(t)
                                    bar.update(1)
        return library
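
A minimal usage sketch (illustrative only: `SimpleNamespace` stands in for the project's models-parameters object, which is assumed to expose an `extracted_forest_size` attribute; the synthetic data and sizes are placeholders, not part of this merge request):

from types import SimpleNamespace
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

# Toy regression data, split into train / validation / test sets.
X, y = make_regression(n_samples=600, n_features=10, noise=1.0, random_state=0)
X_train, X_rest, y_train, y_rest = train_test_split(X, y, test_size=0.4, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state=0)

# Build the tree library (the grid above fits a few thousand trees, so this takes a while),
# then greedily select a sub-forest of 10 trees on the validation set.
library = EnsembleSelectionForestRegressor.generate_library(X_train, y_train, random_state=0)
params = SimpleNamespace(extracted_forest_size=10)  # hypothetical stand-in for the repo's parameters object
model = EnsembleSelectionForestRegressor(params, library)
model.fit(X_train, y_train, X_val, y_val)
print('validation MSE:', model.score(X_val, y_val))
print('test MSE:', model.score(X_test, y_test))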