Commit 8d52496f authored by Luc Giffon

fix and optimise ensemble selection forest regressor

parent 96b83f3a
1 merge request: !23 Resolve "integration-sota"
bolsonaro/models/ensemble_selection_forest_regressor.py (new version):

import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeRegressor
from abc import abstractmethod, ABCMeta

import numpy as np
from tqdm import tqdm

from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA
from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator


class EnsembleSelectionForest(ForestPruningSOTA, metaclass=ABCMeta):
    """
    'Ensemble selection from libraries of models' by Rich Caruana et al.
    """

    def _fit(self, X_train, y_train, X_val, y_val):
        self._base_estimator.fit(X_train, y_train)

        val_predictions = self._base_estimator_predictions(X_val).T
        scores_predictions_val = self._score_metric(val_predictions, y_val)
        idx_best_score = self._best_score_idx(scores_predictions_val)
        lst_pruned_forest = [self._base_estimator.estimators_[idx_best_score]]

        nb_selected_trees = 1
        mean_so_far = val_predictions[idx_best_score]
        while nb_selected_trees < self._extracted_forest_size:
            # every new tree is selected with replacement, as specified in the base paper
            # mean update formula: u_{t+1} = (n_t * u_t + x_t) / (n_t + 1)
            mean_prediction_subset_with_extra_tree = (nb_selected_trees * mean_so_far + val_predictions) / (nb_selected_trees + 1)
            predictions_subset_with_extra_tree = self._activation(mean_prediction_subset_with_extra_tree)
            scores_subset_with_extra_tree = self._score_metric(predictions_subset_with_extra_tree, y_val)
            idx_best_extra_tree = self._best_score_idx(scores_subset_with_extra_tree)
            lst_pruned_forest.append(self._base_estimator.estimators_[idx_best_extra_tree])

            mean_so_far = mean_prediction_subset_with_extra_tree[idx_best_extra_tree]
            nb_selected_trees += 1

        return lst_pruned_forest

    @abstractmethod
    def _activation(self, leave_one_tree_out_predictions_val):
        pass


class EnsembleSelectionForestClassifier(EnsembleSelectionForest, metaclass=ABCMeta):

    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestClassifier(**model_parameters.hyperparameters,
                                      random_state=model_parameters.seed, n_jobs=2)

    def _aggregate(self, predictions):
        return aggregation_classification(predictions)

    def _score_metric(self, y_preds, y_true):
        return score_metric_indicator(y_preds, y_true)

    def _activation(self, predictions):
        return np.sign(predictions)

    def _selected_tree_predictions(self, X):
        predictions_0_1 = super()._selected_tree_predictions(X)
        predictions = (predictions_0_1 - 0.5) * 2
        return predictions

    def _base_estimator_predictions(self, X):
        predictions_0_1 = super()._base_estimator_predictions(X)
        predictions = (predictions_0_1 - 0.5) * 2
        return predictions

    @staticmethod
    def _best_score_idx(array):
        return np.argmax(array)

    @staticmethod
    def _worse_score_idx(array):
        return np.argmin(array)


class EnsembleSelectionForestRegressor(EnsembleSelectionForest, metaclass=ABCMeta):

    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestRegressor(**model_parameters.hyperparameters,
                                     random_state=model_parameters.seed, n_jobs=2)

    def _aggregate(self, predictions):
        return aggregation_regression(predictions)

    def _score_metric(self, y_preds, y_true):
        return score_metric_mse(y_preds, y_true)

    def _activation(self, predictions):
        return predictions

    @staticmethod
    def _best_score_idx(array):
        return np.argmin(array)

    @staticmethod
    def _worse_score_idx(array):
        return np.argmax(array)
# @staticmethod
# def generate_library(X_train, y_train, random_state=None):
# criterion_arr = ["mse"]#, "friedman_mse", "mae"]
# splitter_arr = ["best"]#, "random"]
# depth_arr = [i for i in range(5, 20, 1)]
# min_samples_split_arr = [i for i in range(2, 20, 1)]
# min_samples_leaf_arr = [i for i in range(2, 20, 1)]
# max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]
#
# library = list()
# with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
# len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
# len(max_features_arr)) as bar:
# bar.set_description('Generating library')
# for criterion in criterion_arr:
# for splitter in splitter_arr:
# for depth in depth_arr:
# for min_samples_split in min_samples_split_arr:
# for min_samples_leaf in min_samples_leaf_arr:
# for max_features in max_features_arr:
# t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
# min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)
# t.fit(X_train, y_train)
# library.append(t)
# bar.update(1)
# return library
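
For readers skimming the diff: the new _fit implements Caruana-style forward selection with replacement, vectorised over all trees at once. Below is a minimal standalone sketch of that greedy step in plain NumPy, assuming per-tree validation predictions of shape (n_trees, n_val) and an MSE score as in the regressor variant; all names are illustrative, not repository code.

import numpy as np

def greedy_selection(val_predictions, y_val, size):
    """Return indices of `size` trees chosen greedily, with replacement."""
    # val_predictions: shape (n_trees, n_val); y_val: shape (n_val,)
    scores = ((val_predictions - y_val) ** 2).mean(axis=1)  # per-tree MSE
    best = int(np.argmin(scores))
    selected = [best]
    mean_so_far = val_predictions[best]
    while len(selected) < size:
        n = len(selected)
        # mean of the current subset if each candidate tree were added:
        # u_{t+1} = (n * u_t + x) / (n + 1)
        candidate_means = (n * mean_so_far + val_predictions) / (n + 1)
        candidate_scores = ((candidate_means - y_val) ** 2).mean(axis=1)
        best = int(np.argmin(candidate_scores))
        selected.append(best)
        mean_so_far = candidate_means[best]
    return selected

# toy usage: 5 "trees", 4 validation points
rng = np.random.default_rng(0)
y_val = rng.normal(size=4)
val_predictions = y_val + rng.normal(scale=0.5, size=(5, 4))
print(greedy_selection(val_predictions, y_val, size=3))

Because selection happens with replacement, the same tree index can be picked repeatedly, which effectively up-weights strong trees in the running mean.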
@@ -3,7 +3,7 @@
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
-from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor, EnsembleSelectionForestClassifier
 from bolsonaro.data.task import Task
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -27,6 +27,8 @@ class ModelFactory(object):
         elif model_parameters.extraction_strategy == 'none':
             return RandomForestClassifier(**model_parameters.hyperparameters,
                                           random_state=model_parameters.seed)
+        elif model_parameters.extraction_strategy == 'ensemble':
+            return EnsembleSelectionForestClassifier(model_parameters)
         elif model_parameters.extraction_strategy == 'kmeans':
             return KMeansForestClassifier(model_parameters)
         elif model_parameters.extraction_strategy in ['similarity_similarities', 'similarity_predictions']:
@@ -44,7 +46,7 @@ class ModelFactory(object):
         elif model_parameters.extraction_strategy == 'kmeans':
             return KMeansForestRegressor(model_parameters)
         elif model_parameters.extraction_strategy == 'ensemble':
-            return EnsembleSelectionForestRegressor(model_parameters, library=library)
+            return EnsembleSelectionForestRegressor(model_parameters)
         elif model_parameters.extraction_strategy == 'none':
             return RandomForestRegressor(**model_parameters.hyperparameters,
                                          random_state=model_parameters.seed)
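
One detail worth noting for the new classifier branch: EnsembleSelectionForestClassifier remaps {0, 1} tree outputs to {-1, +1} before averaging, so that np.sign of the mean recovers a majority vote. A small standalone illustration of why that works (not repository code):

import numpy as np

# three trees voting on three validation samples, labels in {0, 1}
votes_0_1 = np.array([[1, 0, 1],
                      [1, 1, 0],
                      [0, 1, 1]])
votes_pm1 = (votes_0_1 - 0.5) * 2           # same remapping as _base_estimator_predictions
majority = np.sign(votes_pm1.mean(axis=0))  # same activation as _activation
print(majority)  # [1. 1. 1.] -- each sample is classed 1 by a 2-to-1 vote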
@@ -55,7 +55,8 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
     trainer = Trainer(dataset)

-    if parameters['extraction_strategy'] == 'ensemble':
+    # if parameters['extraction_strategy'] == 'ensemble':
+    if False:
         library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed)
     else:
         library = None
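
Since the guard is now hard-coded to False, the generate_library call above is dead code: every extraction strategy, 'ensemble' included, runs with library=None, and the pruned forest is built from the random forest trained inside EnsembleSelectionForest._fit. A plausible follow-up cleanup (my assumption, not part of this commit) would collapse the block to:

trainer = Trainer(dataset)
# no pre-generated tree library anymore: the pruning models
# train their own forest internally
library = None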