Skip to content
Snippets Groups Projects
Select Git revision
  • bf240b77f2f4698e77f958ba8f4ab2cc71164bd7
  • master default protected
  • correlation
  • 24-non-negative-omp
  • 15-integration-sota
  • 20-coherence-des-arbres-de-predictions
  • 19-add-some-tests
  • 13-visualization
  • 17-adding-new-datasets
  • 12-experiment-pipeline
  • 14-correction-of-multiclass-classif
  • archive/10-gridsearching-of-the-base-forest
  • archive/farah_notation_and_related_work
  • archive/wip_clean_scripts
  • archive/4-implement-omp_forest_classifier
  • archive/5-add-plots-2
  • archive/Leo_Add_first_notebook
17 results

ensemble_selection_forest_regressor.py

Blame
  • ensemble_selection_forest_regressor.py 3.94 KiB
    from sklearn.metrics import mean_squared_error
    from sklearn.base import BaseEstimator
    from sklearn.tree import DecisionTreeRegressor
    from abc import abstractmethod, ABCMeta
    import numpy as np
    from tqdm import tqdm
    
    
    class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
        """
        Greedy forward ensemble selection over a library of fitted regressors.

        Implements the procedure from 'Ensemble selection from libraries of
        models' by Rich Caruana et al.: start from the single best model on a
        validation set, then repeatedly add the library model whose inclusion
        minimizes the validation error of the ensemble's mean prediction.
        """

        def __init__(self, models_parameters, library, score_metric=mean_squared_error):
            # `models_parameters` must expose `extracted_forest_size`: the
            # number of estimators to keep in the selected ensemble.
            self._models_parameters = models_parameters
            self._library = library
            self._extracted_forest_size = self._models_parameters.extracted_forest_size
            # The metric is treated as an error: lower values are better.
            self._score_metric = score_metric

        @property
        def models_parameters(self):
            return self._models_parameters

        @property
        def library(self):
            return self._library

        def fit(self, X_train, y_train, X_val, y_val):
            """Greedily select `extracted_forest_size` estimators from the library.

            X_train/y_train are unused (the library members are already fitted);
            selection is driven entirely by validation-set error.
            """
            # Validation score of every candidate in the library.
            scores_list = [self._score_metric(estimator.predict(X_val), y_val)
                           for estimator in self._library]

            class_list = list(self._library)
            # BUGFIX: the metric is an error (lower is better), so the initial
            # model is the arg*min* of the scores — the original used argmax,
            # which seeded the ensemble with the worst model.
            m = int(np.argmin(np.asarray(scores_list)))
            self._ensemble_selected = [class_list[m]]
            # Running stack of per-member predictions on the validation set.
            temp_pred = class_list[m].predict(X_val)
            del class_list[m]
            for _ in range(self._extracted_forest_size - 1):
                if not class_list:
                    # Library exhausted before reaching the requested size.
                    break
                candidate_index = 0
                best_score = float('inf')  # was a magic 100000 sentinel
                for j in range(len(class_list)):
                    # Tentatively add candidate j and score the mean prediction
                    # of the enlarged ensemble.
                    temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
                    temp_mean = np.mean(temp_pred, axis=0)
                    temp_score = self._score_metric(temp_mean, y_val)
                    if temp_score < best_score:
                        candidate_index = j
                        # BUGFIX: was `tmp_score`, an undefined name that raised
                        # NameError the first time this branch was taken.
                        best_score = temp_score
                    # Undo the tentative stack before trying the next candidate.
                    temp_pred = np.delete(temp_pred, -1, 0)
                self._ensemble_selected.append(class_list[candidate_index])
                temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
                del class_list[candidate_index]

        def score(self, X, y):
            """Return the score metric of the selected ensemble's mean prediction."""
            # BUGFIX: was `self._predict_base_estimator(X)`, which does not
            # exist (AttributeError); the method is `predict_base_estimator`.
            predictions = self.predict_base_estimator(X)
            mean_predictions = np.mean(predictions, axis=0)
            return self._score_metric(mean_predictions, y)

        def predict_base_estimator(self, X):
            """Return per-estimator predictions, shape (n_selected, n_samples)."""
            return np.array([tree.predict(X) for tree in self._ensemble_selected])

        @staticmethod
        def generate_library(X_train, y_train, random_state=None):
            """Fit and return a list of DecisionTreeRegressor over a small
            hyper-parameter grid (the commented-out values widen the grid)."""
            criterion_arr = ["mse"]#, "friedman_mse", "mae"]
            splitter_arr = ["best"]#, "random"]
            depth_arr = list(range(5, 20))
            min_samples_split_arr = list(range(2, 20))
            min_samples_leaf_arr = list(range(2, 20))
            max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]

            library = list()
            with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
                len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
                len(max_features_arr)) as bar:
                bar.set_description('Generating library')
                for criterion in criterion_arr:
                    for splitter in splitter_arr:
                        for depth in depth_arr:
                            for min_samples_split in min_samples_split_arr:
                                for min_samples_leaf in min_samples_leaf_arr:
                                    for max_features in max_features_arr:
                                        t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
                                            min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)
                                        t.fit(X_train, y_train)
                                        library.append(t)
                                        bar.update(1)
            return library