28c3d8747ab945a10ba0d89ecc17f29700827ad5 to 1db36b5d8d77b35b78984c19d13b7ab879e43d38 · Luc Giffon / bolsonaro

Some changes are not shown.

For a faster browsing experience, only 100 of 213 files are shown. Download one of the files below to see all changes.

code/bolsonaro/data/dataset_loader.py

+25
−4

Original line number
Diff line number
Diff line

from bolsonaro.data.dataset import Dataset

from bolsonaro.data.dataset_parameters import DatasetParameters

from bolsonaro.data.task import Task

from bolsonaro.utils import change_binary_func_load

from bolsonaro.utils import change_binary_func_load, change_binary_func_openml

from sklearn.datasets import load_boston, load_iris, load_diabetes, \

    load_digits, load_linnerud, load_wine, load_breast_cancer

from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \

    fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \

    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing

    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing, \

    fetch_openml

from sklearn.model_selection import train_test_split

from sklearn import preprocessing

import random

@@ -30,13 +31,15 @@ class DatasetLoader(object):

    dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',

        'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',

        'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds']

        'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds', 'steel-plates',

        'kr-vs-kp', 'kin8nm', 'spambase', 'musk', 'gamma']

    dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,

        'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,

        '20newsgroups_vectorized':3, 'lfw_people':3,

        'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3,

        'diamonds': 15}

        'diamonds': 15, 'steel-plates': 15, 'kr-vs-kp': 15, 'kin8nm': 15,

        'spambase': 15, 'musk': 15, 'gamma': 15}

    @staticmethod

    def load(dataset_parameters):

@@ -103,6 +106,24 @@ class DatasetLoader(object):

            df['clarity'] = label_clarity.fit_transform(df['clarity'])

            X, y = df.drop(['price'], axis=1), df['price']

            task = Task.REGRESSION

        elif name == 'steel-plates':

            dataset_loading_func = change_binary_func_openml('steel-plates-fault')

            task = Task.BINARYCLASSIFICATION

        elif name == 'kr-vs-kp':

            dataset_loading_func = change_binary_func_openml('kr-vs-kp')

            task = Task.BINARYCLASSIFICATION

        elif name == 'kin8nm':

            X, y = fetch_openml('kin8nm', return_X_y=True)

            task = Task.REGRESSION

        elif name == 'spambase':

            dataset_loading_func = change_binary_func_openml('spambase')

            task = Task.BINARYCLASSIFICATION

        elif name == 'musk':

            dataset_loading_func = change_binary_func_openml('musk')

            task = Task.BINARYCLASSIFICATION

        elif name == 'gamma':

            dataset_loading_func = change_binary_func_openml('MagicTelescope')

            task = Task.BINARYCLASSIFICATION

        else:

            raise ValueError("Unsupported dataset '{}'".format(name))

code/bolsonaro/models/kmeans_forest_regressor.py0 → 100644

+78
−0

Original line number
Diff line number
Diff line

from bolsonaro.utils import tqdm_joblib

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

from sklearn.base import BaseEstimator

from sklearn.cluster import KMeans

from abc import abstractmethod, ABCMeta

import numpy as np

from scipy.stats import mode

from joblib import Parallel, delayed

from tqdm import tqdm

class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Random forest regressor pruned by clustering its trees with k-means.

    Follows 'On extreme pruning of random forest ensembles for real-time
    predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and
    Eyad Elyan: the trees of a fitted forest are clustered according to their
    predictions on the training set, and a single tree per cluster is kept.

    `score_metric` is treated as a loss (lower is better), which matches its
    default, `mean_squared_error`. It is always called as
    `score_metric(y_true, y_pred)`.
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        """
        :param models_parameters: object exposing `hyperparameters` (keyword
            arguments forwarded to RandomForestRegressor), `seed` and
            `extracted_forest_size` (number of clusters, i.e. of kept trees)
        :param score_metric: callable `(y_true, y_pred) -> float`, lower is better
        """
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val):
        """
        Fit the full forest on the training set, then prune it.

        :param X_train: training samples, used to fit the forest and to cluster the trees
        :param y_train: training targets
        :param X_val: validation samples, used to pick one tree per cluster
        :param y_val: validation targets
        :return: self
        """
        self._estimator.fit(X_train, y_train)

        # One row per tree: each tree is described by its predictions on X_train.
        predictions = np.array([tree.predict(X_train) for tree in self._estimator.estimators_])

        kmeans = KMeans(n_clusters=self._extracted_forest_size,
            random_state=self._models_parameters.seed).fit(predictions)
        labels = np.array(kmeans.labels_)

        # For each cluster select the best tree on the validation set
        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
            pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
                cluster_id, labels, X_val, y_val, self._score_metric)
                for cluster_id in range(self._extracted_forest_size))

        # The wrapped forest now only holds the selected trees.
        self._estimator.estimators_ = pruned_forest
        return self

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        """
        Return the tree of cluster `c` with the lowest validation loss.

        :param prune_forest_job_pb: progress bar, updated once the cluster is processed
        :param c: cluster label to process
        :param labels: array giving the cluster label of every tree of the forest
        :param X_val: validation samples
        :param y_val: validation targets
        :param score_metric: callable `(y_true, y_pred) -> float`, lower is better
        :return: the selected fitted tree
        """
        index = np.where(labels == c)[0]
        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val,
                y_val, score_metric) for tree_index in index)
        # The metric is a loss, so the best tree is the one minimising it
        # (argmax would keep the worst tree of the cluster).
        best_tree_index = np.argmin(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
        """
        Evaluate tree number `i` of the forest on the validation set.

        :return: `score_metric(y_val, predictions of tree i on X_val)`
        """
        y_val_pred = self._estimator.estimators_[i].predict(X_val)
        tree_pred = score_metric(y_val, y_val_pred)
        cluster_job_pb.update()
        return tree_pred

    def predict(self, X):
        """Predict with the wrapped forest (the pruned one once `fit` has run)."""
        return self._estimator.predict(X)

    def score(self, X, y):
        """
        Evaluate the averaged predictions of the current trees on (`X`, `y`).

        :return: `score_metric(y, mean of the per-tree predictions)`
        """
        predictions = np.array([tree.predict(X) for tree in self._estimator.estimators_])
        mean_predictions = np.mean(predictions, axis=0)
        # Same (y_true, y_pred) argument order as in _cluster_job.
        return self._score_metric(y, mean_predictions)

    def predict_base_estimator(self, X):
        """Predict with the wrapped forest; same call as `predict`."""
        return self._estimator.predict(X)

code/bolsonaro/models/model_factory.py

+12
−3

Original line number
Diff line number
Diff line

@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om

from bolsonaro.models.omp_forest_regressor import OmpForestRegressor

from bolsonaro.models.model_parameters import ModelParameters

from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor

from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor

from bolsonaro.data.task import Task

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

@@ -22,9 +23,11 @@ class ModelFactory(object):

            elif model_parameters.extraction_strategy == 'random':

                return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,

                    random_state=model_parameters.seed)

            else:

            elif model_parameters.extraction_strategy == 'none':

                return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],

                    random_state=model_parameters.seed)

            else:

                raise ValueError('Invalid extraction strategy')

        elif task == Task.REGRESSION:

            if model_parameters.extraction_strategy == 'omp':

                return OmpForestRegressor(model_parameters)

@@ -33,15 +36,21 @@ class ModelFactory(object):

                    random_state=model_parameters.seed)

            elif model_parameters.extraction_strategy == 'similarity':

                return SimilarityForestRegressor(model_parameters)

            else:

            elif model_parameters.extraction_strategy == 'kmeans':

                return KMeansForestRegressor(model_parameters)

            elif model_parameters.extraction_strategy == 'none':

                return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],

                    random_state=model_parameters.seed)

            else:

                raise ValueError('Invalid extraction strategy')

        elif task == Task.MULTICLASSIFICATION:

            if model_parameters.extraction_strategy == 'omp':

                return OmpForestMulticlassClassifier(model_parameters)

            elif model_parameters.extraction_strategy == 'random':

                return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,

                    random_state=model_parameters.seed)

            else:

            elif model_parameters.extraction_strategy == 'none':

                return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],

                    random_state=model_parameters.seed)

            else:

                raise ValueError('Invalid extraction strategy')

code/bolsonaro/models/model_raw_results.py

+8
−6

Original line number
Diff line number
Diff line

@@ -6,12 +6,12 @@ import datetime

class ModelRawResults(object):

    def __init__(self, model_object, training_time,

    def __init__(self, model_weights, training_time,

        datetime, train_score, dev_score, test_score,

        train_score_base, dev_score_base,

        test_score_base, score_metric, base_score_metric):

        self._model_object = model_object

        self._model_weights = model_weights

        self._training_time = training_time

        self._datetime = datetime

        self._train_score = train_score

@@ -24,8 +24,8 @@ class ModelRawResults(object):

        self._base_score_metric = base_score_metric

    @property

    def model_object(self):

        return self.model_object

    def model_weights(self):
        """Return the model weights stored on this result object."""
        # Read the private backing attribute: `self.model_weights` would refer
        # back to this accessor instead of the stored value.
        return self._model_weights

    @property

    def training_time(self):

@@ -68,6 +68,8 @@ class ModelRawResults(object):

        return self._base_score_metric

    def save(self, models_dir):
        """
        Pickle the attributes of this object into `models_dir`.

        The directory (and any missing parent) is created when needed, and the
        attribute dictionary is written to `model_raw_results.pickle` inside it.

        :param models_dir: path of the output directory
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        # and makedirs also handles missing parent directories.
        os.makedirs(models_dir, exist_ok=True)
        save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle',
            self.__dict__)

code/bolsonaro/models/omp_forest.py

+25
−1

Original line number
Diff line number
Diff line

@@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator

class OmpForest(BaseEstimator, metaclass=ABCMeta):

    def __init__(self, models_parameters, base_forest_estimator):

        self._base_forest_estimator = base_forest_estimator

        self._models_parameters = models_parameters

@@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):

        return self._base_forest_estimator.score(X, y)

    def _base_estimator_predictions(self, X):

        # We need to use predict_proba to get the probabilities of each class

        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T

    @property

@@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):

    # sklearn baseestimator api methods

    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """
        Fit the base forest, then extract the sub-forest.

        :param X_forest: samples used to fit the base forest
        :param y_forest: targets used to fit the base forest
        :param X_omp: samples handed to `_extract_subforest`
        :param y_omp: targets handed to `_extract_subforest`
        :return: self
        """
        forest = self._base_forest_estimator
        forest.fit(X_forest, y_forest)
        self._extract_subforest(X_omp, y_omp)
        return self

@@ -96,6 +98,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):

        pass

class SingleOmpForest(OmpForest):

    def __init__(self, models_parameters, base_forest_estimator):

        # fit_intercept shouldn't be set to False as the data isn't necessarily centered here

        # normalization is handled outside OMP

@@ -123,3 +126,24 @@ class SingleOmpForest(OmpForest):

            forest_predictions /= self._forest_norms

        return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)

    def predict_no_weights(self, X):
        """
        Predict on X with the OMP-selected trees, ignoring the OMP weights.

        Every tree of the base forest predicts on X; only the trees whose OMP
        coefficient is non-zero are kept, and their predictions are averaged
        with equal weight.

        :param X: samples to predict
        :return: a np.array of the unweighted mean prediction of the selected trees
        """
        # Transposed so that the first axis indexes the trees.
        forest_predictions = self._base_estimator_predictions(X).T

        if self._models_parameters.normalize_D:
            # NOTE(review): assumes _forest_norms broadcasts against the
            # transposed (tree-first) array -- confirm against where it is computed.
            forest_predictions /= self._forest_norms

        # Indices of the trees selected by OMP (non-zero coefficient).
        omp_trees_indices = np.nonzero(self._omp.coef_)[0]

        return np.mean(forest_predictions[omp_trees_indices], axis=0)

code/bolsonaro/models/omp_forest_classifier.py

+58
−0

Original line number
Diff line number
Diff line

@@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest):

        return super().fit(X_forest, y_forest, X_omp, y_omp)

    def predict_no_weights(self, X):
        """
        Predict on X with the OMP-selected trees, ignoring the OMP weights.

        Each tree of the base forest outputs class probabilities for X. For
        the trees whose OMP coefficient is non-zero, the probability found in
        column 1 of `predict_proba` is averaged with equal weight and then
        mapped from [0, 1] to [-1, 1].

        :param X: samples to predict
        :return: a np.array with one rescaled averaged probability per sample
        """
        per_tree_probas = []
        for tree in self._base_forest_estimator.estimators_:
            per_tree_probas.append(tree.predict_proba(X))
        forest_predictions = np.array(per_tree_probas)

        if self._models_parameters.normalize_D:
            forest_predictions /= self._forest_norms

        selected_trees = np.nonzero(self._omp.coef_)

        # After the transpose, index 1 picks the column-1 probabilities,
        # arranged as (samples, selected trees).
        positive_probas = forest_predictions[selected_trees].T[1]

        averaged = np.mean(positive_probas, axis=1)
        return (averaged - 0.5) * 2

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):

        """

        Evaluate OMPForestClassifer on (`X`, `y`) using `metric`

@@ -106,6 +134,36 @@ class OmpForestMulticlassClassifier(OmpForest):

        max_preds = np.argmax(preds, axis=1)

        return np.array(label_names)[max_preds]

    def predict_no_weights(self, X):
        """
        Predict class labels on X with the OMP-selected trees, ignoring the OMP weights.

        For every class, the trees whose coefficient is non-zero in that
        class's OMP are kept; their class probabilities are mapped from [0, 1]
        to [-1, 1] and averaged with equal weight. Each sample receives the
        label of the class with the highest average.

        :param X: samples to predict
        :return: a np.array of the predicted label of each sample
        """
        # Transposed so that the first axis indexes the classes and the last one the trees.
        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T

        if self._models_parameters.normalize_D:
            forest_predictions /= self._forest_norms

        label_names = []
        preds = []
        # NOTE(review): relies on the iteration order of _dct_class_omp matching
        # the column order of predict_proba -- confirm where the dict is built.
        for num_class, (class_label, omp_class) in enumerate(self._dct_class_omp.items()):
            # np.nonzero returns a tuple of index arrays; take the first one so
            # that len() is the number of selected trees, not the tuple length.
            omp_trees_indices = np.nonzero(omp_class.coef_)[0]
            label_names.append(class_label)
            atoms_binary = (forest_predictions[num_class].T - 0.5) * 2  # rescaled from [0, 1] to [-1, 1]
            # Average over the selected trees; with no selected tree the sum is
            # zero and the divisor is clamped to 1 to avoid a division by zero.
            n_selected = max(len(omp_trees_indices), 1)
            preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0) / n_selected)

        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):

        predictions = self.predict(X)

Compare revisions

Source

Target

Files

Some changes are not shown.

code/bolsonaro/data/dataset_loader.py

code/bolsonaro/models/kmeans_forest_regressor.py

code/bolsonaro/models/model_factory.py

code/bolsonaro/models/model_raw_results.py

code/bolsonaro/models/omp_forest.py

code/bolsonaro/models/omp_forest_classifier.py