Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found
Select Git revision
  • 12-experiment-pipeline
  • 13-visualization
  • 14-correction-of-multiclass-classif
  • 15-integration-sota
  • 17-adding-new-datasets
  • 19-add-some-tests
  • 20-coherence-des-arbres-de-predictions
  • 24-non-negative-omp
  • correlation
  • master
  • archive/10-gridsearching-of-the-base-forest
  • archive/4-implement-omp_forest_classifier
  • archive/5-add-plots-2
  • archive/Leo_Add_first_notebook
  • archive/farah_notation_and_related_work
  • archive/wip_clean_scripts
16 results

Target

Select target project
No results found
Select Git revision
  • 12-experiment-pipeline
  • 13-visualization
  • 14-correction-of-multiclass-classif
  • 15-integration-sota
  • 17-adding-new-datasets
  • 19-add-some-tests
  • 20-coherence-des-arbres-de-predictions
  • 24-non-negative-omp
  • correlation
  • master
  • archive/10-gridsearching-of-the-base-forest
  • archive/4-implement-omp_forest_classifier
  • archive/5-add-plots-2
  • archive/Leo_Add_first_notebook
  • archive/farah_notation_and_related_work
  • archive/wip_clean_scripts
16 results
Show changes
213 files
+ 2440
6257
Compare changes
  • Side-by-side
  • Inline

Files

Original line number Diff line number Diff line
from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load
from bolsonaro.utils import change_binary_func_load, change_binary_func_openml

from sklearn.datasets import load_boston, load_iris, load_diabetes, \
    load_digits, load_linnerud, load_wine, load_breast_cancer
from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
    fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
    fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing, \
    fetch_openml
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import random
@@ -30,13 +31,15 @@ class DatasetLoader(object):

    dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine',
        'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people',
        'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds']
        'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds', 'steel-plates',
        'kr-vs-kp', 'kin8nm', 'spambase', 'musk', 'gamma']

    dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5,
        'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15,
        '20newsgroups_vectorized':3, 'lfw_people':3,
        'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3,
        'diamonds': 15}
        'diamonds': 15, 'steel-plates': 15, 'kr-vs-kp': 15, 'kin8nm': 15,
        'spambase': 15, 'musk': 15, 'gamma': 15}

    @staticmethod
    def load(dataset_parameters):
@@ -103,6 +106,24 @@ class DatasetLoader(object):
            df['clarity'] = label_clarity.fit_transform(df['clarity'])
            X, y = df.drop(['price'], axis=1), df['price']
            task = Task.REGRESSION
        elif name == 'steel-plates':
            dataset_loading_func = change_binary_func_openml('steel-plates-fault')
            task = Task.BINARYCLASSIFICATION
        elif name == 'kr-vs-kp':
            dataset_loading_func = change_binary_func_openml('kr-vs-kp')
            task = Task.BINARYCLASSIFICATION
        elif name == 'kin8nm':
            X, y = fetch_openml('kin8nm', return_X_y=True)
            task = Task.REGRESSION
        elif name == 'spambase':
            dataset_loading_func = change_binary_func_openml('spambase')
            task = Task.BINARYCLASSIFICATION
        elif name == 'musk':
            dataset_loading_func = change_binary_func_openml('musk')
            task = Task.BINARYCLASSIFICATION
        elif name == 'gamma':
            dataset_loading_func = change_binary_func_openml('MagicTelescope')
            task = Task.BINARYCLASSIFICATION
        else:
            raise ValueError("Unsupported dataset '{}'".format(name))

Original line number Diff line number Diff line
from bolsonaro.utils import tqdm_joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm


class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Random forest regressor pruned by K-means clustering of its trees.

    Follows 'On extreme pruning of random forest ensembles for real-time
    predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.

    A full forest is trained, its trees are clustered according to their
    predictions on the training set, and a single tree per cluster (the one
    with the lowest validation error) is kept.

    :param models_parameters: object exposing ``hyperparameters`` (keyword
        arguments of the forest), ``seed`` and ``extracted_forest_size``
    :param score_metric: callable ``(y_true, y_pred) -> float`` treated as a
        loss, i.e. lower is better (default: mean squared error)
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val):
        """
        Train the full forest, then prune it to one tree per K-means cluster.

        :param X_train: training samples, used to fit the forest and to build
            the per-tree prediction vectors that are clustered
        :param y_train: training targets
        :param X_val: validation samples used to pick the tree kept in each cluster
        :param y_val: validation targets
        :return: self
        """
        self._estimator.fit(X_train, y_train)

        # One row per tree: its predictions on the training samples.
        predictions = np.array([tree.predict(X_train) for tree in self._estimator.estimators_])

        kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
        labels = np.array(kmeans.labels_)

        # For each cluster select the best tree on the validation set
        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
            pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
                cluster_id, labels, X_val, y_val, self._score_metric)
                for cluster_id in range(self._extracted_forest_size))

        # From now on the wrapped forest only contains the selected trees.
        self._estimator.estimators_ = pruned_forest
        return self

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        """
        Return the tree of cluster `c` with the lowest validation error.

        :param prune_forest_job_pb: progress bar updated once the cluster is processed
        :param c: identifier of the cluster to process
        :param labels: cluster identifier of every tree of the forest
        :param X_val: validation samples
        :param y_val: validation targets
        :param score_metric: loss used to rank the trees (lower is better)
        :return: the selected tree
        """
        index = np.where(labels == c)[0]
        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val,
                y_val, score_metric) for tree_index in index)
        # The metric is an error (mean squared error by default), so the best
        # tree is the one that minimises it.
        best_tree_index = np.argmin(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
        """
        Compute the validation error of the `i`-th tree of the forest.

        :param cluster_job_pb: progress bar updated once the tree is evaluated
        :param i: index of the tree in the forest
        :param X_val: validation samples
        :param y_val: validation targets
        :param score_metric: callable ``(y_true, y_pred) -> float``
        :return: the value of `score_metric` for this tree
        """
        y_val_pred = self._estimator.estimators_[i].predict(X_val)
        tree_pred = score_metric(y_val, y_val_pred)
        cluster_job_pb.update()
        return tree_pred

    def predict(self, X):
        """
        Predict X with the wrapped forest (the pruned one once `fit` has run).
        """
        return self._estimator.predict(X)

    def score(self, X, y):
        """
        Evaluate the average prediction of the current trees on (`X`, `y`).

        :param X: samples to predict
        :param y: expected targets
        :return: the value of the score metric given at construction time
        """
        predictions = np.array([tree.predict(X) for tree in self._estimator.estimators_])
        mean_predictions = np.mean(predictions, axis=0)
        # Same (y_true, y_pred) argument order as in `_cluster_job`.
        return self._score_metric(y, mean_predictions)

    def predict_base_estimator(self, X):
        """
        Predict X with the wrapped forest; same call as `predict`.
        """
        return self._estimator.predict(X)
Original line number Diff line number Diff line
@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.data.task import Task

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -22,9 +23,11 @@ class ModelFactory(object):
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                    random_state=model_parameters.seed)
            else:
            elif model_parameters.extraction_strategy == 'none':
                return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                    random_state=model_parameters.seed)
            else:
                raise ValueError('Invalid extraction strategy')
        elif task == Task.REGRESSION:
            if model_parameters.extraction_strategy == 'omp':
                return OmpForestRegressor(model_parameters)
@@ -33,15 +36,21 @@ class ModelFactory(object):
                    random_state=model_parameters.seed)
            elif model_parameters.extraction_strategy == 'similarity':
                return SimilarityForestRegressor(model_parameters)
            else:
            elif model_parameters.extraction_strategy == 'kmeans':
                return KMeansForestRegressor(model_parameters)
            elif model_parameters.extraction_strategy == 'none':
                return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
                    random_state=model_parameters.seed)
            else:
                raise ValueError('Invalid extraction strategy')
        elif task == Task.MULTICLASSIFICATION:
            if model_parameters.extraction_strategy == 'omp':
                return OmpForestMulticlassClassifier(model_parameters)
            elif model_parameters.extraction_strategy == 'random':
                return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                    random_state=model_parameters.seed)
            else:
            elif model_parameters.extraction_strategy == 'none':
                return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                    random_state=model_parameters.seed)
            else:
                raise ValueError('Invalid extraction strategy')
Original line number Diff line number Diff line
@@ -6,12 +6,12 @@ import datetime

class ModelRawResults(object):

    def __init__(self, model_object, training_time,
    def __init__(self, model_weights, training_time,
        datetime, train_score, dev_score, test_score,
        train_score_base, dev_score_base,
        test_score_base, score_metric, base_score_metric):

        self._model_object = model_object
        self._model_weights = model_weights
        self._training_time = training_time
        self._datetime = datetime
        self._train_score = train_score
@@ -24,8 +24,8 @@ class ModelRawResults(object):
        self._base_score_metric = base_score_metric

    @property
    def model_object(self):
        return self.model_object
    def model_weights(self):
        """Return the model weights stored on the instance."""
        # Read the private backing attribute: `self.model_weights` would
        # re-enter this accessor and recurse until RecursionError.
        return self._model_weights

    @property
    def training_time(self):
@@ -68,6 +68,8 @@ class ModelRawResults(object):
        return self._base_score_metric

    def save(self, models_dir):
        """
        Write the attributes of this object to `models_dir`.

        The instance ``__dict__`` is handed to `save_obj_to_pickle` under the
        file name 'model_raw_results.pickle'.

        :param models_dir: output directory; created (with any missing parent)
            when it does not exist yet
        """
        # exist_ok avoids the check-then-create race and supports nested paths.
        os.makedirs(models_dir, exist_ok=True)
        save_obj_to_pickle(os.path.join(models_dir, 'model_raw_results.pickle'),
            self.__dict__)

Original line number Diff line number Diff line
@@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator


class OmpForest(BaseEstimator, metaclass=ABCMeta):

    def __init__(self, models_parameters, base_forest_estimator):
        self._base_forest_estimator = base_forest_estimator
        self._models_parameters = models_parameters
@@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
        return self._base_forest_estimator.score(X, y)

    def _base_estimator_predictions(self, X):
        # We need to use predict_proba to get the probabilities of each class
        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T

    @property
@@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):

    # sklearn baseestimator api methods
    def fit(self, X_forest, y_forest, X_omp, y_omp):
        """
        Fit the base forest, then extract the sub-forest.

        :param X_forest: samples used to fit the base forest
        :param y_forest: targets used to fit the base forest
        :param X_omp: samples passed to `_extract_subforest`
        :param y_omp: targets passed to `_extract_subforest`
        :return: self
        """
        self._base_forest_estimator.fit(X_forest, y_forest)
        self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit
        return self
@@ -96,6 +98,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
        pass

class SingleOmpForest(OmpForest):

    def __init__(self, models_parameters, base_forest_estimator):
        # fit_intercept shouldn't be set to False as the data isn't necessarily centered here
        # normalization is handled outside OMP
@@ -123,3 +126,24 @@ class SingleOmpForest(OmpForest):
            forest_predictions /= self._forest_norms

        return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)

    def predict_no_weights(self, X):
        """
        Predict X with the trees selected by OMP, ignoring the OMP weights.

        Every tree whose OMP coefficient is non-zero contributes equally: the
        result is the plain average of the predictions of those trees.

        :param X: samples forwarded to the base trees
        :return: a np.array holding the unweighted mean prediction of the selected trees
        """
        forest_predictions = self._base_estimator_predictions(X).T

        if self._models_parameters.normalize_D:
            # NOTE(review): predictions are transposed before this division,
            # unlike in `predict`; the shape of `_forest_norms` is not visible
            # here - confirm that the division still scales tree by tree.
            forest_predictions /= self._forest_norms

        # Indices of the trees kept by OMP (non-zero coefficient).
        omp_trees_indices = np.nonzero(self._omp.coef_)[0]

        return np.mean(forest_predictions[omp_trees_indices], axis=0)
Original line number Diff line number Diff line
@@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest):

        return super().fit(X_forest, y_forest, X_omp, y_omp)

    def predict_no_weights(self, X):
        """
        Predict X with the trees selected by OMP, ignoring the OMP weights.

        The class-1 probabilities of the trees having a non-zero OMP
        coefficient are averaged, then rescaled from [0, 1] to [-1, 1].

        :param X: samples forwarded to ``predict_proba`` of every base tree
        :return: a np.array with one rescaled score per sample
        """
        per_tree_proba = np.array([
            estimator.predict_proba(X)
            for estimator in self._base_forest_estimator.estimators_
        ])

        if self._models_parameters.normalize_D:
            per_tree_proba /= self._forest_norms

        # Keep only the trees picked by OMP, then their probability of class 1.
        selected = np.nonzero(self._omp.coef_)
        positive_proba = per_tree_proba[selected].T[1]

        mean_positive_proba = np.mean(positive_proba, axis=1)
        return (mean_positive_proba - 0.5) * 2

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        """
        Evaluate OMPForestClassifer on (`X`, `y`) using `metric`
@@ -106,6 +134,36 @@ class OmpForestMulticlassClassifier(OmpForest):
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def predict_no_weights(self, X):
        """
        Predict the class of X with the trees selected by OMP, ignoring the weights.

        For every class, the class probabilities given by the trees whose OMP
        coefficient is non-zero are rescaled to [-1, 1] and averaged; the class
        with the highest average is returned for each sample.

        :param X: samples forwarded to ``predict_proba`` of every base tree
        :return: a np.array with the predicted class label of each sample
        """

        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T

        if self._models_parameters.normalize_D:
            forest_predictions /= self._forest_norms

        label_names = []
        preds = []
        # NOTE(review): assumes `_dct_class_omp` iterates in the same order as
        # the class columns returned by `predict_proba` - confirm.
        for num_class, (class_label, omp_class) in enumerate(self._dct_class_omp.items()):
            # np.nonzero returns a tuple of index arrays; take the array itself
            # so that its length is the number of selected trees.
            omp_trees_indices = np.nonzero(omp_class.coef_)[0]
            label_names.append(class_label)
            # Rescale the probabilities from the 0/1 range to the -1/1 range.
            atoms_binary = (forest_predictions[num_class].T - 0.5) * 2
            # Average over the selected trees; a class without any selected
            # tree keeps a neutral score of 0 instead of dividing by zero.
            n_selected = max(len(omp_trees_indices), 1)
            preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0) / n_selected)

        preds = np.array(preds).T
        max_preds = np.argmax(preds, axis=1)
        return np.array(label_names)[max_preds]

    def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
        predictions = self.predict(X)