213 files + 2440 − 6257 Inline Compare changes Side-by-side Inline Show whitespace changes Files 213 Some changes are not shown. For a faster browsing experience, only 100 of 213 files are shown. Download one of the files below to see all changes. code/bolsonaro/data/dataset_loader.py +25 −4 Original line number Diff line number Diff line from bolsonaro.data.dataset import Dataset from bolsonaro.data.dataset_parameters import DatasetParameters from bolsonaro.data.task import Task from bolsonaro.utils import change_binary_func_load from bolsonaro.utils import change_binary_func_load, change_binary_func_openml from sklearn.datasets import load_boston, load_iris, load_diabetes, \ load_digits, load_linnerud, load_wine, load_breast_cancer from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \ fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing, \ fetch_openml from sklearn.model_selection import train_test_split from sklearn import preprocessing import random Loading @@ -30,13 +31,15 @@ class DatasetLoader(object): dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine', 'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people', 'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds'] 'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds', 'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'musk', 'gamma'] dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5, 'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15, '20newsgroups_vectorized':3, 'lfw_people':3, 'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3, 'diamonds': 15} 'diamonds': 15, 'steel-plates': 15, 'kr-vs-kp': 15, 'kin8nm': 15, 'spambase': 15, 'musk': 15, 'gamma': 15} @staticmethod def load(dataset_parameters): Loading Loading @@ -103,6 +106,24 @@ 
class DatasetLoader(object): df['clarity'] = label_clarity.fit_transform(df['clarity']) X, y = df.drop(['price'], axis=1), df['price'] task = Task.REGRESSION elif name == 'steel-plates': dataset_loading_func = change_binary_func_openml('steel-plates-fault') task = Task.BINARYCLASSIFICATION elif name == 'kr-vs-kp': dataset_loading_func = change_binary_func_openml('kr-vs-kp') task = Task.BINARYCLASSIFICATION elif name == 'kin8nm': X, y = fetch_openml('kin8nm', return_X_y=True) task = Task.REGRESSION elif name == 'spambase': dataset_loading_func = change_binary_func_openml('spambase') task = Task.BINARYCLASSIFICATION elif name == 'musk': dataset_loading_func = change_binary_func_openml('musk') task = Task.BINARYCLASSIFICATION elif name == 'gamma': dataset_loading_func = change_binary_func_openml('MagicTelescope') task = Task.BINARYCLASSIFICATION else: raise ValueError("Unsupported dataset '{}'".format(name)) Loading code/bolsonaro/models/kmeans_forest_regressor.py 0 → 100644 +78 −0 Original line number Diff line number Diff line from bolsonaro.utils import tqdm_joblib from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error from sklearn.base import BaseEstimator from sklearn.cluster import KMeans from abc import abstractmethod, ABCMeta import numpy as np from scipy.stats import mode from joblib import Parallel, delayed from tqdm import tqdm class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): """ On extreme pruning of random forest ensembles for ral-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan. 
""" def __init__(self, models_parameters, score_metric=mean_squared_error): self._models_parameters = models_parameters self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters, random_state=self._models_parameters.seed, n_jobs=-1) self._extracted_forest_size = self._models_parameters.extracted_forest_size self._score_metric = score_metric @property def models_parameters(self): return self._models_parameters def fit(self, X_train, y_train, X_val, y_val): self._estimator.fit(X_train, y_train) predictions = list() for tree in self._estimator.estimators_: predictions.append(tree.predict(X_train)) predictions = np.array(predictions) kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions) labels = np.array(kmeans.labels_) # For each cluster select the best tree on the validation set extracted_forest_sizes = list(range(self._extracted_forest_size)) with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb: pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb, extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric) for i in range(self._extracted_forest_size)) self._estimator.estimators_ = pruned_forest def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric): index = np.where(labels == c)[0] with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb: cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val, y_val, score_metric) for i in range(len(index))) best_tree_index = np.argmax(cluster) prune_forest_job_pb.update() return self._estimator.estimators_[index[best_tree_index]] def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric): y_val_pred = self._estimator.estimators_[i].predict(X_val) tree_pred = score_metric(y_val, y_val_pred) cluster_job_pb.update() return tree_pred def predict(self, X): return 
self._estimator.predict(X) def score(self, X, y): predictions = list() for tree in self._estimator.estimators_: predictions.append(tree.predict(X)) predictions = np.array(predictions) mean_predictions = np.mean(predictions, axis=0) score = self._score_metric(mean_predictions, y) return score def predict_base_estimator(self, X): return self._estimator.predict(X) code/bolsonaro/models/model_factory.py +12 −3 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor from bolsonaro.data.task import Task from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier Loading @@ -22,9 +23,11 @@ class ModelFactory(object): elif model_parameters.extraction_strategy == 'random': return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, random_state=model_parameters.seed) else: elif model_parameters.extraction_strategy == 'none': return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') elif task == Task.REGRESSION: if model_parameters.extraction_strategy == 'omp': return OmpForestRegressor(model_parameters) Loading @@ -33,15 +36,21 @@ class ModelFactory(object): random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'similarity': return SimilarityForestRegressor(model_parameters) else: elif model_parameters.extraction_strategy == 'kmeans': return KMeansForestRegressor(model_parameters) elif model_parameters.extraction_strategy == 'none': return 
RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') elif task == Task.MULTICLASSIFICATION: if model_parameters.extraction_strategy == 'omp': return OmpForestMulticlassClassifier(model_parameters) elif model_parameters.extraction_strategy == 'random': return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, random_state=model_parameters.seed) else: elif model_parameters.extraction_strategy == 'none': return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') code/bolsonaro/models/model_raw_results.py +8 −6 Original line number Diff line number Diff line Loading @@ -6,12 +6,12 @@ import datetime class ModelRawResults(object): def __init__(self, model_object, training_time, def __init__(self, model_weights, training_time, datetime, train_score, dev_score, test_score, train_score_base, dev_score_base, test_score_base, score_metric, base_score_metric): self._model_object = model_object self._model_weights = model_weights self._training_time = training_time self._datetime = datetime self._train_score = train_score Loading @@ -24,8 +24,8 @@ class ModelRawResults(object): self._base_score_metric = base_score_metric @property def model_object(self): return self.model_object def model_weights(self): return self.model_weights @property def training_time(self): Loading Loading @@ -68,6 +68,8 @@ class ModelRawResults(object): return self._base_score_metric def save(self, models_dir): if not os.path.exists(models_dir): os.mkdir(models_dir) save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle', self.__dict__) Loading code/bolsonaro/models/omp_forest.py +25 −1 Original line number Diff line number Diff line Loading @@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator class OmpForest(BaseEstimator, 
metaclass=ABCMeta): def __init__(self, models_parameters, base_forest_estimator): self._base_forest_estimator = base_forest_estimator self._models_parameters = models_parameters Loading @@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): return self._base_forest_estimator.score(X, y) def _base_estimator_predictions(self, X): # We need to use predict_proba to get the probabilities of each class return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T @property Loading @@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): # sklearn baseestimator api methods def fit(self, X_forest, y_forest, X_omp, y_omp): # print(y_forest.shape) # print(set([type(y) for y in y_forest])) self._base_forest_estimator.fit(X_forest, y_forest) self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit return self Loading Loading @@ -96,6 +98,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): pass class SingleOmpForest(OmpForest): def __init__(self, models_parameters, base_forest_estimator): # fit_intercept shouldn't be set to False as the data isn't necessarily centered here # normalization is handled outsite OMP Loading Loading @@ -123,3 +126,24 @@ class SingleOmpForest(OmpForest): forest_predictions /= self._forest_norms return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights) def predict_no_weights(self, X): """ Apply the SingleOmpForest to X without using the weights. 
Make all the base tree predictions :param X: a Forest :return: a np.array of the predictions of the entire forest """ forest_predictions = self._base_estimator_predictions(X).T if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms weights = self._omp.coef_ omp_trees_indices = np.nonzero(weights)[0] select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0) print(len(omp_trees_indices)) return select_trees code/bolsonaro/models/omp_forest_classifier.py +58 −0 Original line number Diff line number Diff line Loading @@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest): return super().fit(X_forest, y_forest, X_omp, y_omp) def predict_no_weights(self, X): """ Apply the SingleOmpForest to X without using the weights. Make all the base tree predictions :param X: a Forest :return: a np.array of the predictions of the entire forest """ forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]) if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms weights = self._omp.coef_ omp_trees_indices = np.nonzero(weights) omp_trees_predictions = forest_predictions[omp_trees_indices].T[1] # Here forest_pred is the probability of being class 1. result_omp = np.mean(omp_trees_predictions, axis=1) result_omp = (result_omp - 0.5) * 2 return result_omp def score(self, X, y, metric=DEFAULT_SCORE_METRIC): """ Evaluate OMPForestClassifer on (`X`, `y`) using `metric` Loading Loading @@ -106,6 +134,36 @@ class OmpForestMulticlassClassifier(OmpForest): max_preds = np.argmax(preds, axis=1) return np.array(label_names)[max_preds] def predict_no_weights(self, X): """ Apply the SingleOmpForest to X without using the weights. 
Make all the base tree predictions :param X: a Forest :return: a np.array of the predictions of the entire forest """ forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms label_names = [] preds = [] num_class = 0 for class_label, omp_class in self._dct_class_omp.items(): weights = omp_class.coef_ omp_trees_indices = np.nonzero(weights) label_names.append(class_label) atoms_binary = (forest_predictions[num_class].T - 0.5) * 2 # centré réduit de 0/1 à -1/1 preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0)/len(omp_trees_indices)) num_class += 1 preds = np.array(preds).T max_preds = np.argmax(preds, axis=1) return np.array(label_names)[max_preds] def score(self, X, y, metric=DEFAULT_SCORE_METRIC): predictions = self.predict(X) Loading
code/bolsonaro/data/dataset_loader.py +25 −4 Original line number Diff line number Diff line from bolsonaro.data.dataset import Dataset from bolsonaro.data.dataset_parameters import DatasetParameters from bolsonaro.data.task import Task from bolsonaro.utils import change_binary_func_load from bolsonaro.utils import change_binary_func_load, change_binary_func_openml from sklearn.datasets import load_boston, load_iris, load_diabetes, \ load_digits, load_linnerud, load_wine, load_breast_cancer from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \ fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing, \ fetch_openml from sklearn.model_selection import train_test_split from sklearn import preprocessing import random Loading @@ -30,13 +31,15 @@ class DatasetLoader(object): dataset_names = ['boston', 'iris', 'diabetes', 'digits', 'linnerud', 'wine', 'breast_cancer', 'olivetti_faces', '20newsgroups_vectorized', 'lfw_people', 'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds'] 'lfw_pairs', 'covtype', 'rcv1', 'california_housing', 'diamonds', 'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'musk', 'gamma'] dataset_seed_numbers = {'boston':15, 'iris':15, 'diabetes':15, 'digits':5, 'linnerud':15, 'wine':15, 'breast_cancer':15, 'olivetti_faces':15, '20newsgroups_vectorized':3, 'lfw_people':3, 'lfw_pairs':3, 'covtype':3, 'rcv1':3, 'california_housing':3, 'diamonds': 15} 'diamonds': 15, 'steel-plates': 15, 'kr-vs-kp': 15, 'kin8nm': 15, 'spambase': 15, 'musk': 15, 'gamma': 15} @staticmethod def load(dataset_parameters): Loading Loading @@ -103,6 +106,24 @@ class DatasetLoader(object): df['clarity'] = label_clarity.fit_transform(df['clarity']) X, y = df.drop(['price'], axis=1), df['price'] task = Task.REGRESSION elif name == 'steel-plates': dataset_loading_func = 
change_binary_func_openml('steel-plates-fault') task = Task.BINARYCLASSIFICATION elif name == 'kr-vs-kp': dataset_loading_func = change_binary_func_openml('kr-vs-kp') task = Task.BINARYCLASSIFICATION elif name == 'kin8nm': X, y = fetch_openml('kin8nm', return_X_y=True) task = Task.REGRESSION elif name == 'spambase': dataset_loading_func = change_binary_func_openml('spambase') task = Task.BINARYCLASSIFICATION elif name == 'musk': dataset_loading_func = change_binary_func_openml('musk') task = Task.BINARYCLASSIFICATION elif name == 'gamma': dataset_loading_func = change_binary_func_openml('MagicTelescope') task = Task.BINARYCLASSIFICATION else: raise ValueError("Unsupported dataset '{}'".format(name)) Loading
# code/bolsonaro/models/kmeans_forest_regressor.py (new file, mode 100644)
from bolsonaro.utils import tqdm_joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
from joblib import Parallel, delayed
from tqdm import tqdm


class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Random forest regressor pruned by clustering its trees with k-means.

    Based on 'On extreme pruning of random forest ensembles for real-time
    predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and
    Eyad Elyan.

    The forest is fitted on the training set, the trees are clustered
    according to their predictions on that same training set, and a single
    tree per cluster (the one with the lowest validation error) is kept.
    """

    def __init__(self, models_parameters, score_metric=mean_squared_error):
        """
        :param models_parameters: object exposing `hyperparameters` (keyword
            arguments forwarded to `RandomForestRegressor`), `seed` and
            `extracted_forest_size` (number of clusters, i.e. the maximum
            number of trees kept after pruning).
        :param score_metric: callable `(y_true, y_pred) -> number`. It is
            treated as a loss: the tree with the LOWEST value is considered
            the best one of its cluster, which matches the default
            `mean_squared_error`.
        """
        self._models_parameters = models_parameters
        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
            random_state=self._models_parameters.seed, n_jobs=-1)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        """The parameters object given at construction time."""
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val):
        """
        Fit the full forest on the training set, then prune it in place.

        :param X_train: training inputs, used to fit the forest and to
            compute the per-tree predictions that are clustered.
        :param y_train: training targets.
        :param X_val: validation inputs, used to rank trees inside a cluster.
        :param y_val: validation targets.
        :return: self
        """
        self._estimator.fit(X_train, y_train)

        # One row per tree: k-means therefore clusters the trees themselves,
        # using their training-set predictions as features.
        predictions = np.array([tree.predict(X_train) for tree in self._estimator.estimators_])

        kmeans = KMeans(n_clusters=self._extracted_forest_size,
            random_state=self._models_parameters.seed).fit(predictions)
        labels = np.array(kmeans.labels_)

        # For each cluster select the best tree on the validation set
        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
            selected_trees = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
                cluster_id, labels, X_val, y_val, self._score_metric)
                for cluster_id in range(self._extracted_forest_size))

        # A cluster may end up empty (e.g. several trees with identical
        # predictions); such clusters contribute no tree.
        pruned_forest = [tree for tree in selected_trees if tree is not None]

        self._estimator.estimators_ = pruned_forest
        return self

    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
        """
        Return the tree of cluster `c` with the lowest validation error.

        :param prune_forest_job_pb: progress handle, updated once per cluster.
        :param c: cluster id to process.
        :param labels: array giving the cluster id of every tree of the forest.
        :param X_val: validation inputs.
        :param y_val: validation targets.
        :param score_metric: loss `(y_true, y_pred) -> number`, lower is better.
        :return: the selected fitted tree, or None when no tree belongs to `c`.
        """
        index = np.where(labels == c)[0]
        if index.size == 0:
            # argmin/argmax raise ValueError on an empty sequence.
            prune_forest_job_pb.update()
            return None

        # NOTE(review): this starts a second Parallel pool from inside a
        # Parallel worker; kept as in the original design - confirm intended.
        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
            cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, tree_index, X_val,
                y_val, score_metric) for tree_index in index)

        # The metric is an error measure, so the best tree is the one that
        # minimises it (argmax would keep the worst tree of the cluster).
        best_tree_index = np.argmin(cluster)
        prune_forest_job_pb.update()
        return self._estimator.estimators_[index[best_tree_index]]

    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
        """
        Compute the validation error of the `i`-th tree of the forest.

        :param cluster_job_pb: progress handle, updated once per tree.
        :param i: index of the tree in `self._estimator.estimators_`.
        :return: `score_metric(y_val, prediction of tree i on X_val)`.
        """
        y_val_pred = self._estimator.estimators_[i].predict(X_val)
        tree_pred = score_metric(y_val, y_val_pred)
        cluster_job_pb.update()
        return tree_pred

    def predict(self, X):
        """Predict with the underlying forest (the pruned one once `fit` has run)."""
        return self._estimator.predict(X)

    def score(self, X, y):
        """
        Evaluate the mean prediction of the current trees with the score metric.

        :param X: inputs.
        :param y: true targets.
        :return: `score_metric(y, mean of the per-tree predictions)`.
        """
        predictions = np.array([tree.predict(X) for tree in self._estimator.estimators_])
        mean_predictions = np.mean(predictions, axis=0)
        # (y_true, y_pred) order, consistent with _cluster_job; the order
        # matters for asymmetric metrics.
        return self._score_metric(y, mean_predictions)

    def predict_base_estimator(self, X):
        """
        Predict with `self._estimator`.

        NOTE: `fit` replaces the trees of `self._estimator` in place, so after
        fitting this returns the same values as `predict`.
        """
        return self._estimator.predict(X)
code/bolsonaro/models/model_factory.py +12 −3 Original line number Diff line number Diff line Loading @@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor from bolsonaro.data.task import Task from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier Loading @@ -22,9 +23,11 @@ class ModelFactory(object): elif model_parameters.extraction_strategy == 'random': return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, random_state=model_parameters.seed) else: elif model_parameters.extraction_strategy == 'none': return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') elif task == Task.REGRESSION: if model_parameters.extraction_strategy == 'omp': return OmpForestRegressor(model_parameters) Loading @@ -33,15 +36,21 @@ class ModelFactory(object): random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'similarity': return SimilarityForestRegressor(model_parameters) else: elif model_parameters.extraction_strategy == 'kmeans': return KMeansForestRegressor(model_parameters) elif model_parameters.extraction_strategy == 'none': return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') elif task == Task.MULTICLASSIFICATION: if model_parameters.extraction_strategy == 'omp': return OmpForestMulticlassClassifier(model_parameters) elif model_parameters.extraction_strategy == 'random': return 
RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, random_state=model_parameters.seed) else: elif model_parameters.extraction_strategy == 'none': return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy')
code/bolsonaro/models/model_raw_results.py +8 −6 Original line number Diff line number Diff line Loading @@ -6,12 +6,12 @@ import datetime class ModelRawResults(object): def __init__(self, model_object, training_time, def __init__(self, model_weights, training_time, datetime, train_score, dev_score, test_score, train_score_base, dev_score_base, test_score_base, score_metric, base_score_metric): self._model_object = model_object self._model_weights = model_weights self._training_time = training_time self._datetime = datetime self._train_score = train_score Loading @@ -24,8 +24,8 @@ class ModelRawResults(object): self._base_score_metric = base_score_metric @property def model_object(self): return self.model_object def model_weights(self): return self.model_weights @property def training_time(self): Loading Loading @@ -68,6 +68,8 @@ class ModelRawResults(object): return self._base_score_metric def save(self, models_dir): if not os.path.exists(models_dir): os.mkdir(models_dir) save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle', self.__dict__) Loading
code/bolsonaro/models/omp_forest.py +25 −1 Original line number Diff line number Diff line Loading @@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator class OmpForest(BaseEstimator, metaclass=ABCMeta): def __init__(self, models_parameters, base_forest_estimator): self._base_forest_estimator = base_forest_estimator self._models_parameters = models_parameters Loading @@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): return self._base_forest_estimator.score(X, y) def _base_estimator_predictions(self, X): # We need to use predict_proba to get the probabilities of each class return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T @property Loading @@ -33,6 +33,8 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): # sklearn baseestimator api methods def fit(self, X_forest, y_forest, X_omp, y_omp): # print(y_forest.shape) # print(set([type(y) for y in y_forest])) self._base_forest_estimator.fit(X_forest, y_forest) self._extract_subforest(X_omp, y_omp) # type: OrthogonalMatchingPursuit return self Loading Loading @@ -96,6 +98,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): pass class SingleOmpForest(OmpForest): def __init__(self, models_parameters, base_forest_estimator): # fit_intercept shouldn't be set to False as the data isn't necessarily centered here # normalization is handled outsite OMP Loading Loading @@ -123,3 +126,24 @@ class SingleOmpForest(OmpForest): forest_predictions /= self._forest_norms return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights) def predict_no_weights(self, X): """ Apply the SingleOmpForest to X without using the weights. 
Make all the base tree predictions :param X: a Forest :return: a np.array of the predictions of the entire forest """ forest_predictions = self._base_estimator_predictions(X).T if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms weights = self._omp.coef_ omp_trees_indices = np.nonzero(weights)[0] select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0) print(len(omp_trees_indices)) return select_trees
code/bolsonaro/models/omp_forest_classifier.py +58 −0 Original line number Diff line number Diff line Loading @@ -24,6 +24,34 @@ class OmpForestBinaryClassifier(SingleOmpForest): return super().fit(X_forest, y_forest, X_omp, y_omp) def predict_no_weights(self, X): """ Apply the SingleOmpForest to X without using the weights. Make all the base tree predictions :param X: a Forest :return: a np.array of the predictions of the entire forest """ forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]) if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms weights = self._omp.coef_ omp_trees_indices = np.nonzero(weights) omp_trees_predictions = forest_predictions[omp_trees_indices].T[1] # Here forest_pred is the probability of being class 1. result_omp = np.mean(omp_trees_predictions, axis=1) result_omp = (result_omp - 0.5) * 2 return result_omp def score(self, X, y, metric=DEFAULT_SCORE_METRIC): """ Evaluate OMPForestClassifer on (`X`, `y`) using `metric` Loading Loading @@ -106,6 +134,36 @@ class OmpForestMulticlassClassifier(OmpForest): max_preds = np.argmax(preds, axis=1) return np.array(label_names)[max_preds] def predict_no_weights(self, X): """ Apply the SingleOmpForest to X without using the weights. 
Make all the base tree predictions :param X: a Forest :return: a np.array of the predictions of the entire forest """ forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms label_names = [] preds = [] num_class = 0 for class_label, omp_class in self._dct_class_omp.items(): weights = omp_class.coef_ omp_trees_indices = np.nonzero(weights) label_names.append(class_label) atoms_binary = (forest_predictions[num_class].T - 0.5) * 2 # centré réduit de 0/1 à -1/1 preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0)/len(omp_trees_indices)) num_class += 1 preds = np.array(preds).T max_preds = np.argmax(preds, axis=1) return np.array(label_names)[max_preds] def score(self, X, y, metric=DEFAULT_SCORE_METRIC): predictions = self.predict(X) Loading