Commit 2a24aacb authored by Charly Lamothe

Merge branch '15-integration-sota' into 'master'

Resolve "integration-sota"

Closes #15

See merge request !23
parents 4d4c0848 f41d9087
Showing 1785 additions and 336 deletions
 from bolsonaro.data.dataset import Dataset
 from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.task import Task
-from bolsonaro.utils import change_binary_func_load, change_binary_func_openml
+from bolsonaro.utils import change_binary_func_load, change_binary_func_openml, binarize_class_data
 from sklearn.datasets import load_boston, load_iris, load_diabetes, \
     load_digits, load_linnerud, load_wine, load_breast_cancer
@@ -81,7 +81,9 @@ class DatasetLoader(object):
         elif name == 'lfw_pairs':
             dataset = fetch_lfw_pairs()
             X, y = dataset.data, dataset.target
-            task = Task.MULTICLASSIFICATION
+            possible_classes = sorted(set(y))
+            y = binarize_class_data(y, possible_classes[-1])
+            task = Task.BINARYCLASSIFICATION
         elif name == 'covtype':
             X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
             task = Task.MULTICLASSIFICATION
...
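The lfw_pairs dataset is now loaded as a binary task. A minimal sketch of the kind of binarization this implies is given below; the real helper is bolsonaro.utils.binarize_class_data, and the assumption made here is that it maps the designated positive class to +1 and every other class to -1, matching the {-1, +1} convention asserted by OmpForestBinaryClassifier further down.

import numpy as np

# Hypothetical stand-in for bolsonaro.utils.binarize_class_data (assumed behaviour, illustration only).
def binarize_class_data_sketch(y, positive_class):
    y = np.asarray(y)
    return np.where(y == positive_class, 1, -1)

y = np.array([0, 1, 1, 0, 1])
print(binarize_class_data_sketch(y, positive_class=sorted(set(y))[-1]))  # [-1  1  1 -1  1]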
+import time
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 from sklearn.metrics import mean_squared_error
 from sklearn.base import BaseEstimator
 from sklearn.tree import DecisionTreeRegressor
@@ -5,91 +8,103 @@ from abc import abstractmethod, ABCMeta
 import numpy as np
 from tqdm import tqdm
+from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA
+from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator

-class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
+class EnsembleSelectionForest(ForestPruningSOTA, metaclass=ABCMeta):
     """
     'Ensemble selection from libraries of models' by Rich Caruana et al
     """
-    def __init__(self, models_parameters, library, score_metric=mean_squared_error):
-        self._models_parameters = models_parameters
-        self._library = library
-        self._extracted_forest_size = self._models_parameters.extracted_forest_size
-        self._score_metric = score_metric
-        self._selected_trees = list()
-
-    @property
-    def models_parameters(self):
-        return self._models_parameters
-
-    @property
-    def library(self):
-        return self._library
-
-    @property
-    def selected_trees(self):
-        return self._selected_trees
-
-    def fit(self, X_train, y_train, X_val, y_val):
-        scores_list = list()
-        for estimator in self._library:
-            val_score = self._score_metric(estimator.predict(X_val), y_val)
-            scores_list.append(val_score)
-
-        class_list = list(self._library)
-        m = np.argmax(np.asarray(scores_list))
-        self._selected_trees = [class_list[m]]
-        temp_pred = class_list[m].predict(X_val)
-        del class_list[m]
-        for k in range(self._extracted_forest_size - 1):
-            candidate_index = 0
-            best_score = 100000
-            for j in range(len(class_list)):
-                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
-                temp_mean = np.mean(temp_pred, axis=0)
-                temp_score = self._score_metric(temp_mean, y_val)
-                if (temp_score < best_score):
-                    candidate_index = j
-                    best_score = temp_score
-                temp_pred = np.delete(temp_pred, -1, 0)
-            self._selected_trees.append(class_list[candidate_index])
-            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
-            del class_list[candidate_index]
-
-    def score(self, X, y):
-        predictions = self.predict_base_estimator(X)
-        return self._score_metric(predictions, y)
-
-    def predict_base_estimator(self, X):
-        predictions = list()
-        for tree in self._selected_trees:
-            predictions.append(tree.predict(X))
-        mean_predictions = np.mean(np.array(predictions), axis=0)
-        return mean_predictions
-
-    @staticmethod
-    def generate_library(X_train, y_train, random_state=None):
-        criterion_arr = ["mse"]#, "friedman_mse", "mae"]
-        splitter_arr = ["best"]#, "random"]
-        depth_arr = [i for i in range(5, 20, 1)]
-        min_samples_split_arr = [i for i in range(2, 20, 1)]
-        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
-        max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]
-        library = list()
-        with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
-            len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
-            len(max_features_arr)) as bar:
-            bar.set_description('Generating library')
-            for criterion in criterion_arr:
-                for splitter in splitter_arr:
-                    for depth in depth_arr:
-                        for min_samples_split in min_samples_split_arr:
-                            for min_samples_leaf in min_samples_leaf_arr:
-                                for max_features in max_features_arr:
-                                    t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
-                                        min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)
-                                    t.fit(X_train, y_train)
-                                    library.append(t)
-                                    bar.update(1)
-        return library
+    def _fit(self, X_train, y_train, X_val, y_val):
+        self._base_estimator.fit(X_train, y_train)
+
+        val_predictions = self._base_estimator_predictions(X_val).T
+        scores_predictions_val = self._score_metric(val_predictions, y_val)
+        idx_best_score = self._best_score_idx(scores_predictions_val)
+        lst_pruned_forest = [self._base_estimator.estimators_[idx_best_score]]
+
+        nb_selected_trees = 1
+        mean_so_far = val_predictions[idx_best_score]
+        while nb_selected_trees < self._extracted_forest_size:
+            # every new tree is selected with replacement as specified in the base paper
+            # this matrix contains at each line the predictions of the previous subset + the corresponding tree of the line
+            # mean update formula: u_{t+1} = (n_t * u_t + x_t) / (n_t + 1)
+            mean_prediction_subset_with_extra_tree = (nb_selected_trees * mean_so_far + val_predictions) / (nb_selected_trees + 1)
+            predictions_subset_with_extra_tree = self._activation(mean_prediction_subset_with_extra_tree)
+            scores_subset_with_extra_tree = self._score_metric(predictions_subset_with_extra_tree, y_val)
+            idx_best_extra_tree = self._best_score_idx(scores_subset_with_extra_tree)
+            lst_pruned_forest.append(self._base_estimator.estimators_[idx_best_extra_tree])
+
+            # update new mean prediction
+            mean_so_far = mean_prediction_subset_with_extra_tree[idx_best_extra_tree]
+            nb_selected_trees += 1
+
+        return lst_pruned_forest
+
+    @abstractmethod
+    def _activation(self, leave_one_tree_out_predictions_val):
+        pass
+
+
+class EnsembleSelectionForestClassifier(EnsembleSelectionForest, metaclass=ABCMeta):
+    @staticmethod
+    def init_estimator(model_parameters):
+        return RandomForestClassifier(**model_parameters.hyperparameters,
+            random_state=model_parameters.seed, n_jobs=-1)
+
+    def _aggregate(self, predictions):
+        return aggregation_classification(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_indicator(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return np.sign(predictions)
+
+    def _selected_tree_predictions(self, X):
+        predictions_0_1 = super()._selected_tree_predictions(X)
+        predictions = (predictions_0_1 - 0.5) * 2
+        return predictions
+
+    def _base_estimator_predictions(self, X):
+        predictions_0_1 = super()._base_estimator_predictions(X)
+        predictions = (predictions_0_1 - 0.5) * 2
+        return predictions
+
+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmax(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmin(array)
+
+
+class EnsembleSelectionForestRegressor(EnsembleSelectionForest, metaclass=ABCMeta):
+    @staticmethod
+    def init_estimator(model_parameters):
+        return RandomForestRegressor(**model_parameters.hyperparameters,
+            random_state=model_parameters.seed, n_jobs=-1)
+
+    def _aggregate(self, predictions):
+        return aggregation_regression(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_mse(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return predictions
+
+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmin(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmax(array)
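The running-mean update in EnsembleSelectionForest._fit avoids re-averaging all previously selected trees at every step. A minimal sketch with made-up prediction vectors checks that the incremental formula u_{t+1} = (n_t * u_t + x_t) / (n_t + 1) matches the direct mean:

import numpy as np

selected = [np.array([1.0, 2.0, 3.0]), np.array([2.0, 2.0, 2.0])]  # predictions of already-selected trees (made up)
candidate = np.array([3.0, 1.0, 4.0])                              # predictions of one candidate tree (made up)

nb_selected_trees = len(selected)
mean_so_far = np.mean(selected, axis=0)

# incremental mean: u_{t+1} = (n_t * u_t + x_t) / (n_t + 1)
incremental_mean = (nb_selected_trees * mean_so_far + candidate) / (nb_selected_trees + 1)
direct_mean = np.mean(selected + [candidate], axis=0)

assert np.allclose(incremental_mean, direct_mean)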
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm

from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator


class ForestPruningSOTA(BaseEstimator, metaclass=ABCMeta):

    def __init__(self, models_parameters):
        self._models_parameters = models_parameters
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._selected_trees = list()
        self._base_estimator = self.init_estimator(models_parameters)

    @staticmethod
    @abstractmethod
    def init_estimator(model_parameters):
        pass

    @abstractmethod
    def _fit(self, X_train, y_train, X_val, y_val):
        pass

    @property
    def models_parameters(self):
        return self._models_parameters

    @property
    def selected_trees(self):
        return self._selected_trees

    def fit(self, X_train, y_train, X_val, y_val):
        pruned_forest = self._fit(X_train, y_train, X_val, y_val)
        assert len(pruned_forest) == self._extracted_forest_size, "Pruned forest size isn't the size of expected forest: {} != {}".format(len(pruned_forest), self._extracted_forest_size)
        self._selected_trees = pruned_forest

    def _base_estimator_predictions(self, X):
        base_predictions = np.array([tree.predict(X) for tree in self._base_estimator.estimators_]).T
        return base_predictions

    def _selected_tree_predictions(self, X):
        base_predictions = np.array([tree.predict(X) for tree in self.selected_trees]).T
        return base_predictions

    def predict(self, X):
        predictions = self._selected_tree_predictions(X).T
        final_predictions = self._aggregate(predictions)
        return final_predictions

    def predict_base_estimator(self, X):
        return self._base_estimator.predict(X)

    def score(self, X, y):
        final_predictions = self.predict(X)
        score = self._score_metric(final_predictions, y)[0]
        return score

    @staticmethod
    @abstractmethod
    def _best_score_idx(array):
        """
        return index of best element in array
        :param array:
        :return:
        """
        pass

    @staticmethod
    @abstractmethod
    def _worse_score_idx(array):
        """
        return index of worse element in array
        :param array:
        :return:
        """
        pass

    @abstractmethod
    def _score_metric(self, y_preds, y_true):
        """
        get score of each predictors in y_preds
        y_preds.shape == (nb_trees, nb_sample)
        y_true.shape == (1, nb_sample)
        :param y_preds:
        :param y_true:
        :return:
        """
        pass

    @abstractmethod
    def _aggregate(self, predictions):
        """
        Aggregates votes of predictors in predictions
        predictions shape: (nb_trees, nb_samples)
        :param predictions:
        :return:
        """
        pass
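ForestPruningSOTA is a template: fit() delegates to _fit() and asserts that the returned list holds exactly extracted_forest_size trees, while predict()/score() combine _selected_tree_predictions, _aggregate and _score_metric. A minimal sketch of a concrete subclass follows; DummyTopKForestRegressor is hypothetical, only illustrates which hooks a subclass must provide, and assumes the module layout implied by the imports in this merge request.

import numpy as np
from sklearn.ensemble import RandomForestRegressor

from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA
from bolsonaro.models.utils import score_metric_mse, aggregation_regression


class DummyTopKForestRegressor(ForestPruningSOTA):
    """Illustrative subclass: keep the extracted_forest_size trees with the lowest validation MSE."""

    @staticmethod
    def init_estimator(model_parameters):
        return RandomForestRegressor(**model_parameters.hyperparameters,
                                     random_state=model_parameters.seed, n_jobs=-1)

    def _fit(self, X_train, y_train, X_val, y_val):
        self._base_estimator.fit(X_train, y_train)
        val_predictions = self._base_estimator_predictions(X_val).T  # shape (nb_trees, nb_samples)
        scores = self._score_metric(val_predictions, y_val)          # one MSE per tree
        keep = np.argsort(scores)[:self._extracted_forest_size]
        return [self._base_estimator.estimators_[i] for i in keep]

    def _score_metric(self, y_preds, y_true):
        return score_metric_mse(y_preds, y_true)

    def _aggregate(self, predictions):
        return aggregation_regression(predictions)

    @staticmethod
    def _best_score_idx(array):
        return np.argmin(array)

    @staticmethod
    def _worse_score_idx(array):
        return np.argmax(array)

Instantiating it still requires the project's ModelParameters object (hyperparameters, seed, extracted_forest_size), as for the real subclasses.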
+import time
+from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA
+from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression
 from bolsonaro.utils import tqdm_joblib
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 from sklearn.metrics import mean_squared_error
 from sklearn.base import BaseEstimator
 from sklearn.cluster import KMeans
@@ -11,74 +15,83 @@ from joblib import Parallel, delayed
 from tqdm import tqdm

-class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
+class KmeansForest(ForestPruningSOTA, metaclass=ABCMeta):
     """
     'On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
     """
-    def __init__(self, models_parameters, score_metric=mean_squared_error):
-        self._models_parameters = models_parameters
-        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
-            random_state=self._models_parameters.seed, n_jobs=2)
-        self._extracted_forest_size = self._models_parameters.extracted_forest_size
-        self._score_metric = score_metric
-        self._selected_trees = list()
-
-    @property
-    def models_parameters(self):
-        return self._models_parameters
-
-    @property
-    def selected_trees(self):
-        return self._selected_trees
-
-    def fit(self, X_train, y_train, X_val, y_val):
-        self._estimator.fit(X_train, y_train)
-
-        predictions = list()
-        for tree in self._estimator.estimators_:
-            predictions.append(tree.predict(X_train))
-        predictions = np.array(predictions)
-
-        kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
-        labels = np.array(kmeans.labels_)
-
-        # For each cluster select the best tree on the validation set
-        extracted_forest_sizes = list(range(self._extracted_forest_size))
-        with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
-            pruned_forest = Parallel(n_jobs=2)(delayed(self._prune_forest_job)(prune_forest_job_pb,
-                extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric)
-                for i in range(self._extracted_forest_size))
-
-        self._selected_trees = pruned_forest
-        self._estimator.estimators_ = pruned_forest
-
-    def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
-        index = np.where(labels == c)[0]
-        with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
-            cluster = Parallel(n_jobs=2)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val,
-                y_val, score_metric) for i in range(len(index)))
-        best_tree_index = np.argmax(cluster)
-        prune_forest_job_pb.update()
-        return self._estimator.estimators_[index[best_tree_index]]
-
-    def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
-        y_val_pred = self._estimator.estimators_[i].predict(X_val)
-        tree_pred = score_metric(y_val, y_val_pred)
-        cluster_job_pb.update()
-        return tree_pred
-
-    def predict(self, X):
-        return self._estimator.predict(X)
-
-    def score(self, X, y):
-        predictions = list()
-        for tree in self._estimator.estimators_:
-            predictions.append(tree.predict(X))
-        predictions = np.array(predictions)
-        mean_predictions = np.mean(predictions, axis=0)
-        score = self._score_metric(mean_predictions, y)
-        return score
-
-    def predict_base_estimator(self, X):
-        return self._estimator.predict(X)
+    def _fit(self, X_train, y_train, X_val, y_val):
+        self._base_estimator.fit(X_train, y_train)
+
+        predictions_val = self._base_estimator_predictions(X_val).T
+        predictions = self._base_estimator_predictions(X_train).T
+
+        kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
+        labels = np.array(kmeans.labels_)
+
+        # start_np_version = time.time()
+        lst_pruned_forest = list()
+        for cluster_idx in range(self._extracted_forest_size):  # could be parallelized
+            index_trees_cluster = np.where(labels == cluster_idx)[0]
+            predictions_val_cluster = predictions_val[index_trees_cluster]  # get predictions of trees in cluster
+            best_tree_index = self._get_best_tree_index(predictions_val_cluster, y_val)
+            lst_pruned_forest.append(self._base_estimator.estimators_[index_trees_cluster[best_tree_index]])
+
+        return lst_pruned_forest
+
+    def _get_best_tree_index(self, y_preds, y_true):
+        score = self._score_metric(y_preds, y_true)
+        best_tree_index = self._best_score_idx(score)  # get best scoring tree (e.g. the one with the lowest MSE)
+        return best_tree_index
+
+
+class KMeansForestRegressor(KmeansForest, metaclass=ABCMeta):
+    @staticmethod
+    def init_estimator(model_parameters):
+        return RandomForestRegressor(**model_parameters.hyperparameters,
+            random_state=model_parameters.seed, n_jobs=-1)
+
+    def _aggregate(self, predictions):
+        return aggregation_regression(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_mse(y_preds, y_true)
+
+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmin(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmax(array)
+
+
+class KMeansForestClassifier(KmeansForest, metaclass=ABCMeta):
+    @staticmethod
+    def init_estimator(model_parameters):
+        return RandomForestClassifier(**model_parameters.hyperparameters,
+            random_state=model_parameters.seed, n_jobs=-1)
+
+    def _aggregate(self, predictions):
+        return aggregation_classification(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_indicator(y_preds, y_true)
+
+    def _selected_tree_predictions(self, X):
+        predictions_0_1 = super()._selected_tree_predictions(X)
+        predictions = (predictions_0_1 - 0.5) * 2
+        return predictions
+
+    def _base_estimator_predictions(self, X):
+        predictions_0_1 = super()._base_estimator_predictions(X)
+        predictions = (predictions_0_1 - 0.5) * 2
+        return predictions
+
+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmax(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmin(array)
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
-from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
-from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
-from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor, EnsembleSelectionForestClassifier
 from bolsonaro.data.task import Task
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -14,12 +14,12 @@ import pickle
 class ModelFactory(object):

     @staticmethod
-    def build(task, model_parameters, library=None):
+    def build(task, model_parameters):
         if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
             raise ValueError("Unsupported task '{}'".format(task))
         if task == Task.BINARYCLASSIFICATION:
-            if model_parameters.extraction_strategy == 'omp':
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                 return OmpForestBinaryClassifier(model_parameters)
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestClassifier(**model_parameters.hyperparameters,
@@ -27,27 +27,33 @@ class ModelFactory(object):
             elif model_parameters.extraction_strategy == 'none':
                 return RandomForestClassifier(**model_parameters.hyperparameters,
                     random_state=model_parameters.seed)
+            elif model_parameters.extraction_strategy == 'ensemble':
+                return EnsembleSelectionForestClassifier(model_parameters)
+            elif model_parameters.extraction_strategy == 'kmeans':
+                return KMeansForestClassifier(model_parameters)
+            elif model_parameters.extraction_strategy in ['similarity_similarities', 'similarity_predictions']:
+                return SimilarityForestClassifier(model_parameters)
             else:
                 raise ValueError('Invalid extraction strategy')
         elif task == Task.REGRESSION:
-            if model_parameters.extraction_strategy == 'omp':
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                 return OmpForestRegressor(model_parameters)
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestRegressor(**model_parameters.hyperparameters,
                     random_state=model_parameters.seed)
-            elif model_parameters.extraction_strategy == 'similarity':
+            elif model_parameters.extraction_strategy in ['similarity_similarities', 'similarity_predictions']:
                 return SimilarityForestRegressor(model_parameters)
             elif model_parameters.extraction_strategy == 'kmeans':
                 return KMeansForestRegressor(model_parameters)
             elif model_parameters.extraction_strategy == 'ensemble':
-                return EnsembleSelectionForestRegressor(model_parameters, library=library)
+                return EnsembleSelectionForestRegressor(model_parameters)
             elif model_parameters.extraction_strategy == 'none':
                 return RandomForestRegressor(**model_parameters.hyperparameters,
                     random_state=model_parameters.seed)
             else:
                 raise ValueError('Invalid extraction strategy')
         elif task == Task.MULTICLASSIFICATION:
-            if model_parameters.extraction_strategy == 'omp':
+            if model_parameters.extraction_strategy in ['omp', 'omp_distillation']:
                 return OmpForestMulticlassClassifier(model_parameters)
             elif model_parameters.extraction_strategy == 'random':
                 return RandomForestClassifier(**model_parameters.hyperparameters,
...
@@ -9,7 +9,11 @@ class ModelRawResults(object):
     def __init__(self, model_weights, training_time,
         datetime, train_score, dev_score, test_score,
         train_score_base, dev_score_base,
-        test_score_base, score_metric, base_score_metric):
+        test_score_base, score_metric, base_score_metric,
+        #coherence='', correlation=''):
+        train_coherence='', dev_coherence='', test_coherence='',
+        train_correlation='', dev_correlation='', test_correlation='',
+        train_strength='', dev_strength='', test_strength=''):
         self._model_weights = model_weights
         self._training_time = training_time
@@ -22,6 +26,17 @@ class ModelRawResults(object):
         self._test_score_base = test_score_base
         self._score_metric = score_metric
         self._base_score_metric = base_score_metric
+        """self._coherence = coherence
+        self._correlation = correlation"""
+        self._train_coherence = train_coherence
+        self._dev_coherence = dev_coherence
+        self._test_coherence = test_coherence
+        self._train_correlation = train_correlation
+        self._dev_correlation = dev_correlation
+        self._test_correlation = test_correlation
+        self._train_strength = train_strength
+        self._dev_strength = dev_strength
+        self._test_strength = test_strength

     @property
     def model_weights(self):
@@ -67,6 +82,50 @@ class ModelRawResults(object):
     def base_score_metric(self):
         return self._base_score_metric

+    """@property
+    def coherence(self):
+        return self._coherence
+
+    @property
+    def correlation(self):
+        return self._correlation"""
+
+    @property
+    def train_coherence(self):
+        return self._train_coherence
+
+    @property
+    def dev_coherence(self):
+        return self._dev_coherence
+
+    @property
+    def test_coherence(self):
+        return self._test_coherence
+
+    @property
+    def train_correlation(self):
+        return self._train_correlation
+
+    @property
+    def dev_correlation(self):
+        return self._dev_correlation
+
+    @property
+    def test_correlation(self):
+        return self._test_correlation
+
+    @property
+    def train_strength(self):
+        return self._train_strength
+
+    @property
+    def dev_strength(self):
+        return self._dev_strength
+
+    @property
+    def test_strength(self):
+        return self._test_strength

     def save(self, models_dir):
         if not os.path.exists(models_dir):
             os.mkdir(models_dir)
...
@@ -28,18 +28,20 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
         return self._base_forest_estimator.score(X, y)

     def _base_estimator_predictions(self, X):
-        return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
+        base_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
+        return base_predictions

     @property
     def forest(self):
         return self._base_forest_estimator.estimators_

     # sklearn baseestimator api methods
-    def fit(self, X_forest, y_forest, X_omp, y_omp):
+    def fit(self, X_forest, y_forest, X_omp, y_omp, use_distillation=False):
         # print(y_forest.shape)
         # print(set([type(y) for y in y_forest]))
         self._base_forest_estimator.fit(X_forest, y_forest)
-        self._extract_subforest(X_omp, y_omp)  # type: OrthogonalMatchingPursuit
+        self._extract_subforest(X_omp,
+            self.predict_base_estimator(X_omp) if use_distillation else y_omp)  # type: OrthogonalMatchingPursuit
         return self

     def _extract_subforest(self, X, y):
@@ -151,11 +153,6 @@ class SingleOmpForest(OmpForest):
         """
         forest_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_])

-        if self._models_parameters.normalize_D:
-            forest_predictions = forest_predictions.T
-            forest_predictions /= self._forest_norms
-            forest_predictions = forest_predictions.T
-
         weights = self._omp.coef_
         select_trees = np.mean(forest_predictions[weights != 0], axis=0)
         return select_trees
@@ -19,11 +19,16 @@ class OmpForestBinaryClassifier(SingleOmpForest):
     def _check_classes(self, y):
         assert len(set(y).difference({-1, 1})) == 0, "Classes for binary classifier must be {-1, +1}"

-    def fit(self, X_forest, y_forest, X_omp, y_omp):
+    def fit(self, X_forest, y_forest, X_omp, y_omp, use_distillation=False):
         self._check_classes(y_forest)
         self._check_classes(y_omp)
-        return super().fit(X_forest, y_forest, X_omp, y_omp)
+        return super().fit(X_forest, y_forest, X_omp, y_omp, use_distillation=use_distillation)
+
+    def _base_estimator_predictions(self, X):
+        predictions_0_1 = super()._base_estimator_predictions(X)
+        predictions = (predictions_0_1 - 0.5) * 2
+        return predictions

     def predict_no_weights(self, X):
         """
@@ -35,22 +40,15 @@ class OmpForestBinaryClassifier(SingleOmpForest):
         :return: a np.array of the predictions of the entire forest
         """
-        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_])
-
-        if self._models_parameters.normalize_D:
-            forest_predictions = forest_predictions.T
-            forest_predictions /= self._forest_norms
-            forest_predictions = forest_predictions.T
+        forest_predictions = self._base_estimator_predictions(X)

         weights = self._omp.coef_
-        omp_trees_predictions = forest_predictions[weights != 0].T[1]
+        omp_trees_predictions = forest_predictions[:, weights != 0]

         # Here forest_pred is the probability of being class 1.
         result_omp = np.mean(omp_trees_predictions, axis=1)
-        result_omp = (result_omp - 0.5) * 2

         return result_omp

     def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
...
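A small illustration of the two pieces changed above: rescaling {0, 1} tree outputs to {-1, +1} with (p - 0.5) * 2, and keeping only the prediction columns whose OMP weight is non-zero; the prediction matrix and weights are made up.

import numpy as np

predictions_0_1 = np.array([[1.0, 0.0, 1.0],
                            [0.0, 0.0, 1.0]])      # shape (n_samples, n_trees), made-up values
forest_predictions = (predictions_0_1 - 0.5) * 2   # now in {-1, +1}

weights = np.array([0.7, 0.0, -0.2])               # made-up OMP coefficients, one per tree
omp_trees_predictions = forest_predictions[:, weights != 0]

result_omp = np.mean(omp_trees_predictions, axis=1)  # unweighted vote of the selected trees
print(result_omp)  # [1. 0.]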
+import time
-from sklearn.ensemble import RandomForestRegressor
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 from sklearn.metrics import mean_squared_error
 from sklearn.base import BaseEstimator
 from abc import abstractmethod, ABCMeta
 import numpy as np
 from tqdm import tqdm
+from bolsonaro.models.forest_pruning_sota import ForestPruningSOTA
+from bolsonaro.models.utils import score_metric_mse, aggregation_regression, aggregation_classification, score_metric_indicator

-class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
+class SimilarityForest(ForestPruningSOTA, metaclass=ABCMeta):
     """
     https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
     """
-    def __init__(self, models_parameters, score_metric=mean_squared_error):
-        self._models_parameters = models_parameters
-        self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
-            random_state=self._models_parameters.seed, n_jobs=-1)
-        self._extracted_forest_size = self._models_parameters.extracted_forest_size
-        self._score_metric = score_metric
-        self._selected_trees = list()
-
-    @property
-    def models_parameters(self):
-        return self._models_parameters
-
-    @property
-    def selected_trees(self):
-        return self._selected_trees
-
-    def fit(self, X_train, y_train, X_val, y_val):
-        self._estimator.fit(X_train, y_train)
-
-        y_val_pred = self._estimator.predict(X_val)
-        forest_pred = self._score_metric(y_val, y_val_pred)
-        forest = self._estimator.estimators_
-        tree_list = list(self._estimator.estimators_)
-
-        val_scores = list()
-        with tqdm(tree_list) as tree_pred_bar:
-            tree_pred_bar.set_description('[Initial tree predictions]')
-            for tree in tree_pred_bar:
-                val_scores.append(tree.predict(X_val))
-                tree_pred_bar.update(1)
-
-        with tqdm(range(self._extracted_forest_size), disable=True) as pruning_forest_bar:
-            pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]')
-            for i in pruning_forest_bar:
-                best_similarity = 100000
-                found_index = 0
-                with tqdm(range(len(tree_list)), disable=True) as tree_list_bar:
-                    tree_list_bar.set_description(f'[Tree selection s={self._extracted_forest_size} #{i}]')
-                    for j in tree_list_bar:
-                        lonely_tree = tree_list[j]
-                        del tree_list[j]
-                        val_mean = np.mean(np.asarray(val_scores), axis=0)
-                        val_score = self._score_metric(val_mean, y_val)
-                        temp_similarity = abs(forest_pred - val_score)
-                        if (temp_similarity < best_similarity):
-                            found_index = j
-                            best_similarity = temp_similarity
-                        tree_list.insert(j, lonely_tree)
-                        val_scores.insert(j, lonely_tree.predict(X_val))
-                        tree_list_bar.update(1)
-                self._selected_trees.append(tree_list[found_index])
-                del tree_list[found_index]
-                del val_scores[found_index]
-                pruning_forest_bar.update(1)
-        self._selected_trees = set(self._selected_trees)
-        pruned_forest = list(set(forest) - self._selected_trees)
-        self._estimator.estimators_ = pruned_forest
-
-    def score(self, X, y):
-        test_list = list()
-        for mod in self._estimator.estimators_:
-            test_pred = mod.predict(X)
-            test_list.append(test_pred)
-        test_list = np.array(test_list)
-        test_mean = np.mean(test_list, axis=0)
-        score = self._score_metric(test_mean, y)
-        return score
-
-    def predict_base_estimator(self, X):
-        return self._estimator.predict(X)
+    similarity_similarities = "similarity_similarities"
+    similarity_predictions = "similarity_predictions"
+
+    def _fit(self, X_train, y_train, X_val, y_val):
+        self._base_estimator.fit(X_train, y_train)
+        param = self._models_parameters.extraction_strategy
+
+        # get score of base forest on val
+        tree_list = list(self._base_estimator.estimators_)
+        trees_to_remove = list()
+
+        # get score of each single tree of forest on val
+        val_predictions = self._base_estimator_predictions(X_val).T
+
+        # boolean mask of trees to take into account for next evaluation of trees importance
+        mask_trees_to_consider = np.ones(val_predictions.shape[0], dtype=bool)
+        # the technique does backward selection, that is: trees are removed one after another
+        nb_tree_to_remove = len(tree_list) - self._extracted_forest_size
+        with tqdm(range(nb_tree_to_remove), disable=True) as pruning_forest_bar:
+            pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]')
+            for _ in pruning_forest_bar:  # for each tree to remove
+                # get indexes of trees to take into account
+                idx_trees_to_consider = np.arange(val_predictions.shape[0])[mask_trees_to_consider]
+                val_predictions_to_consider = val_predictions[idx_trees_to_consider]
+                nb_trees_to_consider = val_predictions_to_consider.shape[0]
+
+                if param == self.similarity_predictions:
+                    # this matrix has zero on the diag and 1/(L-1) everywhere else.
+                    # When left-multiplying the matrix of predictions (having L lines) by this zero_diag_matrix (square L), the result has on each
+                    # line the average of all other lines in the initial matrix of predictions
+                    zero_diag_matrix = np.ones((nb_trees_to_consider, nb_trees_to_consider)) * (1 / (nb_trees_to_consider - 1))
+                    np.fill_diagonal(zero_diag_matrix, 0)
+
+                    leave_one_tree_out_predictions_val = zero_diag_matrix @ val_predictions_to_consider
+                    leave_one_tree_out_predictions_val = self._activation(leave_one_tree_out_predictions_val)  # identity for regression; sign for classification
+                    leave_one_tree_out_scores_val = self._score_metric(leave_one_tree_out_predictions_val, y_val)
+                    # difference with base forest is actually useless
+                    # delta_score = forest_score - leave_one_tree_out_scores_val
+
+                    # get index of tree to remove
+                    index_worse_tree = int(self._worse_score_idx(leave_one_tree_out_scores_val))
+
+                elif param == self.similarity_similarities:
+                    correlation_matrix = val_predictions_to_consider @ val_predictions_to_consider.T
+                    average_correlation_by_tree = np.average(correlation_matrix, axis=1)
+
+                    # get index of tree to remove
+                    index_worse_tree = int(np.argmax(average_correlation_by_tree))  # correlation and MSE: both greater is worse
+                else:
+                    raise ValueError("Unknown similarity method {}. Should be {} or {}".format(param, self.similarity_similarities, self.similarity_predictions))
+
+                index_worse_tree_in_base_forest = idx_trees_to_consider[index_worse_tree]
+                trees_to_remove.append(tree_list[index_worse_tree_in_base_forest])
+                mask_trees_to_consider[index_worse_tree_in_base_forest] = False
+                pruning_forest_bar.update(1)
+
+        pruned_forest = list(set(tree_list) - set(trees_to_remove))
+        return pruned_forest
+
+    @abstractmethod
+    def _activation(self, leave_one_tree_out_predictions_val):
+        pass
+
+
+class SimilarityForestRegressor(SimilarityForest, metaclass=ABCMeta):
+    @staticmethod
+    def init_estimator(model_parameters):
+        return RandomForestRegressor(**model_parameters.hyperparameters,
+            random_state=model_parameters.seed, n_jobs=-1)
+
+    def _aggregate(self, predictions):
+        return aggregation_regression(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_mse(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return predictions
+
+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmin(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmax(array)
+
+
+class SimilarityForestClassifier(SimilarityForest, metaclass=ABCMeta):
+    @staticmethod
+    def init_estimator(model_parameters):
+        return RandomForestClassifier(**model_parameters.hyperparameters,
+            random_state=model_parameters.seed, n_jobs=-1)
+
+    def _aggregate(self, predictions):
+        return aggregation_classification(predictions)
+
+    def _score_metric(self, y_preds, y_true):
+        return score_metric_indicator(y_preds, y_true)
+
+    def _activation(self, predictions):
+        return np.sign(predictions)
+
+    def _selected_tree_predictions(self, X):
+        predictions_0_1 = super()._selected_tree_predictions(X)
+        predictions = (predictions_0_1 - 0.5) * 2
+        return predictions
+
+    def _base_estimator_predictions(self, X):
+        predictions_0_1 = super()._base_estimator_predictions(X)
+        predictions = (predictions_0_1 - 0.5) * 2
+        return predictions
+
+    @staticmethod
+    def _best_score_idx(array):
+        return np.argmax(array)
+
+    @staticmethod
+    def _worse_score_idx(array):
+        return np.argmin(array)
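A quick check of the zero-diagonal trick used in SimilarityForest._fit: left-multiplying the (L, n_samples) prediction matrix by a square matrix with 0 on the diagonal and 1/(L-1) elsewhere yields, on each row, the mean prediction of all the other trees. The prediction matrix here is made up.

import numpy as np

P = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0]])          # L = 3 trees, 2 validation samples (made up)
L = P.shape[0]

zero_diag_matrix = np.ones((L, L)) / (L - 1)
np.fill_diagonal(zero_diag_matrix, 0)

leave_one_out = zero_diag_matrix @ P
for i in range(L):
    expected = np.mean(np.delete(P, i, axis=0), axis=0)  # mean of all rows except row i
    assert np.allclose(leave_one_out[i], expected)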
import numpy as np


def score_metric_mse(y_preds, y_true):
    if len(y_true.shape) == 1:
        y_true = y_true[np.newaxis, :]
    if len(y_preds.shape) == 1:
        y_preds = y_preds[np.newaxis, :]
    assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"

    diff = y_preds - y_true
    squared_diff = diff ** 2
    mean_squared_diff = np.mean(squared_diff, axis=1)
    return mean_squared_diff


def score_metric_indicator(y_preds, y_true):
    if len(y_true.shape) == 1:
        y_true = y_true[np.newaxis, :]
    if len(y_preds.shape) == 1:
        y_preds = y_preds[np.newaxis, :]
    assert y_preds.shape[1] == y_true.shape[1], "Number of examples to compare should be the same in y_preds and y_true"

    bool_arr_correct_predictions = y_preds == y_true
    return np.average(bool_arr_correct_predictions, axis=1)


def aggregation_classification(predictions):
    return np.sign(np.sum(predictions, axis=0))


def aggregation_regression(predictions):
    return np.mean(predictions, axis=0)
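A usage sketch for these helpers, illustrating the (nb_trees, nb_samples) convention; the prediction matrices are made up, and the import assumes the module path used elsewhere in this merge request.

import numpy as np
from bolsonaro.models.utils import score_metric_mse, score_metric_indicator, aggregation_classification, aggregation_regression

# classification convention: labels in {-1, +1}, one row per tree
y_true = np.array([1, -1, 1, 1])
y_preds = np.array([[1, -1, 1, -1],
                    [1, -1, 1, 1],
                    [1, 1, 1, 1]])
print(score_metric_indicator(y_preds, y_true))  # per-tree accuracy: [0.75 1.   0.75]
print(aggregation_classification(y_preds))      # sign of the summed votes: [ 1 -1  1  1]

# regression: one row per tree, aggregation is the mean prediction
y_true_reg = np.array([0.0, 1.0, 2.0])
y_preds_reg = np.array([[0.0, 1.0, 2.0],
                        [1.0, 1.0, 1.0]])
print(score_metric_mse(y_preds_reg, y_true_reg))  # per-tree MSE: [0.         0.66666667]
print(aggregation_regression(y_preds_reg))        # [0.5 1.  1.5]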
 from bolsonaro.models.model_raw_results import ModelRawResults
 from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
-from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
-from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
-from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
+from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor, SimilarityForestClassifier
+from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor, KMeansForestClassifier
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor, EnsembleSelectionForestClassifier
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.data.task import Task
 from . import LOG_PATH
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
 from sklearn.metrics import mean_squared_error, accuracy_score
+from sklearn.preprocessing import normalize
 import time
 import datetime
 import numpy as np
@@ -38,7 +39,6 @@ class Trainer(object):
             else classification_score_metric.__name__
         self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
             else base_classification_score_metric.__name__
-        self._selected_trees = ''

     @property
     def score_metric_name(self):
@@ -77,7 +77,7 @@ class Trainer(object):
         else:
             raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))

-    def train(self, model, extracted_forest_size=None):
+    def train(self, model, extracted_forest_size=None, seed=None, use_distillation=False):
         """
         :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
             OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
@@ -88,6 +88,7 @@ class Trainer(object):
         if type(model) in [RandomForestRegressor, RandomForestClassifier]:
             if extracted_forest_size is not None:
                 estimators_index = np.arange(len(model.estimators_))
+                np.random.seed(seed)
                 np.random.shuffle(estimators_index)
                 choosen_estimators = estimators_index[:extracted_forest_size]
                 model.estimators_ = np.array(model.estimators_)[choosen_estimators]
@@ -96,14 +97,23 @@ class Trainer(object):
                     X=self._X_forest,
                     y=self._y_forest
                 )
-            self._selected_trees = model.estimators_
         else:
-            model.fit(
-                self._X_forest,
-                self._y_forest,
-                self._X_omp,
-                self._y_omp
-            )
+            if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier] and \
+                use_distillation:
+                model.fit(
+                    self._X_forest,  # X_train or X_train+X_dev
+                    self._y_forest,
+                    self._X_omp,  # X_train+X_dev or X_dev
+                    self._y_omp,
+                    use_distillation=use_distillation
+                )
+            else:
+                model.fit(
+                    self._X_forest,  # X_train or X_train+X_dev
+                    self._y_forest,
+                    self._X_omp,  # X_train+X_dev or X_dev
+                    self._y_omp
+                )
         self._end_time = time.time()

     def __score_func(self, model, X, y_true, weights=True):
@@ -122,7 +132,8 @@ class Trainer(object):
                 y_pred = np.sign(y_pred)
                 y_pred = np.where(y_pred == 0, 1, y_pred)
             result = self._classification_score_metric(y_true, y_pred)
-        elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
+        elif type(model) in [SimilarityForestRegressor, SimilarityForestClassifier, KMeansForestRegressor, EnsembleSelectionForestRegressor, KMeansForestClassifier,
+            EnsembleSelectionForestClassifier]:
             result = model.score(X, y_true)
         return result
@@ -130,7 +141,7 @@ class Trainer(object):
         if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_regression_score_metric(y_true, y_pred)
-        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
+        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, KMeansForestClassifier, SimilarityForestClassifier, EnsembleSelectionForestClassifier]:
             y_pred = model.predict_base_estimator(X)
             result = self._base_classification_score_metric(y_true, y_pred)
         elif type(model) == RandomForestClassifier:
@@ -141,7 +152,17 @@ class Trainer(object):
             result = self._base_regression_score_metric(y_true, y_pred)
         return result

-    def compute_results(self, model, models_dir):
+    def _evaluate_predictions(self, X, aggregation_function, selected_trees):
+        predictions = np.array([tree.predict(X) for tree in selected_trees])
+        predictions = normalize(predictions)
+        return aggregation_function(np.abs((predictions @ predictions.T - np.eye(len(predictions)))))
+
+    def _compute_forest_strength(self, X, y, metric_function, selected_trees):
+        return np.mean([metric_function(y, tree.predict(X)) for tree in selected_trees])
+
+    def compute_results(self, model, models_dir, subsets_used='train+dev,train+dev'):
         """
         :param model: Object with
         :param models_dir: Where the results will be saved
@@ -155,25 +176,72 @@ class Trainer(object):
         elif type(model) == OmpForestBinaryClassifier:
             model_weights = model._omp

-        if type(model) in [SimilarityForestRegressor, EnsembleSelectionForestRegressor, KMeansForestRegressor]:
-            self._selected_trees = model.selected_trees
+        if type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor,
+            SimilarityForestClassifier, KMeansForestClassifier, EnsembleSelectionForestClassifier]:
+            selected_trees = model.selected_trees
+        elif type(model) in [OmpForestRegressor, OmpForestMulticlassClassifier, OmpForestBinaryClassifier]:
+            selected_trees = np.asarray(model.forest)[model._omp.coef_ != 0]
+        elif type(model) in [RandomForestRegressor, RandomForestClassifier]:
+            selected_trees = model.estimators_

-        if len(self._selected_trees) > 0:
+        if len(selected_trees) > 0:
+            target_selected_tree = int(os.path.split(models_dir)[-1])
+            if target_selected_tree != len(selected_trees):
+                raise ValueError(f'Invalid selected tree number target_selected_tree:{target_selected_tree} - len(selected_trees):{len(selected_trees)}')
             with open(os.path.join(models_dir, 'selected_trees.pickle'), 'wb') as output_file:
-                pickle.dump(self._selected_trees, output_file)
+                pickle.dump(selected_trees, output_file)
+
+        strength_metric = self._regression_score_metric if self._dataset.task == Task.REGRESSION else self._classification_score_metric
+
+        # Reeeally dirty to put that here but otherwise it's not thread safe...
+        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
+            if subsets_used == 'train,dev':
+                X_forest = self._dataset.X_train
+                y_forest = self._dataset.y_train
+            else:
+                X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+                y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+            X_omp = self._dataset.X_dev
+            y_omp = self._dataset.y_dev
+        elif model.models_parameters.subsets_used == 'train,dev':
+            X_forest = self._dataset.X_train
+            y_forest = self._dataset.y_train
+            X_omp = self._dataset.X_dev
+            y_omp = self._dataset.y_dev
+        elif model.models_parameters.subsets_used == 'train+dev,train+dev':
+            X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+            X_omp = X_forest
+            y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+            y_omp = y_forest
+        elif model.models_parameters.subsets_used == 'train,train+dev':
+            X_forest = self._dataset.X_train
+            y_forest = self._dataset.y_train
+            X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+            y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+        else:
+            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))

         results = ModelRawResults(
             model_weights=model_weights,
             training_time=self._end_time - self._begin_time,
             datetime=datetime.datetime.now(),
-            train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
-            dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev),
+            train_score=self.__score_func(model, X_forest, y_forest),
+            dev_score=self.__score_func(model, X_omp, y_omp),
             test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test),
-            train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
-            dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
+            train_score_base=self.__score_func_base(model, X_forest, y_forest),
+            dev_score_base=self.__score_func_base(model, X_omp, y_omp),
             test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
             score_metric=self._score_metric_name,
-            base_score_metric=self._base_score_metric_name
+            base_score_metric=self._base_score_metric_name,
+            train_coherence=self._evaluate_predictions(X_forest, aggregation_function=np.max, selected_trees=selected_trees),
+            dev_coherence=self._evaluate_predictions(X_omp, aggregation_function=np.max, selected_trees=selected_trees),
+            test_coherence=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.max, selected_trees=selected_trees),
+            train_correlation=self._evaluate_predictions(X_forest, aggregation_function=np.mean, selected_trees=selected_trees),
+            dev_correlation=self._evaluate_predictions(X_omp, aggregation_function=np.mean, selected_trees=selected_trees),
+            test_correlation=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.mean, selected_trees=selected_trees),
+            train_strength=self._compute_forest_strength(X_forest, y_forest, strength_metric, selected_trees),
+            dev_strength=self._compute_forest_strength(X_omp, y_omp, strength_metric, selected_trees),
+            test_strength=self._compute_forest_strength(self._dataset.X_test, self._dataset.y_test, strength_metric, selected_trees)
         )
         results.save(models_dir)
         self._logger.info("Base performance on test: {}".format(results.test_score_base))
@@ -185,26 +253,30 @@ class Trainer(object):
         self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
         self._logger.info("Performance on dev: {}".format(results.dev_score))

+        self._logger.info(f'test_coherence: {results.test_coherence}')
+        self._logger.info(f'test_correlation: {results.test_correlation}')
+        self._logger.info(f'test_strength: {results.test_strength}')
+
         if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
             results = ModelRawResults(
                 model_weights='',
                 training_time=self._end_time - self._begin_time,
                 datetime=datetime.datetime.now(),
-                train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False),
-                dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False),
+                train_score=self.__score_func(model, X_forest, y_forest, False),
+                dev_score=self.__score_func(model, X_omp, y_omp, False),
                 test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
-                train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
-                dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
+                train_score_base=self.__score_func_base(model, X_forest, y_forest),
+                dev_score_base=self.__score_func_base(model, X_omp, y_omp),
                 test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
                 score_metric=self._score_metric_name,
                 base_score_metric=self._base_score_metric_name
             )
             results.save(models_dir+'_no_weights')
             self._logger.info("Base performance on test without weights: {}".format(results.test_score_base))
-            self._logger.info("Performance on test: {}".format(results.test_score))
+            self._logger.info("Performance on test without weights: {}".format(results.test_score))

             self._logger.info("Base performance on train without weights: {}".format(results.train_score_base))
-            self._logger.info("Performance on train: {}".format(results.train_score))
+            self._logger.info("Performance on train without weights: {}".format(results.train_score))

             self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base))
-            self._logger.info("Performance on dev: {}".format(results.dev_score))
+            self._logger.info("Performance on dev without weights: {}".format(results.dev_score))
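A self-contained sketch of the coherence / correlation diagnostics added to compute_results: each selected tree's prediction vector is l2-normalized, the Gram matrix is computed, the identity is subtracted, and the absolute values are aggregated with np.max (coherence) or np.mean (correlation). The prediction matrix below is made up.

import numpy as np
from sklearn.preprocessing import normalize

predictions = np.array([[1.0, 2.0, 3.0],
                        [1.0, 2.1, 2.9],
                        [-1.0, 0.5, 0.0]])   # one row per selected tree (made up)

predictions = normalize(predictions)          # l2-normalize each tree's prediction vector
gram_minus_identity = np.abs(predictions @ predictions.T - np.eye(len(predictions)))

coherence = np.max(gram_minus_identity)       # aggregation_function=np.max in the Trainer
correlation = np.mean(gram_minus_identity)    # aggregation_function=np.mean in the Trainer
print(coherence, correlation)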
...@@ -51,6 +51,7 @@ class Plotter(object): ...@@ -51,6 +51,7 @@ class Plotter(object):
@staticmethod @staticmethod
def plot_mean_and_CI(ax, mean, lb, ub, x_value, color_mean=None, facecolor=None, label=None): def plot_mean_and_CI(ax, mean, lb, ub, x_value, color_mean=None, facecolor=None, label=None):
#print(x_value, mean, lb, ub)
# plot the shaded range of the confidence intervals # plot the shaded range of the confidence intervals
ax.fill_between(x_value, ub, lb, facecolor=facecolor, alpha=.5) ax.fill_between(x_value, ub, lb, facecolor=facecolor, alpha=.5)
# plot the mean on top # plot the mean on top
...@@ -105,7 +106,7 @@ class Plotter(object): ...@@ -105,7 +106,7 @@ class Plotter(object):
@staticmethod @staticmethod
def plot_stage2_losses(file_path, all_experiment_scores, x_value, def plot_stage2_losses(file_path, all_experiment_scores, x_value,
xlabel, ylabel, all_labels, title): xlabel, ylabel, all_labels, title, filter_num=-1):
fig, ax = plt.subplots() fig, ax = plt.subplots()
...@@ -124,13 +125,14 @@ class Plotter(object): ...@@ -124,13 +125,14 @@ class Plotter(object):
# Compute the mean and the std for the CI # Compute the mean and the std for the CI
mean_experiment_scores = np.average(experiment_scores, axis=0) mean_experiment_scores = np.average(experiment_scores, axis=0)
std_experiment_scores = np.std(experiment_scores, axis=0) std_experiment_scores = np.std(experiment_scores, axis=0)
# Plot the score curve with the CI # Plot the score curve with the CI
Plotter.plot_mean_and_CI( Plotter.plot_mean_and_CI(
ax=ax, ax=ax,
mean=mean_experiment_scores, mean=mean_experiment_scores,
lb=mean_experiment_scores + std_experiment_scores, lb=mean_experiment_scores + std_experiment_scores,
ub=mean_experiment_scores - std_experiment_scores, ub=mean_experiment_scores - std_experiment_scores,
x_value=x_value, x_value=x_value[:filter_num] if len(mean_experiment_scores) == filter_num else x_value,
color_mean=colors[i], color_mean=colors[i],
facecolor=colors[i], facecolor=colors[i],
label=all_labels[i] label=all_labels[i]
......
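For readers of the Plotter hunk above: plot_mean_and_CI draws a mean curve on top of a shaded ±std band. A self-contained matplotlib sketch of the same pattern (toy data and output filename chosen here; not the project's Plotter API):

import numpy as np
import matplotlib.pyplot as plt

x = np.arange(10)
scores = np.random.RandomState(0).rand(5, 10)   # 5 runs, 10 forest sizes (toy data)
mean, std = scores.mean(axis=0), scores.std(axis=0)

fig, ax = plt.subplots()
ax.fill_between(x, mean - std, mean + std, facecolor='tab:blue', alpha=.5)  # shaded confidence band
ax.plot(x, mean, color='tab:blue', label='mean score')                      # mean curve on top
ax.legend()
fig.savefig('mean_and_ci_sketch.png')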
import pathlib
import glob2
import os
import shutil
from tqdm import tqdm
if __name__ == "__main__":
models_source_path = 'models'
models_destination_path = 'bolsonaro_models_25-03-20'
#datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
# 'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
datasets = ['kin8nm']
pathlib.Path(models_destination_path).mkdir(parents=True, exist_ok=True)
with tqdm(datasets) as dataset_bar:
for dataset in dataset_bar:
dataset_bar.set_description(dataset)
found_paths = glob2.glob(os.path.join(models_source_path, dataset, 'stage5_new',
'**', 'model_raw_results.pickle'), recursive=True)
pathlib.Path(os.path.join(models_destination_path, dataset)).mkdir(parents=True, exist_ok=True)
with tqdm(found_paths) as found_paths_bar:
for path in found_paths_bar:
found_paths_bar.set_description(path)
new_path = path.replace(f'models/{dataset}/stage5_new/', '')
(new_path, filename) = os.path.split(new_path)
new_path = os.path.join(models_destination_path, dataset, new_path)
pathlib.Path(new_path).mkdir(parents=True, exist_ok=True)
shutil.copyfile(src=path, dst=os.path.join(new_path, filename))
found_paths_bar.update(1)
dataset_bar.update(1)
...@@ -55,11 +55,6 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -55,11 +55,6 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
trainer = Trainer(dataset) trainer = Trainer(dataset)
if parameters['extraction_strategy'] == 'ensemble':
library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed)
else:
library = None
if parameters['extraction_strategy'] == 'random': if parameters['extraction_strategy'] == 'random':
pretrained_model_parameters = ModelParameters( pretrained_model_parameters = ModelParameters(
extracted_forest_size=parameters['forest_size'], extracted_forest_size=parameters['forest_size'],
...@@ -70,12 +65,12 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -70,12 +65,12 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
hyperparameters=hyperparameters, hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy'] extraction_strategy=parameters['extraction_strategy']
) )
pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library) pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters)
pretraned_trainer = Trainer(dataset) pretrained_trainer = Trainer(dataset)
pretraned_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used']) pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
pretrained_estimator.fit( pretrained_estimator.fit(
X=pretraned_trainer._X_forest, X=pretrained_trainer._X_forest,
y=pretraned_trainer._y_forest y=pretrained_trainer._y_forest
) )
else: else:
pretrained_estimator = None pretrained_estimator = None
...@@ -84,8 +79,9 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -84,8 +79,9 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
if parameters['extraction_strategy'] != 'none': if parameters['extraction_strategy'] != 'none':
with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb: with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i], Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library, models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer,
pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters) pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters,
use_distillation=parameters['extraction_strategy'] == 'omp_distillation')
for i in range(len(parameters['extracted_forest_size']))) for i in range(len(parameters['extracted_forest_size'])))
else: else:
forest_size = hyperparameters['n_estimators'] forest_size = hyperparameters['n_estimators']
...@@ -97,11 +93,11 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -97,11 +93,11 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
if os.path.isdir(sub_models_dir): if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir) sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files: for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]: if file_name == 'model_raw_results.pickle':
continue
else:
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0 already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break break
else:
continue
if already_exists: if already_exists:
logger.info('Base forest result already exists. Skipping...') logger.info('Base forest result already exists. Skipping...')
else: else:
...@@ -117,7 +113,7 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -117,7 +113,7 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters, library=library) model = ModelFactory.build(dataset.task, model_parameters)
trainer.init(model, subsets_used=parameters['subsets_used']) trainer.init(model, subsets_used=parameters['subsets_used'])
trainer.train(model) trainer.train(model)
...@@ -126,8 +122,8 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -126,8 +122,8 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
seed_job_pb.update(1) seed_job_pb.update(1)
def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir, def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
seed, parameters, dataset, hyperparameters, experiment_id, trainer, library, seed, parameters, dataset, hyperparameters, experiment_id, trainer,
pretrained_estimator=None, pretrained_model_parameters=None): pretrained_estimator=None, pretrained_model_parameters=None, use_distillation=False):
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format( logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
seed, extracted_forest_size, threading.get_ident())) seed, extracted_forest_size, threading.get_ident()))
...@@ -140,11 +136,11 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz ...@@ -140,11 +136,11 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
if os.path.isdir(sub_models_dir): if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir) sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files: for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]: if file_name == 'model_raw_results.pickle':
continue
else:
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0 already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break break
else:
continue
if already_exists: if already_exists:
logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...') logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
return return
...@@ -162,13 +158,14 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz ...@@ -162,13 +158,14 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
extraction_strategy=parameters['extraction_strategy'] extraction_strategy=parameters['extraction_strategy']
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters, library=library) model = ModelFactory.build(dataset.task, model_parameters)
else: else:
model = copy.deepcopy(pretrained_estimator) model = copy.deepcopy(pretrained_estimator)
pretrained_model_parameters.save(sub_models_dir, experiment_id) pretrained_model_parameters.save(sub_models_dir, experiment_id)
trainer.init(model, subsets_used=parameters['subsets_used']) trainer.init(model, subsets_used=parameters['subsets_used'])
trainer.train(model, extracted_forest_size=extracted_forest_size) trainer.train(model, extracted_forest_size=extracted_forest_size, seed=seed,
use_distillation=use_distillation)
trainer.compute_results(model, sub_models_dir) trainer.compute_results(model, sub_models_dir)
""" """
...@@ -235,7 +232,7 @@ if __name__ == "__main__": ...@@ -235,7 +232,7 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans, ensemble.') parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity_similarities, similarity_predictions, kmeans, ensemble.')
parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id') parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
args = parser.parse_args() args = parser.parse_args()
...@@ -246,8 +243,8 @@ if __name__ == "__main__": ...@@ -246,8 +243,8 @@ if __name__ == "__main__":
else: else:
parameters = args.__dict__ parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans', 'ensemble']: if parameters['extraction_strategy'] not in ['omp', 'omp_distillation', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy)) raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
......
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio
lst_skip_strategy = ["None", "OMP Distillation", "OMP Distillation w/o weights"]
# lst_skip_subset = ["train/dev"]
lst_task_train_dev = ["coherence", "correlation"]
tasks = [
# "train_score",
# "dev_score",
# "test_score",
"coherence",
"correlation",
# "negative-percentage"
]
dct_score_metric_fancy = {
"accuracy_score": "% Accuracy",
"mean_squared_error": "MSE"
}
pio.templates.default = "plotly_white"
dct_color_by_strategy = {
"OMP": (255, 0, 0), # red
"OMP Distillation": (255, 0, 0), # red
"OMP Distillation w/o weights": (255, 128, 0), # orange
"OMP w/o weights": (255, 128, 0), # orange
"Random": (0, 0, 0), # black
"Zhang Similarities": (255, 255, 0), # jaune
'Zhang Predictions': (128, 0, 128), # turquoise
'Ensemble': (0, 0, 255), # blue
"Kmeans": (0, 255, 0) # red
}
dct_dash_by_strategy = {
"OMP": None,
"OMP Distillation": "dash",
"OMP Distillation w/o weights": "dash",
"OMP w/o weights": None,
"Random": "dot",
"Zhang Similarities": "dash",
'Zhang Predictions': "dash",
'Ensemble': "dash",
"Kmeans": "dash"
}
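# Note: add_trace_from_df below also reads the module-level names `strat`, `task` and `tpl_transparency` set elsewhere in this script.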
def add_trace_from_df(df, fig):
df.sort_values(by="forest_size", inplace=True)
df_groupby_forest_size = df.groupby(['forest_size'])
forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)
mean_value = df_groupby_forest_size[task].mean().values
std_value = df_groupby_forest_size[task].std().values
std_value_upper = list(mean_value + std_value)
std_value_lower = list(mean_value - std_value)
# print(df_strat)
fig.add_trace(go.Scatter(x=forest_sizes, y=mean_value,
mode='lines',
name=strat,
line=dict(dash=dct_dash_by_strategy[strat], color="rgb{}".format(dct_color_by_strategy[strat]))
))
fig.add_trace(go.Scatter(
x=forest_sizes + forest_sizes[::-1],
y=std_value_upper + std_value_lower[::-1],
fill='toself',
showlegend=False,
fillcolor='rgba{}'.format(dct_color_by_strategy[strat] + tpl_transparency),
line_color='rgba(255,255,255,0)',
name=strat
))
tpl_transparency = (0.1,)
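The second trace in add_trace_from_df uses a common Plotly pattern: the upper and lower band edges are concatenated (x forward, then x reversed) into one closed polygon and filled with fill='toself'. A standalone sketch of that pattern with toy numbers, assuming only plotly is installed:

import plotly.graph_objects as go

forest_sizes = [10, 20, 30, 40]
mean = [0.80, 0.85, 0.88, 0.90]
upper = [0.83, 0.88, 0.91, 0.93]   # mean + std (toy values)
lower = [0.77, 0.82, 0.85, 0.87]   # mean - std (toy values)

fig = go.Figure()
fig.add_trace(go.Scatter(x=forest_sizes, y=mean, mode='lines', name='mean'))
fig.add_trace(go.Scatter(
    x=forest_sizes + forest_sizes[::-1],   # along the upper edge, then back along the lower edge
    y=upper + lower[::-1],
    fill='toself',
    fillcolor='rgba(255, 0, 0, 0.1)',
    line_color='rgba(255, 255, 255, 0)',
    showlegend=False,
    name='± std'
))
fig.show()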
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
dir_name = "bolsonaro_models_25-03-20"
dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
input_dir_file = dir_path / "results.csv"
df_results = pd.read_csv(open(input_dir_file, 'rb'))
datasets = set(df_results["dataset"].values)
strategies = set(df_results["strategy"].values)
subsets = set(df_results["subset"].values)
for task in tasks:
for data_name in datasets:
df_data = df_results[df_results["dataset"] == data_name]
score_metric_name = df_data["score_metric"].values[0]
fig = go.Figure()
##################
# all techniques #
##################
for strat in strategies:
if strat in lst_skip_strategy:
continue
df_strat = df_data[df_data["strategy"] == strat]
df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
if "OMP" in strat:
###########################
# processing with weights #
###########################
df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
if data_name == "Boston":
df_strat_wo_weights = df_strat_wo_weights[df_strat_wo_weights["forest_size"] < 400]
add_trace_from_df(df_strat_wo_weights, fig)
##################################
# general processing w/o weights #
##################################
if "OMP" in strat:
df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
else:
df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
if "OMP" in strat:
strat = "{} w/o weights".format(strat)
add_trace_from_df(df_strat_wo_weights, fig)
title = "{} {}".format(task, data_name)
yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
fig.update_layout(barmode='group',
title=title,
xaxis_title="# Selected Trees",
yaxis_title=yaxis_title,
font=dict(
# family="Courier New, monospace",
size=24,
color="black"
),
showlegend = False,
margin = dict(
l=1,
r=1,
b=1,
t=1,
# pad=4
),
legend=dict(
traceorder="normal",
font=dict(
family="sans-serif",
size=24,
color="black"
),
# bgcolor="LightSteelBlue",
# bordercolor="Black",
borderwidth=1,
)
)
# fig.show()
sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
filename = sanitize(title)
output_dir = out_dir / sanitize(task)
output_dir.mkdir(parents=True, exist_ok=True)
fig.write_image(str((output_dir / filename).absolute()) + ".png")
# exit()
import copy
from dotenv import load_dotenv, find_dotenv
from pathlib import Path
import os
import pandas as pd
import numpy as np
from pprint import pprint
import plotly.graph_objects as go
import plotly.io as pio
from collections import defaultdict
lst_skip_strategy = ["None", "OMP Distillation", "OMP Distillation w/o weights"]
lst_skip_task = ["correlation", "coherence"]
# lst_skip_task = []
lst_skip_subset = ["train/dev"]
# lst_skip_subset = []
tasks = [
# "train_score",
# "dev_score",
"test_score",
# "coherence",
# "correlation"
]
dct_score_metric_fancy = {
"accuracy_score": "% Accuracy",
"mean_squared_error": "MSE"
}
dct_score_metric_best_fct = {
"accuracy_score": np.argmax,
"mean_squared_error": np.argmin
}
dct_data_short = {
"Spambase": "Spambase",
"Diamonds": "Diamonds",
"Diabetes": "Diabetes",
"Steel Plates": "Steel P.",
"KR-VS-KP": "KR-VS-KP",
"Breast Cancer": "Breast C.",
"Kin8nm": "Kin8nm",
"LFW Pairs": "LFW P.",
"Gamma": "Gamma",
"California Housing": "California H.",
"Boston": "Boston",
}
dct_data_best = {
"Spambase": np.max,
"Diamonds": np.min,
"Diabetes": np.min,
"Steel Plates": np.max,
"KR-VS-KP": np.max,
"Breast Cancer": np.max,
"Kin8nm": np.min,
"LFW Pairs": np.max,
"Gamma": np.max,
"California Housing": np.min,
"Boston": np.min,
}
dct_data_metric = {
"Spambase": "Acc.",
"Diamonds": "MSE",
"Diabetes": "MSE",
"Steel Plates": "Acc.",
"KR-VS-KP": "Acc.",
"Breast Cancer": "Acc.",
"Kin8nm": "MSE",
"LFW Pairs": "Acc.",
"Gamma": "Acc.",
"California Housing": "MSE",
"Boston": "MSE",
}
def get_max_from_df(df, best_fct):
nb_to_consider = 10
df.sort_values(by="forest_size", inplace=True)
df_groupby_forest_size = df.groupby(['forest_size'])
forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)[:nb_to_consider]
mean_value = df_groupby_forest_size[task].mean().values[:nb_to_consider]
std_value = df_groupby_forest_size[task].std().values[:nb_to_consider]
try:
argmax = best_fct(mean_value)
except ValueError:  # argmin/argmax of an empty sequence: no results for this strategy/dataset/subset
print("no results", strat, data_name, task, subset_name)
return -1, -1, -1
max_mean = mean_value[argmax]
max_std = std_value[argmax]
max_forest_size = forest_sizes[argmax]
return max_forest_size, max_mean, max_std
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
dir_name = "bolsonaro_models_25-03-20"
dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
input_dir_file = dir_path / "results.csv"
df_results = pd.read_csv(open(input_dir_file, 'rb'))
datasets = set(df_results["dataset"].values)
strategies = sorted(list(set(df_results["strategy"].values) - set(lst_skip_strategy)))
subsets = set(df_results["subset"].values)
r"""
\begin{table}[!h]
\centering
\begin{tabular}{l{}}
\toprule
\multicolumn{1}{c}{\textbf{Dataset}} & \textbf{Data dim.} $\datadim$ & \textbf{\# classes} & \textbf{Train size} $\nexamples$ & \textbf{Test size} $\nexamples'$ \\ \midrule
\texttt{MNIST}~\cite{lecun-mnisthandwrittendigit-2010} & 784 & 10 & 60 000 & 10 000 \\ %\hline
\texttt{Kddcup99}~\cite{Dua:2019} & 116 & 23 & 4 893 431 & 5 000 \\
\bottomrule
\end{tabular}
\caption{Main features of the datasets. Discrete, unordered attributes for dataset Kddcup99 have been encoded as one-hot attributes.}
\label{table:data}
\end{table}
"""
for task in tasks:
if task in lst_skip_task:
continue
dct_data_lst_tpl_results = defaultdict(lambda: [])
lst_strats = []
for data_name in datasets:
df_data = df_results[df_results["dataset"] == data_name]
score_metric_name = df_data["score_metric"].values[0]
for subset_name in subsets:
if subset_name in lst_skip_subset:
continue
df_subset = df_data[df_data["subset"] == subset_name]
##################
# all techniques #
##################
for strat in strategies:
if strat in lst_skip_strategy:
continue
df_strat = df_subset[df_subset["strategy"] == strat]
if "OMP" in strat:
###########################
# processing with weights #
###########################
df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
if data_name == "Boston" and subset_name == "train+dev/train+dev":
df_strat_wo_weights = df_strat_wo_weights[df_strat_wo_weights["forest_size"] < 400]
dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
if strat not in lst_strats: lst_strats.append(strat)
if "OMP" in strat and subset_name == "train/dev":
continue
elif "Random" not in strat and subset_name == "train/dev":
continue
##################################
# general processing w/o weights #
##################################
if "Random" in strat:
df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
else:
df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
if "OMP" in strat:
strat = "{} w/o weights".format(strat)
dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
if strat not in lst_strats: lst_strats.append(strat)
title = "{} {} {}".format(task, data_name, subset_name)
# fig.show()
sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
filename = sanitize(title)
# output_dir = out_dir / sanitize(subset_name) / sanitize(task)
# output_dir.mkdir(parents=True, exist_ok=True)
# fig.write_image(str((output_dir / filename).absolute()) + ".png")
# pprint(dct_data_lst_tpl_results)
lst_data_ordered = [
"Diamonds",
"Diabetes",
"Kin8nm",
"California Housing",
"Boston",
"Spambase",
"Steel Plates",
"KR-VS-KP",
"Breast Cancer",
"LFW Pairs",
"Gamma"
]
arr_results_str = np.empty((len(lst_strats)+1, len(datasets) + 1 ), dtype="object")
nb_spaces = 25
dct_strat_str = defaultdict(lambda: [])
s_empty = "{}" + " "*(nb_spaces-2) + " & "
arr_results_str[0][0] = s_empty
# arr_results_str[0][1] = s_empty
for idx_data, data_name in enumerate(lst_data_ordered):
lst_tpl_results = dct_data_lst_tpl_results[data_name]
data_name_short = dct_data_short[data_name]
s_data_tmp = "{}".format(data_name_short)
s_data_tmp += "({})".format(dct_data_metric[data_name])
# s_data_tmp = "\\texttt{{ {} }}".format(data_name_short)
# s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name)
s_data_tmp += " "*(nb_spaces - len(data_name_short))
s_data_tmp += " & "
arr_results_str[0, idx_data + 1] = s_data_tmp
array_results = np.array(lst_tpl_results)
best_result_perf = dct_data_best[data_name](array_results[:, 1])
best_result_perf_indexes = np.argwhere(array_results[:, 1] == best_result_perf)
copye_array_results = copy.deepcopy(array_results)
if dct_data_best[data_name] is np.min:
copye_array_results[best_result_perf_indexes] = np.inf
else:
copye_array_results[best_result_perf_indexes] = -np.inf
best_result_perf_2 = dct_data_best[data_name](copye_array_results[:, 1])
best_result_perf_indexes_2 = np.argwhere(copye_array_results[:, 1] == best_result_perf_2)
best_result_prune = np.min(array_results[:, 0])
best_result_prune_indexes = np.argwhere(array_results[:, 0] == best_result_prune)
for idx_strat, tpl_results in enumerate(array_results):
# str_strat = "\\texttt{{ {} }}".format(lst_strats[idx_strat])
# str_strat = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(lst_strats[idx_strat])
# str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ \\texttt{{ {} }} }} }}".format("}\\\\ \\texttt{".join(lst_strats[idx_strat].split(" ", 1)))
str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ {} }} }} ".format("\\\\".join(lst_strats[idx_strat].split(" ", 1)))
str_strat += " " * (nb_spaces - len(str_strat)) + " & "
arr_results_str[idx_strat+1, 0] = str_strat
# str_header = " {} & #tree &".format(dct_data_metric[data_name])
# arr_results_str[idx_strat + 1, 1] = str_header
best_forest_size = tpl_results[0]
best_mean = tpl_results[1]
best_std = tpl_results[2]
if dct_data_metric[data_name] == "Acc.":
str_perf = "{:.2f}\\%".format(best_mean * 100)
else:
str_perf = "{:.3E}".format(best_mean)
str_prune = "{:d}".format(int(best_forest_size))
if idx_strat in best_result_perf_indexes:
# str_formating = "\\textbf{{ {} }}".format(str_result_loc)
str_formating = "\\textbf[{}]"
# str_formating = "\\textbf{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
elif idx_strat in best_result_perf_indexes_2:
str_formating = "\\underline[{}]"
# str_formating = "\\underline{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
else:
str_formating = "{}"
# str_formating = "{:.3E}(~{:.3E})".format(best_mean, best_std)
if idx_strat in best_result_prune_indexes:
str_formating = str_formating.format("\\textit[{}]")
# str_prune = " & \\textit{{ {:d} }}".format(int(best_forest_size))
# else:
# str_prune = " & {:d}".format(int(best_forest_size))
str_result = str_formating.format(str_perf) + " & " + str_formating.format(str_prune)
str_result += " "*(nb_spaces - len(str_result))
str_result = str_result.replace("[", "{").replace("]", "}")
arr_results_str[idx_strat+1, idx_data+1] = str_result + " & "
dct_strat_str[lst_strats[idx_strat]].append(str_result)
arr_results_str = arr_results_str.T
for idx_lin, lin in enumerate(arr_results_str):
if idx_lin == 1:
print("\\midrule")
if idx_lin == 6:
print("\\midrule")
if lst_data_ordered[idx_lin-1] == "Diamonds":
print("%", end="")
line_print = " ".join(list(lin))
line_print = line_print.rstrip(" &") + "\\\\"
print(line_print)
# s_data = s_data.rstrip(" &") + "\\\\"
# print(s_data)
# for strat, lst_str_results in dct_strat_str.items():
# str_strat = "\\texttt{{ {} }}".format(strat)
# str_strat += " "*(nb_spaces - len(str_strat))
# str_strat += " & " + " & ".join(lst_str_results)
# str_strat += "\\\\"
# print(str_strat)
# exit()
from pathlib import Path
import os
import pandas as pd
from pprint import pprint
import pickle
from collections import defaultdict
import numpy as np
from dotenv import load_dotenv, find_dotenv
dct_experiment_id_subset = dict((str(idx), "train+dev/train+dev") for idx in range(1, 9))
dct_experiment_id_subset.update(dict((str(idx), "train/dev") for idx in range(9, 17)))
NONE = 'None'
Random = 'Random'
OMP = 'OMP'
OMP_Distillation = 'OMP Distillation'
Kmeans = 'Kmeans'
Zhang_Similarities = 'Zhang Similarities'
Zhang_Predictions = 'Zhang Predictions'
Ensemble = 'Ensemble'
dct_experiment_id_technique = {"1": NONE,
"2": Random,
"3": OMP,
"4": OMP_Distillation,
"5": Kmeans,
"6": Zhang_Similarities,
"7": Zhang_Predictions,
"8": Ensemble,
"9": NONE,
"10": Random,
"11": OMP,
"12": OMP_Distillation,
"13": Kmeans,
"14": Zhang_Similarities,
"15": Zhang_Predictions,
"16": Ensemble
}
dct_dataset_fancy = {
"boston": "Boston",
"breast_cancer": "Breast Cancer",
"california_housing": "California Housing",
"diabetes": "Diabetes",
"diamonds": "Diamonds",
"digits": "Digits",
"iris": "Iris",
"kin8nm": "Kin8nm",
"kr-vs-kp": "KR-VS-KP",
"olivetti_faces": "Olivetti Faces",
"spambase": "Spambase",
"steel-plates": "Steel Plates",
"wine": "Wine",
"gamma": "Gamma",
"lfw_pairs": "LFW Pairs"
}
skip_attributes = ["datetime"]
set_no_coherence = set()
set_no_corr = set()
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
dir_name = "results/bolsonaro_models_25-03-20"
dir_path = Path(os.environ["project_dir"]) / dir_name
output_dir_file = dir_path / "results.csv"
dct_results = defaultdict(lambda: [])
for root, dirs, files in os.walk(dir_path, topdown=False):
for file_str in files:
if file_str == "results.csv":
continue
path_dir = Path(root)
path_file = path_dir / file_str
print(path_file)
try:
with open(path_file, 'rb') as pickle_file:
obj_results = pickle.load(pickle_file)
except Exception:
print("problem loading pickle file {}".format(path_file))
continue  # skip this file instead of silently reusing the previous obj_results
path_dir_split = str(path_dir).split("/")
bool_wo_weights = "no_weights" in str(path_file)
if bool_wo_weights:
forest_size = int(path_dir_split[-1].split("_")[0])
else:
forest_size = int(path_dir_split[-1])
seed = int(path_dir_split[-3])
id_xp = str(path_dir_split[-5])
dataset = str(path_dir_split[-6])
dct_results["forest_size"].append(forest_size)
dct_results["seed"].append(seed)
dct_results["dataset"].append(dct_dataset_fancy[dataset])
dct_results["subset"].append(dct_experiment_id_subset[id_xp])
dct_results["strategy"].append(dct_experiment_id_technique[id_xp])
dct_results["wo_weights"].append(bool_wo_weights)
for key_result, val_result in obj_results.items():
if key_result in skip_attributes:
continue
if key_result == "model_weights":
if val_result == "":
dct_results["negative-percentage"].append(None)
else:
lt_zero = val_result < 0
gt_zero = val_result > 0
nb_lt_zero = np.sum(lt_zero)
nb_gt_zero = np.sum(gt_zero)
percentage_lt_zero = nb_lt_zero / (nb_gt_zero + nb_lt_zero)
dct_results["negative-percentage"].append(percentage_lt_zero)
if val_result == "":
# print(key_result, val_result)
val_result = None
if key_result == "coherence" and val_result is None:
set_no_coherence.add(id_xp)
if key_result == "correlation" and val_result is None:
set_no_corr.add(id_xp)
dct_results[key_result].append(val_result)
# <class 'dict'>: {'model_weights': '',
# 'training_time': 0.0032033920288085938,
# 'datetime': datetime.datetime(2020, 3, 25, 0, 28, 34, 938400),
# 'train_score': 1.0,
# 'dev_score': 0.978021978021978,
# 'test_score': 0.9736842105263158,
# 'train_score_base': 1.0,
# 'dev_score_base': 0.978021978021978,
# 'test_score_base': 0.9736842105263158,
# 'score_metric': 'accuracy_score',
# 'base_score_metric': 'accuracy_score',
# 'coherence': 0.9892031711775613,
# 'correlation': 0.9510700193340448}
# print(path_file)
print("coh", set_no_coherence, len(set_no_coherence))
print("cor", set_no_corr, len(set_no_corr))
final_df = pd.DataFrame.from_dict(dct_results)
final_df.to_csv(output_dir_file)
print(final_df)
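As a quick sanity check of the CSV produced above, a small pandas sketch (the path mirrors dir_name from the script; the fixed columns come from the keys filled into dct_results, the score columns such as test_score only if they were present in the pickles, as in the commented example above):

import pandas as pd

df = pd.read_csv("results/bolsonaro_models_25-03-20/results.csv")
# Always-present columns: forest_size, seed, dataset, subset, strategy, wo_weights.
print(df.columns.tolist())
print(df.groupby(["dataset", "strategy"])["test_score"].mean())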
# old requirements (unpinned):
# local package
-e .
# external requirements
click
Sphinx
coverage
awscli
flake8
pytest
scikit-learn
git+git://github.com/darenr/scikit-optimize@master
python-dotenv
matplotlib
pandas
# new requirements (pinned):
alabaster==0.7.12
attrs==19.3.0
awscli==1.16.272
Babel==2.7.0
backcall==0.1.0
-e git+git@gitlab.lis-lab.fr:luc.giffon/bolsonaro.git@bbad0e522d6b4b392f1926fa935f2a7fac093411#egg=bolsonaro
botocore==1.13.8
certifi==2019.11.28
chardet==3.0.4
Click==7.0
colorama==0.4.1
coverage==4.5.4
cycler==0.10.0
decorator==4.4.2
docutils==0.15.2
entrypoints==0.3
flake8==3.7.9
idna==2.8
imagesize==1.1.0
importlib-metadata==1.5.0
ipython==7.13.0
ipython-genutils==0.2.0
jedi==0.16.0
Jinja2==2.10.3
jmespath==0.9.4
joblib==0.14.0
kiwisolver==1.1.0
MarkupSafe==1.1.1
matplotlib==3.1.1
mccabe==0.6.1
mkl-fft==1.0.14
mkl-random==1.1.0
mkl-service==2.3.0
more-itertools==8.2.0
numpy==1.17.3
packaging==20.3
pandas==0.25.3
parso==0.6.2
pexpect==4.8.0
pickleshare==0.7.5
plotly==4.5.2
pluggy==0.13.1
prompt-toolkit==3.0.3
psutil==5.7.0
ptyprocess==0.6.0
py==1.8.1
pyaml==20.3.1
pyasn1==0.4.7
pycodestyle==2.5.0
pyflakes==2.1.1
Pygments==2.6.1
pyparsing==2.4.5
pytest==5.4.1
python-dateutil==2.8.1
python-dotenv==0.10.3
pytz==2019.3
PyYAML==5.1.2
requests==2.22.0
retrying==1.3.3
rsa==3.4.2
s3transfer==0.2.1
scikit-learn==0.21.3
scikit-optimize==0.7.4
scipy==1.3.1
six==1.12.0
snowballstemmer==2.0.0
Sphinx==2.2.1
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
tornado==6.0.3
tqdm==4.43.0
traitlets==4.3.3
urllib3==1.25.6
wcwidth==0.1.8
zipp==2.2.0