Commit 5e3368e9 authored by Luc Giffon

Merge branch '15-integration-sota' of gitlab.lis-lab.fr:luc.giffon/bolsonaro into 15-integration-sota
parents f0f48756 0ea777e7
1 merge request: !23 Resolve "integration-sota"
@@ -10,7 +10,10 @@ class ModelRawResults(object):
datetime, train_score, dev_score, test_score,
train_score_base, dev_score_base,
test_score_base, score_metric, base_score_metric,
- coherence='', correlation=''):
+ #coherence='', correlation=''):
train_coherence='', dev_coherence='', test_coherence='',
train_correlation='', dev_correlation='', test_correlation='',
train_strength='', dev_strength='', test_strength=''):
self._model_weights = model_weights
self._training_time = training_time
@@ -23,8 +26,17 @@ class ModelRawResults(object):
self._test_score_base = test_score_base
self._score_metric = score_metric
self._base_score_metric = base_score_metric
- self._coherence = coherence
- self._correlation = correlation
+ """self._coherence = coherence
+ self._correlation = correlation"""
self._train_coherence = train_coherence
self._dev_coherence = dev_coherence
self._test_coherence = test_coherence
self._train_correlation = train_correlation
self._dev_correlation = dev_correlation
self._test_correlation = test_correlation
self._train_strength = train_strength
self._dev_strength = dev_strength
self._test_strength = test_strength
@property
def model_weights(self):
@@ -70,13 +82,49 @@ class ModelRawResults(object):
def base_score_metric(self):
return self._base_score_metric
- @property
+ """@property
def coherence(self):
return self._coherence
@property
def correlation(self):
- return self._correlation
+ return self._correlation"""
@property
def train_coherence(self):
return self._train_coherence
@property
def dev_coherence(self):
return self._dev_coherence
@property
def test_coherence(self):
return self._test_coherence
@property
def train_correlation(self):
return self._train_correlation
@property
def dev_correlation(self):
return self._dev_correlation
@property
def test_correlation(self):
return self._test_correlation
@property
def train_strength(self):
return self._train_strength
@property
def dev_strength(self):
return self._dev_strength
@property
def test_strength(self):
return self._test_strength
def save(self, models_dir):
if not os.path.exists(models_dir):
...
@@ -39,7 +39,6 @@ class Trainer(object):
else classification_score_metric.__name__
self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
else base_classification_score_metric.__name__
- self._selected_trees = ''
@property
def score_metric_name(self):
@@ -98,7 +97,6 @@ class Trainer(object):
X=self._X_forest,
y=self._y_forest
)
- self._selected_trees = model.estimators_
else:
if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier] and \
use_distillation:
@@ -154,14 +152,17 @@ class Trainer(object):
result = self._base_regression_score_metric(y_true, y_pred)
return result
- def _evaluate_predictions(self, model, X, aggregation_function):
- predictions = np.array([tree.predict(X) for tree in self._selected_trees])
+ def _evaluate_predictions(self, X, aggregation_function, selected_trees):
+ predictions = np.array([tree.predict(X) for tree in selected_trees])
predictions = normalize(predictions)
return aggregation_function(np.abs((predictions @ predictions.T - np.eye(len(predictions)))))
+ def _compute_forest_strength(self, X, y, metric_function, selected_trees):
+ return np.mean([metric_function(y, tree.predict(X)) for tree in selected_trees])
- def compute_results(self, model, models_dir):
+ def compute_results(self, model, models_dir, subsets_used='train+dev,train+dev'):
"""
:param model: Object with
:param models_dir: Where the results will be saved
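As an aside, a minimal standalone sketch (not the project's exact API; trees, X, y and metric are placeholder names) of what the reworked helpers above compute: tree predictions are stacked and L2-normalized row-wise, the off-diagonal entries of their Gram matrix are aggregated with np.max (coherence) or np.mean (correlation), and strength is the mean per-tree score.

    # Hedged sketch of the coherence/correlation/strength statistics used above.
    import numpy as np
    from sklearn.preprocessing import normalize

    def gram_statistic(trees, X, aggregation_function):
        # One prediction vector per tree, each row scaled to unit L2 norm.
        predictions = normalize(np.array([tree.predict(X) for tree in trees]))
        gram = predictions @ predictions.T - np.eye(len(predictions))
        return aggregation_function(np.abs(gram))

    def forest_strength(trees, X, y, metric):
        # Average individual tree performance under the chosen metric.
        return np.mean([metric(y, tree.predict(X)) for tree in trees])

    # coherence   = gram_statistic(trees, X, np.max)
    # correlation = gram_statistic(trees, X, np.mean)
    # strength    = forest_strength(trees, X, y, metric)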
@@ -177,30 +178,70 @@ class Trainer(object):
if type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor,
SimilarityForestClassifier, KMeansForestClassifier, EnsembleSelectionForestClassifier]:
- self._selected_trees = model.selected_trees
+ selected_trees = model.selected_trees
elif type(model) in [OmpForestRegressor, OmpForestMulticlassClassifier, OmpForestBinaryClassifier]:
- self._selected_trees = np.asarray(model.forest)[model._omp.coef_ != 0]
+ selected_trees = np.asarray(model.forest)[model._omp.coef_ != 0]
elif type(model) in [RandomForestRegressor, RandomForestClassifier]:
- self._selected_trees = model.estimators_
+ selected_trees = model.estimators_
- if len(self._selected_trees) > 0:
+ if len(selected_trees) > 0:
target_selected_tree = int(os.path.split(models_dir)[-1])
if target_selected_tree != len(selected_trees):
raise ValueError(f'Invalid selected tree number target_selected_tree:{target_selected_tree} - len(selected_trees):{len(selected_trees)}')
with open(os.path.join(models_dir, 'selected_trees.pickle'), 'wb') as output_file:
- pickle.dump(self._selected_trees, output_file)
+ pickle.dump(selected_trees, output_file)
strength_metric = self._regression_score_metric if self._dataset.task == Task.REGRESSION else self._classification_score_metric
# Reeeally dirty to put that here but otherwise it's not thread safe...
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
if subsets_used == 'train,dev':
X_forest = self._dataset.X_train
y_forest = self._dataset.y_train
else:
X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
X_omp = self._dataset.X_dev
y_omp = self._dataset.y_dev
elif model.models_parameters.subsets_used == 'train,dev':
X_forest = self._dataset.X_train
y_forest = self._dataset.y_train
X_omp = self._dataset.X_dev
y_omp = self._dataset.y_dev
elif model.models_parameters.subsets_used == 'train+dev,train+dev':
X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
X_omp = X_forest
y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
y_omp = y_forest
elif model.models_parameters.subsets_used == 'train,train+dev':
X_forest = self._dataset.X_train
y_forest = self._dataset.y_train
X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
else:
raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
results = ModelRawResults(
model_weights=model_weights,
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
- train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
+ train_score=self.__score_func(model, X_forest, y_forest),
- dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev),
+ dev_score=self.__score_func(model, X_omp, y_omp),
test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test),
- train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
+ train_score_base=self.__score_func_base(model, X_forest, y_forest),
- dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
+ dev_score_base=self.__score_func_base(model, X_omp, y_omp),
test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
score_metric=self._score_metric_name,
base_score_metric=self._base_score_metric_name,
- coherence=self._evaluate_predictions(model, self._dataset.X_train, aggregation_function=np.max),
- correlation=self._evaluate_predictions(model, self._dataset.X_train, aggregation_function=np.mean)
+ train_coherence=self._evaluate_predictions(X_forest, aggregation_function=np.max, selected_trees=selected_trees),
+ dev_coherence=self._evaluate_predictions(X_omp, aggregation_function=np.max, selected_trees=selected_trees),
test_coherence=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.max, selected_trees=selected_trees),
train_correlation=self._evaluate_predictions(X_forest, aggregation_function=np.mean, selected_trees=selected_trees),
dev_correlation=self._evaluate_predictions(X_omp, aggregation_function=np.mean, selected_trees=selected_trees),
test_correlation=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.mean, selected_trees=selected_trees),
train_strength=self._compute_forest_strength(X_forest, y_forest, strength_metric, selected_trees),
dev_strength=self._compute_forest_strength(X_omp, y_omp, strength_metric, selected_trees),
test_strength=self._compute_forest_strength(self._dataset.X_test, self._dataset.y_test, strength_metric, selected_trees)
)
results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_base))
@@ -212,16 +253,20 @@ class Trainer(object):
self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
self._logger.info(f'test_coherence: {results.test_coherence}')
self._logger.info(f'test_correlation: {results.test_correlation}')
self._logger.info(f'test_strength: {results.test_strength}')
if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
results = ModelRawResults(
model_weights='',
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
- train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False),
+ train_score=self.__score_func(model, X_forest, y_forest, False),
- dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False),
+ dev_score=self.__score_func(model, X_omp, y_omp, False),
test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
- train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
+ train_score_base=self.__score_func_base(model, X_forest, y_forest),
- dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
+ dev_score_base=self.__score_func_base(model, X_omp, y_omp),
test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
score_metric=self._score_metric_name,
base_score_metric=self._base_score_metric_name
...
@@ -2,12 +2,49 @@ from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.visualization.plotter import Plotter
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
import argparse
import pathlib
from dotenv import find_dotenv, load_dotenv
import os
import numpy as np
import pickle
from tqdm import tqdm
from scipy.stats import rankdata
from pyrsa.vis.colors import rdm_colormap
from pyrsa.rdm.calc import calc_rdm
from pyrsa.data.dataset import Dataset
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.preprocessing import normalize
def vect2triu(dsm_vect, dim=None):
if not dim:
# sqrt(X²) \simeq sqrt(X²-X) -> sqrt(X²) = ceil(sqrt(X²-X))
dim = int(np.ceil(np.sqrt(dsm_vect.shape[1] * 2)))
dsm = np.zeros((dim,dim))
ind_up = np.triu_indices(dim, 1)
dsm[ind_up] = dsm_vect
return dsm
def triu2full(dsm_triu):
dsm_full = np.copy(dsm_triu)
ind_low = np.tril_indices(dsm_full.shape[0], -1)
dsm_full[ind_low] = dsm_full.T[ind_low]
return dsm_full
def plot_RDM(rdm, file_path, condition_number):
rdm = triu2full(vect2triu(rdm, condition_number))
fig = plt.figure()
cols = rdm_colormap(condition_number)
plt.imshow(rdm, cmap=cols)
plt.colorbar()
plt.savefig(file_path, dpi=200)
plt.close()
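For orientation, a small hedged usage sketch of the two helpers above (the values are made up): a vectorized RDM of length n*(n-1)/2 is written into the upper triangle and then mirrored into a full symmetric matrix.

    # Hypothetical round trip for n = 4 conditions (6 pairwise dissimilarities).
    import numpy as np

    rdm_vect = np.array([1., 2., 3., 4., 5., 6.])
    upper = vect2triu(rdm_vect, dim=4)   # zeros on and below the diagonal
    full = triu2full(upper)              # symmetric RDM with a zero diagonal
    print(full)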
def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
@@ -174,7 +211,6 @@ def extract_correlations_across_seeds(models_dir, results_dir, experiment_id):
extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
# Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
model_raw_results = ModelRawResults.load(extracted_forest_size_path)
- # Save the weights
experiment_correlations[seed].append(model_raw_results.correlation)
return experiment_correlations
@@ -203,11 +239,129 @@ def extract_coherences_across_seeds(models_dir, results_dir, experiment_id):
extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
# Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
model_raw_results = ModelRawResults.load(extracted_forest_size_path)
- # Save the weights
experiment_coherences[seed].append(model_raw_results.coherence)
return experiment_coherences
def extract_selected_trees_scores_across_seeds(models_dir, results_dir, experiment_id, weighted=False):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
experiment_selected_trees_scores = dict()
print(f'[extract_selected_trees_scores_across_seeds] experiment_id: {experiment_id}')
# For each seed results stored in models/{experiment_id}/seeds
seeds = os.listdir(experiment_seed_root_path)
seeds.sort(key=int)
with tqdm(seeds) as seed_bar:
for seed in seed_bar:
seed_bar.set_description(f'seed: {seed}')
experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' # models/{experiment_id}/seeds/{seed}/forest_size
dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)
dataset = DatasetLoader.load(dataset_parameters)
# {{seed}:[]}
experiment_selected_trees_scores[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree]
extracted_forest_sizes.sort(key=int)
with tqdm(extracted_forest_sizes) as extracted_forest_size_bar:
for extracted_forest_size in extracted_forest_size_bar:
# models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
selected_trees = None
with open(os.path.join(extracted_forest_size_path, 'selected_trees.pickle'), 'rb') as file:
selected_trees = pickle.load(file)
selected_trees_test_scores = np.array([tree.score(dataset.X_test, dataset.y_test) for tree in selected_trees])
if weighted:
model_raw_results = ModelRawResults.load(extracted_forest_size_path)
weights = model_raw_results.model_weights
if type(weights) != str:
weights = weights[weights != 0]
score = np.mean(np.square(selected_trees_test_scores * weights))
else:
score = np.mean(np.square(selected_trees_test_scores))
else:
score = np.mean(selected_trees_test_scores)
experiment_selected_trees_scores[seed].append(score)
extracted_forest_size_bar.set_description(f'extracted_forest_size: {extracted_forest_size} - test_score: {round(score, 2)}')
extracted_forest_size_bar.update(1)
seed_bar.update(1)
return experiment_selected_trees_scores
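A hedged usage note (directory names are hypothetical): given the models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size} layout described in the comments above, the extractor can be called once per experiment, for example:

    # Hypothetical call: mean per-tree test score for experiment 3, weighting each
    # tree's score by its non-zero OMP weight before averaging.
    scores_by_seed = extract_selected_trees_scores_across_seeds('models', 'results', 3, weighted=True)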
def extract_selected_trees_across_seeds(models_dir, results_dir, experiment_id):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
experiment_selected_trees = dict()
# For each seed results stored in models/{experiment_id}/seeds
seeds = os.listdir(experiment_seed_root_path)
seeds.sort(key=int)
with tqdm(seeds) as seed_bar:
for seed in seed_bar:
seed_bar.set_description(f'seed: {seed}')
experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' # models/{experiment_id}/seeds/{seed}/forest_size
dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)
dataset = DatasetLoader.load(dataset_parameters)
# {{seed}:[]}
experiment_selected_trees[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ]
extracted_forest_sizes.sort(key=int)
all_selected_trees_predictions = list()
with tqdm(extracted_forest_sizes) as extracted_forest_size_bar:
for extracted_forest_size in extracted_forest_size_bar:
# models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
selected_trees = None
with open(os.path.join(extracted_forest_size_path, 'selected_trees.pickle'), 'rb') as file:
selected_trees = pickle.load(file)
#test_score = np.mean([tree.score(dataset.X_test, dataset.y_test) for tree in selected_trees])
#selected_trees_predictions = np.array([tree.score(dataset.X_test, dataset.y_test) for tree in selected_trees])
selected_trees_predictions = [tree.predict(dataset.X_test) for tree in selected_trees]
extracted_forest_size_bar.set_description(f'extracted_forest_size: {extracted_forest_size}')
#experiment_selected_trees[seed].append(test_score)
extracted_forest_size_bar.update(1)
selected_trees_predictions = np.array(selected_trees_predictions)
selected_trees_predictions = normalize(selected_trees_predictions)
"""mds = MDS(len(selected_trees_predictions))
Y = mds.fit_transform(selected_trees_predictions)
plt.scatter(Y[:, 0], Y[:, 1])
plt.savefig(f'test_mds_{experiment_id}.png')"""
if int(extracted_forest_size) <= 267:
forest_RDM = calc_rdm(Dataset(selected_trees_predictions), method='euclidean').get_vectors()
ranked_forest_RDM = np.apply_along_axis(rankdata, 1, forest_RDM.reshape(1, -1))
from scipy.cluster import hierarchy
RDM = triu2full(vect2triu(ranked_forest_RDM, int(extracted_forest_size)))
Z = hierarchy.linkage(RDM, 'average')
fig = plt.figure(figsize=(15, 8))
dn = hierarchy.dendrogram(Z)
plt.savefig(f'test_dendrogram_scores_id:{experiment_id}_seed:{seed}_size:{extracted_forest_size}.png')
plt.close()
plot_RDM(
rdm=ranked_forest_RDM,
file_path=f'test_scores_ranked_forest_RDM_id:{experiment_id}_seed:{seed}_size:{extracted_forest_size}.png',
condition_number=len(selected_trees_predictions)
)
break
seed_bar.update(1)
return experiment_selected_trees
if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv('.env'))
@@ -217,6 +371,8 @@ if __name__ == "__main__":
DEFAULT_PLOT_WEIGHT_DENSITY = False
DEFAULT_WO_LOSS_PLOTS = False
DEFAULT_PLOT_PREDS_COHERENCE = False
DEFAULT_PLOT_FOREST_STRENGTH = False
DEFAULT_COMPUTE_SELECTED_TREES_RDMS = False
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
@@ -232,6 +388,8 @@ if __name__ == "__main__":
parser.add_argument('--wo_loss_plots', action='store_true', default=DEFAULT_WO_LOSS_PLOTS, help='Do not compute the loss plots.')
parser.add_argument('--plot_preds_coherence', action='store_true', default=DEFAULT_PLOT_PREDS_COHERENCE, help='Plot the coherence of the prediction trees.')
parser.add_argument('--plot_preds_correlation', action='store_true', default=DEFAULT_PLOT_PREDS_COHERENCE, help='Plot the correlation of the prediction trees.')
parser.add_argument('--plot_forest_strength', action='store_true', default=DEFAULT_PLOT_FOREST_STRENGTH, help='Plot the strength of the extracted forest.')
parser.add_argument('--compute_selected_trees_rdms', action='store_true', default=DEFAULT_COMPUTE_SELECTED_TREES_RDMS, help='Representation similarity analysis of the selected trees')
args = parser.parse_args()
if args.stage not in list(range(1, 6)):
@@ -534,16 +692,17 @@ if __name__ == "__main__":
import sys
sys.exit(0)"""
- #all_labels = ['base', 'random', 'omp', 'omp_wo_weights']
- all_labels = ['base', 'random', 'omp']
+ all_labels = ['base', 'random', 'omp', 'omp_wo_weights']
+ #all_labels = ['base', 'random', 'omp']
omp_with_params_test_scores_new = dict()
filter_num = -1
"""filter_num = 9
for key, value in omp_with_params_test_scores.items():
omp_with_params_test_scores_new[key] = value[:filter_num]"""
- #all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
- # omp_with_params_without_weights_test_scores]
- all_scores = [base_with_params_dev_scores, random_with_params_dev_scores, omp_with_params_dev_scores]
+ all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+ omp_with_params_without_weights_test_scores]
+ #all_scores = [base_with_params_dev_scores, random_with_params_dev_scores, omp_with_params_dev_scores,
+ # omp_with_params_without_weights_dev_scores]
#all_scores = [base_with_params_train_scores, random_with_params_train_scores, omp_with_params_train_scores,
# omp_with_params_without_weights_train_scores]
@@ -567,15 +726,15 @@ if __name__ == "__main__":
current_train_scores, current_dev_scores, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, current_experiment_id)
all_labels.append(label)
- #all_scores.append(current_test_scores)
+ all_scores.append(current_test_scores)
#all_scores.append(current_train_scores)
- all_scores.append(current_dev_scores)
+ #all_scores.append(current_dev_scores)
- output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_new')
+ output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_test_train,dev')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
- file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_dev_clean.png",
+ file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_test_train,dev.png",
all_experiment_scores=all_scores,
all_labels=all_labels,
x_value=with_params_extracted_forest_sizes,
@@ -630,7 +789,7 @@ if __name__ == "__main__":
all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble']
_, _, _, with_params_extracted_forest_sizes, _ = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
- coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]]
+ coherence_values = [extract_coherences_across_seeds(args.models_dir, args.results_dir, i) for i in args.experiment_ids]
Plotter.plot_stage2_losses(
file_path=root_output_path + os.sep + f"coherences_{'-'.join(all_labels)}.png",
all_experiment_scores=coherence_values,
@@ -640,13 +799,14 @@ if __name__ == "__main__":
ylabel='Coherence',
title='Coherence values of {}'.format(args.dataset_name))
logger.info(f'Computing preds coherence plot...')
if args.plot_preds_correlation:
root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_new')
pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
- all_labels = ['random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble']
+ all_labels = ['none', 'random', 'omp', 'kmeans', 'similarity_similarities', 'similarity_predictions', 'ensemble']
_, _, _, with_params_extracted_forest_sizes, _ = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
- correlation_values = [extract_correlations_across_seeds(args.models_dir, args.results_dir, i) for i in [2, 3, 5, 6, 7, 8]]
+ correlation_values = [extract_correlations_across_seeds(args.models_dir, args.results_dir, i) for i in args.experiment_ids]
Plotter.plot_stage2_losses(
file_path=root_output_path + os.sep + f"correlations_{'-'.join(all_labels)}.png",
all_experiment_scores=correlation_values,
@@ -657,4 +817,78 @@ if __name__ == "__main__":
title='correlation values of {}'.format(args.dataset_name))
logger.info(f'Computing preds correlation plot...')
if args.plot_forest_strength:
root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_strength')
pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
_, _, _, with_params_extracted_forest_sizes, _ = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
#all_selected_trees_scores = list()
#all_selected_trees_weighted_scores = list()
"""with tqdm(args.experiment_ids) as experiment_id_bar:
for experiment_id in experiment_id_bar:
experiment_id_bar.set_description(f'experiment_id: {experiment_id}')
selected_trees_scores, selected_trees_weighted_scores = extract_selected_trees_scores_across_seeds(
args.models_dir, args.results_dir, experiment_id)
all_selected_trees_scores.append(selected_trees_scores)
all_selected_trees_weighted_scores.append(selected_trees_weighted_scores)
experiment_id_bar.update(1)"""
#random_selected_trees_scores = extract_selected_trees_scores_across_seeds(
# args.models_dir, args.results_dir, 2, weighted=True)
omp_selected_trees_scores = extract_selected_trees_scores_across_seeds(
args.models_dir, args.results_dir, 3, weighted=True)
similarity_similarities_selected_trees_scores = extract_selected_trees_scores_across_seeds(
args.models_dir, args.results_dir, 6, weighted=True)
#similarity_predictions_selected_trees_scores = extract_selected_trees_scores_across_seeds(
# args.models_dir, args.results_dir, 7)
ensemble_selected_trees_scores = extract_selected_trees_scores_across_seeds(
args.models_dir, args.results_dir, 8, weighted=True)
# kmeans=5
# similarity_similarities=6
# similarity_predictions=7
# ensemble=8
all_selected_trees_scores = [random_selected_trees_scores, omp_selected_trees_scores, similarity_similarities_selected_trees_scores,
ensemble_selected_trees_scores]
with open('california_housing_forest_strength_scores.pickle', 'wb') as file:
pickle.dump(all_selected_trees_scores, file)
"""with open('forest_strength_scores.pickle', 'rb') as file:
all_selected_trees_scores = pickle.load(file)"""
all_labels = ['random', 'omp', 'similarity_similarities', 'ensemble']
Plotter.plot_stage2_losses(
file_path=root_output_path + os.sep + f"forest_strength_{'-'.join(all_labels)}_v2_sota.png",
all_experiment_scores=all_selected_trees_scores,
all_labels=all_labels,
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel='Mean of selected tree scores on test set',
title='Forest strength of {}'.format(args.dataset_name))
if args.compute_selected_trees_rdms:
root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage5_strength')
pathlib.Path(root_output_path).mkdir(parents=True, exist_ok=True)
_, _, _, with_params_extracted_forest_sizes, _ = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, 2)
all_selected_trees_scores = list()
with tqdm([2, 3, 8]) as experiment_id_bar:
for experiment_id in experiment_id_bar:
experiment_id_bar.set_description(f'experiment_id: {experiment_id}')
all_selected_trees_scores.append(extract_selected_trees_across_seeds(
args.models_dir, args.results_dir, experiment_id))
experiment_id_bar.update(1)
with open('forest_strength_scores.pickle', 'rb') as file:
all_selected_trees_scores = pickle.load(file)
logger.info('Done.')
@@ -8,8 +8,9 @@ from tqdm import tqdm
if __name__ == "__main__":
models_source_path = 'models'
models_destination_path = 'bolsonaro_models_25-03-20'
- datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
- 'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
+ #datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
+ # 'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
datasets = ['kin8nm']
pathlib.Path(models_destination_path).mkdir(parents=True, exist_ok=True)
...
@@ -66,11 +66,11 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
extraction_strategy=parameters['extraction_strategy']
)
pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters)
- pretraned_trainer = Trainer(dataset)
- pretraned_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
+ pretrained_trainer = Trainer(dataset)
+ pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
pretrained_estimator.fit(
- X=pretraned_trainer._X_forest,
- y=pretraned_trainer._y_forest
+ X=pretrained_trainer._X_forest,
+ y=pretrained_trainer._y_forest
)
else:
pretrained_estimator = None
...