Commit 8e1d5f0c authored by Luc Giffon

expe viz correlation

parent b6f4ff54
......@@ -2,7 +2,7 @@ from copy import deepcopy
from scipy.optimize import nnls
import numpy as np
from sklearn.linear_model.base import _preprocess_data
from sklearn.linear_model._base import _preprocess_data
from bolsonaro import LOG_PATH
......
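The import change above follows scikit-learn 0.22, which renamed sklearn.linear_model.base to the private module sklearn.linear_model._base. A version-tolerant import (a sketch only, not part of this commit) could keep the code running on both older and newer releases:

# Sketch: fall back to the pre-0.22 module name if the new one is absent.
try:
    from sklearn.linear_model._base import _preprocess_data  # scikit-learn >= 0.22
except ImportError:
    from sklearn.linear_model.base import _preprocess_data   # older releases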
......@@ -686,7 +686,7 @@ if __name__ == "__main__":
30 + 1,
endpoint=True)[1:]).astype(np.int)).tolist()"""
#extracted_forest_sizes = [4, 7, 11, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
#extracted_forest_sizes = [4, 7, 9, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
#extracted_forest_sizes = [str(forest_size) for forest_size in extracted_forest_sizes]
extracted_forest_sizes = list()
......
......@@ -212,7 +212,7 @@ Command lines example for stage 3:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing
python code/compute_results.py --stage 3 --experiment_ids 9 12 13 --dataset_name=california_housing
Command lines example for stage 4:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
......
......@@ -15,15 +15,15 @@ lst_task_train_dev = ["coherence", "correlation"]
tasks = [
# "train_score",
"dev_score",
"test_score",
# "dev_score",
# "test_score",
# "coherence",
# "correlation",
"train_correlation",
# "negative-percentage",
# "dev_strength",
# "test_strength",
# "dev_correlation",
# "test_correlation",
"test_correlation",
# "dev_coherence",
# "test_coherence",
# "negative-percentage-test-score"
......@@ -266,13 +266,14 @@ def base_figures(skip_NN=False):
add_trace_from_df(df_strat_wo_weights, fig, task, strat)
title = "{} {}".format(task, data_name)
yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
# yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
# xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
xaxis_title = "% Selected Trees"
yaxis_title = "Mean absolute correlation of normalized trees"
if not skip_nn:
fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
fig.update_layout(barmode='group',
# title=title,
title=title,
xaxis_title=xaxis_title,
yaxis_title=yaxis_title,
font=dict(
......@@ -281,13 +282,13 @@ def base_figures(skip_NN=False):
color="black"
),
showlegend = False,
margin = dict(
l=1,
r=1,
b=1,
t=1,
# pad=4
),
# margin = dict(
# l=1,
# r=1,
# b=1,
# t=1,
# # pad=4
# ),
legend=dict(
traceorder="normal",
font=dict(
......@@ -300,7 +301,7 @@ def base_figures(skip_NN=False):
borderwidth=1,
)
)
# fig.show()
fig.show()
if skip_NN:
str_no_nn = " no nn"
title += str_no_nn
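The new y-axis label above, "Mean absolute correlation of normalized trees", refers to how correlated the per-tree prediction vectors are. The exact formula lives elsewhere in the project; one plausible reading (a sketch only, with hypothetical names) is the mean absolute off-diagonal Pearson correlation between tree prediction vectors:

import numpy as np

def mean_abs_correlation(tree_preds):
    """tree_preds: array of shape (n_trees, n_samples), one prediction vector per tree."""
    corr = np.corrcoef(tree_preds)                       # (n_trees, n_trees) Pearson correlations
    off_diag = corr[~np.eye(corr.shape[0], dtype=bool)]  # drop the diagonal of ones
    return np.mean(np.abs(off_diag))                     # average strength of pairwise correlation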
......@@ -646,7 +647,7 @@ def effect_of_weights_figure():
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
dir_name = "bolsonaro_models_29-03-20_v3_2"
dir_name = "bolsonaro_models_29-03-20_v3"
dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
......@@ -658,8 +659,9 @@ if __name__ == "__main__":
strategies = set(df_results["strategy"].values)
subsets = set(df_results["subset"].values)
for skip_nn in [True, False]:
# for skip_nn in [True, False]:
for skip_nn in [False]:
base_figures(skip_nn)
effect_of_weights_figure()
weights_wrt_size()
# effect_of_weights_figure()
# weights_wrt_size()
# global_figure()
......@@ -115,7 +115,7 @@ def get_max_from_df(df, best_fct):
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
dir_name = "bolsonaro_models_29-03-20_v3_2"
dir_name = "bolsonaro_models_29-03-20_v3"
dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
......
from collections import defaultdict
import plotly.graph_objects as go
import numpy as np
from pathlib import Path
import os
from dotenv import find_dotenv, load_dotenv
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS, Isomap, LocallyLinearEmbedding
from sklearn.preprocessing import normalize
if __name__ == "__main__":
    load_dotenv(find_dotenv('.env'))
    dir_name = "results/models/predictions"
    dir_path = Path(os.environ["project_dir"]) / dir_name
    dct_dataset_true_labels = dict()
    dct_dataset_algo_preds = defaultdict(dict)
    for dataset_path in dir_path.glob('*'):
        dataset_name = dataset_path.name
        max_forest_size = np.max(list(map(lambda x: int(x.name), dataset_path.glob("*"))))
        for forest_size_path in dataset_path.glob("*"):
            pruned_forest_size = int(forest_size_path.name)
            if pruned_forest_size != int(10 / 100 * max_forest_size) and pruned_forest_size != max_forest_size:
                continue
            for algoname_path in forest_size_path.glob("*"):
                algoname = algoname_path.name
                if algoname == "true_labels.npz":
                    if dct_dataset_true_labels.get(dataset_name, None) is None:
                        # store the true labels for the task
                        true_labels_path = algoname_path
                        loaded_true_labels = np.load(true_labels_path)["Y_true"]
                        dct_dataset_true_labels[dataset_name] = loaded_true_labels
                    else:
                        continue
                else:
                    path_predictions = algoname_path / "predictions_train.npz"
                    loaded_predictions = np.load(path_predictions)["Y_preds"]
                    dct_dataset_algo_preds[dataset_name][algoname] = loaded_predictions
    print(dct_dataset_true_labels)
    print(dct_dataset_algo_preds)
    for dataset_name in dct_dataset_algo_preds:
        predictions_algo = dct_dataset_algo_preds[dataset_name]["NN-OMP"].T
        try:
            predictions_total = dct_dataset_algo_preds[dataset_name]["None"].T
        except KeyError:
            # no unpruned ("None") forest stored for this dataset: skip it
            continue
        real_preds = dct_dataset_true_labels[dataset_name].reshape(1, -1)
        predictions_total = np.vstack([predictions_total, real_preds])
        normalized_predictions_algo = normalize(predictions_algo)
        normalized_predictions_total = normalize(predictions_total)
        sim = normalized_predictions_algo @ normalized_predictions_total.T
        sim_equals_1 = np.isclose(sim, 1)
        bool_indices_tree_algo = np.sum(sim_equals_1, axis=0).astype(bool)
        # concat = np.vstack([predictions_algo, predictions_total])
        for perp in range(1, 20, 3):
            # tsne_obj = TSNE(n_components=2, perplexity=perp)
            tsne_obj = Isomap(n_components=2, n_neighbors=perp)
            X_embedded = tsne_obj.fit_transform(predictions_total)
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], mode='markers', name="Base"))
            fig.add_trace(go.Scatter(x=X_embedded[bool_indices_tree_algo, 0], y=X_embedded[bool_indices_tree_algo, 1], mode='markers', name="NN-OMP"))
            # the last row is the vector of true labels appended above, not an NN-OMP tree
            fig.add_trace(go.Scatter(x=X_embedded[-1:, 0], y=X_embedded[-1:, 1], mode='markers', name="True labels"))
            fig.update_layout(title=f"Isomap {perp}")
            fig.show()
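For reference, the matching step in this script relies on cosine similarity between per-tree prediction vectors: after L2-normalizing each row, a dot product of 1 means two trees predict proportionally identical values, which is how the NN-OMP trees are located inside the full forest's prediction matrix. A toy sketch of the same idea (the arrays below are made up for illustration):

import numpy as np
from sklearn.preprocessing import normalize

full_forest_preds = np.array([[1., 2., 3.],    # tree 0 of the full forest
                              [0., 1., 0.],    # tree 1
                              [2., 2., 2.]])   # tree 2
selected_preds = np.array([[2., 4., 6.]])      # one NN-OMP tree (a scaled copy of tree 0)

sim = normalize(selected_preds) @ normalize(full_forest_preds).T  # cosine similarities
kept = np.isclose(sim, 1).sum(axis=0).astype(bool)                # forest trees matched by a selected tree
print(kept)  # [ True False False]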
......@@ -32,7 +32,7 @@ dct_experiment_id_technique = {"1": NONE,
"9": OMPNN,
# "9": NONE,
# "10": Random,
# "11": OMP,
# "9": OMP,
# "12": OMP_Distillation,
# "13": Kmeans,
# "14": Zhang_Similarities,
......@@ -88,7 +88,7 @@ if __name__ == "__main__":
# dir_name = "results/bolsonaro_models_29-03-20"
# dir_name = "results/bolsonaro_models_29-03-20_v3"
# dir_name = "results/bolsonaro_models_29-03-20_v3"
dir_name = "results/bolsonaro_models_29-03-20_v3_2"
dir_name = "results/bolsonaro_models_29-03-20_v3"
# dir_name = "results/bolsonaro_models_29-03-20"
dir_path = Path(os.environ["project_dir"]) / dir_name
......
import json
from pathlib import Path
import os
import pandas as pd
from pprint import pprint
import pickle
from collections import defaultdict
import numpy as np
from dotenv import load_dotenv, find_dotenv
from numpy import savez
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.data.dataset_parameters import DatasetParameters
dct_experiment_id_subset = dict((str(idx), "train+dev/train+dev") for idx in range(1, 10))
# dct_experiment_id_subset.update(dict((str(idx), "train/dev") for idx in range(9, 17)))
NONE = 'None'
Random = 'Random'
OMP = 'OMP'
OMPNN = 'NN-OMP'
OMP_Distillation = 'OMP Distillation'
Kmeans = 'Kmeans'
Zhang_Similarities = 'Zhang Similarities'
Zhang_Predictions = 'Zhang Predictions'
Ensemble = 'Ensemble'
dct_experiment_id_technique = {"1": NONE,
"2": Random,
"3": OMP,
"4": OMP_Distillation,
"5": Kmeans,
"6": Zhang_Similarities,
"7": Zhang_Predictions,
"8": Ensemble,
"9": OMPNN,
# "9": NONE,
# "10": Random,
# "9": OMP,
# "12": OMP_Distillation,
# "13": Kmeans,
# "14": Zhang_Similarities,
# "15": Zhang_Predictions,
# "16": Ensemble
}
dct_dataset_fancy = {
"boston": "Boston",
"breast_cancer": "Breast Cancer",
"california_housing": "California Housing",
"diabetes": "Diabetes",
"diamonds": "Diamonds",
"digits": "Digits",
"iris": "Iris",
"kin8nm": "Kin8nm",
"kr-vs-kp": "KR-VS-KP",
"olivetti_faces": "Olivetti Faces",
"spambase": "Spambase",
"steel-plates": "Steel Plates",
"wine": "Wine",
"gamma": "Gamma",
"lfw_pairs": "LFW Pairs"
}
dct_dataset_base_forest_size = {
"boston": 100,
"breast_cancer": 1000,
"california_housing": 1000,
"diabetes": 108,
"diamonds": 429,
"digits": 1000,
"iris": 1000,
"kin8nm": 1000,
"kr-vs-kp": 1000,
"olivetti_faces": 1000,
"spambase": 1000,
"steel-plates": 1000,
"wine": 1000,
"gamma": 100,
"lfw_pairs": 1000,
}
lst_attributes_tree_scores = ["dev_scores", "train_scores", "test_scores"]
skip_attributes = ["datetime"]
if __name__ == "__main__":
    load_dotenv(find_dotenv('.env'))
    # dir_name = "results/bolsonaro_models_25-03-20"
    # dir_name = "results/bolsonaro_models_27-03-20_v2"
    # dir_name = "results/bolsonaro_models_29-03-20"
    # dir_name = "results/bolsonaro_models_29-03-20_v3"
    # dir_name = "results/bolsonaro_models_29-03-20_v3"
    dir_name = "results/models"
    # dir_name = "results/bolsonaro_models_29-03-20"
    dir_path = Path(os.environ["project_dir"]) / dir_name
    output_dir = dir_path / "predictions"
    # output_dir_file = dir_path / "results.csv"
    dct_results = defaultdict(lambda: [])
    for root, dirs, files in os.walk(dir_path, topdown=False):
        for file_str in files:
            if file_str != "selected_trees.pickle":
                continue
            # if file_str == "results.csv":
            #     continue
            path_dir = Path(root)
            path_file = path_dir / file_str
            print(path_file)
            try:
                with open(path_file, 'rb') as pickle_file:
                    lst_selected_trees = pickle.load(pickle_file)
            except Exception:
                print("problem loading pickle file {}".format(path_file))
                continue  # nothing below can run without the selected trees
            path_dir_split = str(path_dir).split("/")
            bool_wo_weights = "no_weights" in str(path_file)
            if bool_wo_weights:
                forest_size = int(path_dir_split[-1].split("_")[0])
            else:
                forest_size = int(path_dir_split[-1])
            seed = int(path_dir_split[-3])
            id_xp = str(path_dir_split[-5])
            dataset = str(path_dir_split[-7])
            dct_results["forest_size"].append(forest_size)
            dct_results["seed"].append(seed)
            dct_results["dataset"].append(dct_dataset_fancy[dataset])
            dct_results["subset"].append(dct_experiment_id_subset[id_xp])
            dct_results["strategy"].append(dct_experiment_id_technique[id_xp])
            dct_results["wo_weights"].append(bool_wo_weights)
            dct_results["base_forest_size"].append(dct_dataset_base_forest_size[dataset])
            pruning_percent = forest_size / dct_dataset_base_forest_size[dataset]
            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=2))
            # assert len(lst_selected_trees) == forest_size
            dct_results["actual-forest-size"].append(len(lst_selected_trees))
            with open(path_dir.parent.parent / f"dataset_parameters_{id_xp}.json", 'r') as jsonparamfile:
                dataset_parameters_dict = json.load(jsonparamfile)
            dataset_parameters = DatasetParameters(**dataset_parameters_dict)
            dataset = DatasetLoader.load(dataset_parameters)
            arr_pred_selected_trees = np.array([tree.predict(dataset.X_train) for tree in lst_selected_trees]).T
            arr_true_labels = dataset.y_train
            output_dir_curr = output_dir / dct_results["dataset"][-1] / str(dct_results["forest_size"][-1]) / dct_results["strategy"][-1]
            output_dir_curr.mkdir(parents=True, exist_ok=True)
            savez(output_dir_curr / "predictions_train.npz", Y_preds=arr_pred_selected_trees)
            savez(output_dir_curr.parent / "true_labels.npz", Y_true=arr_true_labels)
            print()
    # todo load dataset
    # evaluate trees on data set: -> matrix of predictions
    # store matrix of predictions
    # store true labels
    # final_df = pd.DataFrame.from_dict(dct_results)
    # final_df.to_csv(output_dir_file)
    # print(final_df)
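This script writes one predictions_train.npz (key Y_preds, shape n_samples x n_trees) per dataset/forest-size/strategy directory and a shared true_labels.npz (key Y_true) one level up; the visualization script earlier in this commit reads them back. A minimal loading sketch, with a hypothetical dataset, forest size and strategy:

import numpy as np
from pathlib import Path

# Hypothetical location; the layout is results/models/predictions/<dataset>/<forest size>/<strategy>/
pred_dir = Path("results/models/predictions") / "California Housing" / "100" / "NN-OMP"
tree_preds = np.load(pred_dir / "predictions_train.npz")["Y_preds"]    # one column per selected tree
true_labels = np.load(pred_dir.parent / "true_labels.npz")["Y_true"]   # shared across strategies
print(tree_preds.shape, true_labels.shape)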