From 8e1d5f0ce5e4c55754566ff05203ac91079b17bb Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@ens-lyon.fr>
Date: Fri, 4 Jun 2021 17:48:54 +0200
Subject: [PATCH] expe viz correlation

---
Review notes (free text between the "---" separator and the first diff;
not part of the commit message or of any hunk).

Formatting
  * This file had lost its line breaks. The line structure below was
    rebuilt from the hunk headers and the diffstat; leading indentation and
    the position of blank context lines are inferred -- verify against the
    tree before applying.

Probably unintended changes
  * compute_results.py, train.py, results_to_csv.py and the commented
    mapping in results_to_predictions.py all turn "11" into "9"
    ("[4, 7, 11, 14, ..." -> "[4, 7, 9, 14, ...",
    "--experiment_ids 11 12 13" -> "9 12 13", '# "11": OMP' -> '# "9": OMP').
    This looks like a stray search-and-replace -- TODO confirm and drop.

csv_to_figure.py
  * yaxis_title is now the fixed string "Mean absolute correlation of
    normalized trees" for every entry of `tasks`; re-enabling a score task
    will mislabel its axis.
  * base_figures() takes `skip_NN` but the context line reads
    `if not skip_nn:`; that name presumably resolves to the loop variable
    of the `__main__` block -- confirm this is intended.
  * fig.show() is now called for every figure produced by base_figures().

preds_to_viz.py
  * The third scatter trace plots X_embedded[-1:], i.e. the true-labels row
    stacked last by np.vstack, but is named "NN-OMP" like the second trace.
  * dct_dataset_algo_preds[dataset_name]["NN-OMP"] is read outside the
    try block, so a dataset without NN-OMP predictions raises KeyError; the
    lookup of "None" is guarded by a bare `except:`.
  * Predictions are stored per algorithm name only, while both the 10% and
    the full forest size pass the filter: whichever directory is globbed
    last overwrites the other.
  * `tsne_obj` holds an Isomap instance and `perp` is used as n_neighbors.

results_to_predictions.py
  * The bare `except:` around pickle.load only prints; execution then
    continues with `lst_selected_trees` unbound (first file) or left over
    from the previous file.
  * `dataset` is first the dataset name (str) and later rebound to the
    object returned by DatasetLoader.load().
  * str(path_dir).split("/") assumes POSIX path separators.
  * The output path is dataset / forest_size / strategy with no seed
    component, so runs with different seeds overwrite each other;
    true_labels.npz is rewritten once per strategy.

 code/bolsonaro/models/nn_omp.py              |   2 +-
 code/compute_results.py                      |   2 +-
 code/train.py                                |   2 +-
 code/vizualisation/csv_to_figure.py          |  42 ++---
 code/vizualisation/csv_to_table.py           |   2 +-
 code/vizualisation/preds_to_viz.py           |  72 ++++++++
 code/vizualisation/results_to_csv.py         |   4 +-
 code/vizualisation/results_to_predictions.py | 168 +++++++++++++++++++
 8 files changed, 268 insertions(+), 26 deletions(-)
 create mode 100644 code/vizualisation/preds_to_viz.py
 create mode 100644 code/vizualisation/results_to_predictions.py

diff --git a/code/bolsonaro/models/nn_omp.py b/code/bolsonaro/models/nn_omp.py
index af8a11a..213295f 100644
--- a/code/bolsonaro/models/nn_omp.py
+++ b/code/bolsonaro/models/nn_omp.py
@@ -2,7 +2,7 @@ from copy import deepcopy
 from scipy.optimize import nnls
 import numpy as np
 
-from sklearn.linear_model.base import _preprocess_data
+from sklearn.linear_model._base import _preprocess_data
 
 from bolsonaro import LOG_PATH
 
diff --git a/code/compute_results.py b/code/compute_results.py
index 111cac2..d3c3c50 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -686,7 +686,7 @@ if __name__ == "__main__":
         30 + 1,
         endpoint=True)[1:]).astype(np.int)).tolist()"""
 
-    #extracted_forest_sizes = [4, 7, 11, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
+    #extracted_forest_sizes = [4, 7, 9, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
 
     #extracted_forest_sizes = [str(forest_size) for forest_size in extracted_forest_sizes]
     extracted_forest_sizes= list()
diff --git a/code/train.py b/code/train.py
index f5c6982..c4f3bf7 100644
--- a/code/train.py
+++ b/code/train.py
@@ -212,7 +212,7 @@ Command lines example for stage 3:
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
-python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing
+python code/compute_results.py --stage 3 --experiment_ids 9 12 13 --dataset_name=california_housing
 
 Command lines example for stage 4:
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
diff --git a/code/vizualisation/csv_to_figure.py b/code/vizualisation/csv_to_figure.py
index 6f7b61c..2c5a532 100644
--- a/code/vizualisation/csv_to_figure.py
+++ b/code/vizualisation/csv_to_figure.py
@@ -15,15 +15,15 @@ lst_task_train_dev = ["coherence", "correlation"]
 
 tasks = [
     # "train_score",
-    "dev_score",
-    "test_score",
+    # "dev_score",
+    # "test_score",
     # "coherence",
-    # "correlation",
+    "train_correlation",
     # "negative-percentage",
     # "dev_strength",
     # "test_strength",
     # "dev_correlation",
-    # "test_correlation",
+    "test_correlation",
     # "dev_coherence",
     # "test_coherence",
     # "negative-percentage-test-score"
@@ -266,13 +266,14 @@ def base_figures(skip_NN=False):
                     add_trace_from_df(df_strat_wo_weights, fig, task, strat)
 
         title = "{} {}".format(task, data_name)
-        yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
-        xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
-
+        # yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
+        # xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
+        xaxis_title = "% Selected Trees"
+        yaxis_title = "Mean absolute correlation of normalized trees"
         if not skip_nn:
             fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
         fig.update_layout(barmode='group',
-                          # title=title,
+                          title=title,
                           xaxis_title=xaxis_title,
                           yaxis_title=yaxis_title,
                           font=dict(
@@ -281,13 +282,13 @@ def base_figures(skip_NN=False):
                               color="black"
                           ),
                           showlegend = False,
-                          margin = dict(
-                              l=1,
-                              r=1,
-                              b=1,
-                              t=1,
-                              # pad=4
-                          ),
+                          # margin = dict(
+                          #     l=1,
+                          #     r=1,
+                          #     b=1,
+                          #     t=1,
+                          #     # pad=4
+                          # ),
                           legend=dict(
                               traceorder="normal",
                               font=dict(
@@ -300,7 +301,7 @@ def base_figures(skip_NN=False):
                               borderwidth=1,
                           )
                           )
-        # fig.show()
+        fig.show()
         if skip_NN:
             str_no_nn = " no nn"
             title += str_no_nn
@@ -646,7 +647,7 @@ def effect_of_weights_figure():
 
 if __name__ == "__main__":
     load_dotenv(find_dotenv('.env'))
-    dir_name = "bolsonaro_models_29-03-20_v3_2"
+    dir_name = "bolsonaro_models_29-03-20_v3"
     dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
 
     out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
@@ -658,8 +659,9 @@ if __name__ == "__main__":
     strategies = set(df_results["strategy"].values)
     subsets = set(df_results["subset"].values)
 
-    for skip_nn in [True, False]:
+    # for skip_nn in [True, False]:
+    for skip_nn in [False]:
         base_figures(skip_nn)
-    effect_of_weights_figure()
-    weights_wrt_size()
+    # effect_of_weights_figure()
+    # weights_wrt_size()
     # global_figure()
diff --git a/code/vizualisation/csv_to_table.py b/code/vizualisation/csv_to_table.py
index 0e05e33..a0eedc6 100644
--- a/code/vizualisation/csv_to_table.py
+++ b/code/vizualisation/csv_to_table.py
@@ -115,7 +115,7 @@ def get_max_from_df(df, best_fct):
 
 if __name__ == "__main__":
     load_dotenv(find_dotenv('.env'))
-    dir_name = "bolsonaro_models_29-03-20_v3_2"
+    dir_name = "bolsonaro_models_29-03-20_v3"
     dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
 
     out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
diff --git a/code/vizualisation/preds_to_viz.py b/code/vizualisation/preds_to_viz.py
new file mode 100644
index 0000000..ed4b684
--- /dev/null
+++ b/code/vizualisation/preds_to_viz.py
@@ -0,0 +1,72 @@
+from collections import defaultdict
+import plotly.graph_objects as go
+import numpy as np
+from pathlib import Path
+import os
+
+from dotenv import find_dotenv, load_dotenv
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE, MDS, Isomap, LocallyLinearEmbedding
+from sklearn.preprocessing import normalize
+
+if __name__ == "__main__":
+    load_dotenv(find_dotenv('.env'))
+    dir_name = "results/models/predictions"
+    dir_path = Path(os.environ["project_dir"]) / dir_name
+
+    dct_dataset_true_labels = dict()
+    dct_dataset_algo_preds = defaultdict(dict)
+    for dataset_path in dir_path.glob('*'):
+        dataset_name = dataset_path.name
+        max_forest_size = np.max(list(map(lambda x : int(x.name), dataset_path.glob("*"))))
+        for forest_size_path in dataset_path.glob("*"):
+            pruned_forest_size = int(forest_size_path.name)
+            if pruned_forest_size != int(10 / 100 * max_forest_size) and pruned_forest_size != max_forest_size:
+                continue
+            for algoname_path in forest_size_path.glob("*"):
+                algoname = algoname_path.name
+                if algoname == "true_labels.npz":
+                    if dct_dataset_true_labels.get(dataset_name, None) is None:
+                        # store the true labels for the task
+                        true_labels_path = algoname_path
+                        loaded_true_labels = np.load(true_labels_path)["Y_true"]
+                        dct_dataset_true_labels[dataset_name] = loaded_true_labels
+                    else:
+                        continue
+                else:
+                    path_predictions = algoname_path / "predictions_train.npz"
+                    loaded_predictions = np.load(path_predictions)["Y_preds"]
+                    dct_dataset_algo_preds[dataset_name][algoname] = loaded_predictions
+
+    print(dct_dataset_true_labels)
+    print(dct_dataset_algo_preds)
+
+    for dataset_name in dct_dataset_algo_preds:
+        predictions_algo = dct_dataset_algo_preds[dataset_name]["NN-OMP"].T
+        try:
+            predictions_total = dct_dataset_algo_preds[dataset_name]["None"].T
+        except:
+            continue
+        real_preds = dct_dataset_true_labels[dataset_name].reshape(1, -1)
+
+        predictions_total = np.vstack([predictions_total, real_preds])
+
+        normalized_predictions_algo = normalize(predictions_algo)
+        normalized_predictions_total = normalize(predictions_total)
+        sim = normalized_predictions_algo @ normalized_predictions_total.T
+
+        sim_equals_1 = np.isclose(sim, 1)
+        bool_indices_tree_algo = np.sum(sim_equals_1, axis=0).astype(bool)
+
+        # concat = np.vstack([predictions_algo, predictions_total])
+        for perp in range(1, 20, 3):
+            # tsne_obj = TSNE(n_components=2, perplexity=perp)
+            tsne_obj = Isomap(n_components=2, n_neighbors=perp)
+            X_embedded = tsne_obj.fit_transform(predictions_total)
+            fig = go.Figure()
+            fig.add_trace(go.Scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], mode='markers', name="Base"))
+            fig.add_trace(go.Scatter(x=X_embedded[bool_indices_tree_algo, 0], y=X_embedded[bool_indices_tree_algo, 1], mode='markers', name="NN-OMP"))
+            fig.add_trace(go.Scatter(x=X_embedded[-1:, 0], y=X_embedded[-1:, 1], mode='markers', name="NN-OMP"))
+
+            fig.update_layout(title=f"Isomap {perp}")
+            fig.show()
diff --git a/code/vizualisation/results_to_csv.py b/code/vizualisation/results_to_csv.py
index 53c7785..454c14b 100644
--- a/code/vizualisation/results_to_csv.py
+++ b/code/vizualisation/results_to_csv.py
@@ -32,7 +32,7 @@ dct_experiment_id_technique = {"1": NONE,
                                "9": OMPNN,
                                # "9": NONE,
                                # "10": Random,
-                               # "11": OMP,
+                               # "9": OMP,
                                # "12": OMP_Distillation,
                                # "13": Kmeans,
                                # "14": Zhang_Similarities,
@@ -88,7 +88,7 @@ if __name__ == "__main__":
     # dir_name = "results/bolsonaro_models_29-03-20"
     # dir_name = "results/bolsonaro_models_29-03-20_v3"
     # dir_name = "results/bolsonaro_models_29-03-20_v3"
-    dir_name = "results/bolsonaro_models_29-03-20_v3_2"
+    dir_name = "results/bolsonaro_models_29-03-20_v3"
     # dir_name = "results/bolsonaro_models_29-03-20"
     dir_path = Path(os.environ["project_dir"]) / dir_name
 
diff --git a/code/vizualisation/results_to_predictions.py b/code/vizualisation/results_to_predictions.py
new file mode 100644
index 0000000..da4eea1
--- /dev/null
+++ b/code/vizualisation/results_to_predictions.py
@@ -0,0 +1,168 @@
+import json
+from pathlib import Path
+import os
+import pandas as pd
+from pprint import pprint
+import pickle
+from collections import defaultdict
+import numpy as np
+
+from dotenv import load_dotenv, find_dotenv
+from numpy import savez
+
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.data.dataset_parameters import DatasetParameters
+
+dct_experiment_id_subset = dict((str(idx), "train+dev/train+dev") for idx in range(1, 10))
+# dct_experiment_id_subset.update(dict((str(idx), "train/dev") for idx in range(9, 17)))
+
+NONE = 'None'
+Random = 'Random'
+OMP = 'OMP'
+OMPNN = 'NN-OMP'
+OMP_Distillation = 'OMP Distillation'
+Kmeans = 'Kmeans'
+Zhang_Similarities = 'Zhang Similarities'
+Zhang_Predictions = 'Zhang Predictions'
+Ensemble = 'Ensemble'
+dct_experiment_id_technique = {"1": NONE,
+                               "2": Random,
+                               "3": OMP,
+                               "4": OMP_Distillation,
+                               "5": Kmeans,
+                               "6": Zhang_Similarities,
+                               "7": Zhang_Predictions,
+                               "8": Ensemble,
+                               "9": OMPNN,
+                               # "9": NONE,
+                               # "10": Random,
+                               # "9": OMP,
+                               # "12": OMP_Distillation,
+                               # "13": Kmeans,
+                               # "14": Zhang_Similarities,
+                               # "15": Zhang_Predictions,
+                               # "16": Ensemble
+                               }
+
+
+dct_dataset_fancy = {
+    "boston": "Boston",
+    "breast_cancer": "Breast Cancer",
+    "california_housing": "California Housing",
+    "diabetes": "Diabetes",
+    "diamonds": "Diamonds",
+    "digits": "Digits",
+    "iris": "Iris",
+    "kin8nm": "Kin8nm",
+    "kr-vs-kp": "KR-VS-KP",
+    "olivetti_faces": "Olivetti Faces",
+    "spambase": "Spambase",
+    "steel-plates": "Steel Plates",
+    "wine": "Wine",
+    "gamma": "Gamma",
+    "lfw_pairs": "LFW Pairs"
+}
+
+dct_dataset_base_forest_size = {
+    "boston": 100,
+    "breast_cancer": 1000,
+    "california_housing": 1000,
+    "diabetes": 108,
+    "diamonds": 429,
+    "digits": 1000,
+    "iris": 1000,
+    "kin8nm": 1000,
+    "kr-vs-kp": 1000,
+    "olivetti_faces": 1000,
+    "spambase": 1000,
+    "steel-plates": 1000,
+    "wine": 1000,
+    "gamma": 100,
+    "lfw_pairs": 1000,
+}
+
+lst_attributes_tree_scores = ["dev_scores", "train_scores", "test_scores"]
+skip_attributes = ["datetime"]
+
+if __name__ == "__main__":
+
+    load_dotenv(find_dotenv('.env'))
+    # dir_name = "results/bolsonaro_models_25-03-20"
+    # dir_name = "results/bolsonaro_models_27-03-20_v2"
+    # dir_name = "results/bolsonaro_models_29-03-20"
+    # dir_name = "results/bolsonaro_models_29-03-20_v3"
+    # dir_name = "results/bolsonaro_models_29-03-20_v3"
+    dir_name = "results/models"
+    # dir_name = "results/bolsonaro_models_29-03-20"
+    dir_path = Path(os.environ["project_dir"]) / dir_name
+
+    output_dir = dir_path / "predictions"
+    # output_dir_file = dir_path / "results.csv"
+
+    dct_results = defaultdict(lambda: [])
+
+    for root, dirs, files in os.walk(dir_path, topdown=False):
+        for file_str in files:
+            if file_str != "selected_trees.pickle":
+                continue
+
+            # if file_str == "results.csv":
+            #     continue
+            path_dir = Path(root)
+            path_file = path_dir / file_str
+            print(path_file)
+            try:
+                with open(path_file, 'rb') as pickle_file:
+                    lst_selected_trees = pickle.load(pickle_file)
+            except:
+                print("problem loading pickle file {}".format(path_file))
+
+            path_dir_split = str(path_dir).split("/")
+
+            bool_wo_weights = "no_weights" in str(path_file)
+
+            if bool_wo_weights:
+                forest_size = int(path_dir_split[-1].split("_")[0])
+            else:
+                forest_size = int(path_dir_split[-1])
+
+            seed = int(path_dir_split[-3])
+            id_xp = str(path_dir_split[-5])
+            dataset = str(path_dir_split[-7])
+
+            dct_results["forest_size"].append(forest_size)
+            dct_results["seed"].append(seed)
+            dct_results["dataset"].append(dct_dataset_fancy[dataset])
+            dct_results["subset"].append(dct_experiment_id_subset[id_xp])
+            dct_results["strategy"].append(dct_experiment_id_technique[id_xp])
+            dct_results["wo_weights"].append(bool_wo_weights)
+            dct_results["base_forest_size"].append(dct_dataset_base_forest_size[dataset])
+            pruning_percent = forest_size / dct_dataset_base_forest_size[dataset]
+            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=2))
+            # assert len(lst_selected_trees) == forest_size
+            dct_results["actual-forest-size"].append(len(lst_selected_trees))
+
+            with open(path_dir.parent.parent / f"dataset_parameters_{id_xp}.json", 'r') as jsonparamfile:
+                dataset_parameters_dict = json.load(jsonparamfile)
+            dataset_parameters = DatasetParameters(**dataset_parameters_dict)
+            dataset = DatasetLoader.load(dataset_parameters)
+            arr_pred_selected_trees = np.array([tree.predict(dataset.X_train) for tree in lst_selected_trees]).T
+            arr_true_labels = dataset.y_train
+
+            output_dir_curr = output_dir / dct_results["dataset"][-1] / str(dct_results["forest_size"][-1]) / dct_results["strategy"][-1]
+            output_dir_curr.mkdir(parents=True, exist_ok=True)
+
+            savez(output_dir_curr / "predictions_train.npz", Y_preds=arr_pred_selected_trees)
+            savez(output_dir_curr.parent / "true_labels.npz", Y_true=arr_true_labels)
+
+            print()
+            # todo load dataset
+            # evaluate trees on data set: -> matrix of predictions
+            # store matrix of predictions
+            # store true labels
+
+
+
+    # final_df = pd.DataFrame.from_dict(dct_results)
+    # final_df.to_csv(output_dir_file)
+    # print(final_df)
-- 
GitLab