From 8e1d5f0ce5e4c55754566ff05203ac91079b17bb Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@ens-lyon.fr>
Date: Fri, 4 Jun 2021 17:48:54 +0200
Subject: [PATCH] expe viz correlation

---
 code/bolsonaro/models/nn_omp.py              |   2 +-
 code/compute_results.py                      |   2 +-
 code/train.py                                |   2 +-
 code/vizualisation/csv_to_figure.py          |  42 ++---
 code/vizualisation/csv_to_table.py           |   2 +-
 code/vizualisation/preds_to_viz.py           |  72 ++++++++
 code/vizualisation/results_to_csv.py         |   4 +-
 code/vizualisation/results_to_predictions.py | 168 +++++++++++++++++++
 8 files changed, 268 insertions(+), 26 deletions(-)
 create mode 100644 code/vizualisation/preds_to_viz.py
 create mode 100644 code/vizualisation/results_to_predictions.py

diff --git a/code/bolsonaro/models/nn_omp.py b/code/bolsonaro/models/nn_omp.py
index af8a11a..213295f 100644
--- a/code/bolsonaro/models/nn_omp.py
+++ b/code/bolsonaro/models/nn_omp.py
@@ -2,7 +2,7 @@ from copy import deepcopy
 
 from scipy.optimize import nnls
 import numpy as np
-from sklearn.linear_model.base import _preprocess_data
+from sklearn.linear_model._base import _preprocess_data
 
 from bolsonaro import LOG_PATH
 
diff --git a/code/compute_results.py b/code/compute_results.py
index 111cac2..d3c3c50 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -686,7 +686,7 @@ if __name__ == "__main__":
             30 + 1,
             endpoint=True)[1:]).astype(np.int)).tolist()"""
 
-        #extracted_forest_sizes = [4, 7, 11, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
+        #extracted_forest_sizes = [4, 7, 9, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
 
         #extracted_forest_sizes = [str(forest_size) for forest_size in extracted_forest_sizes]
         extracted_forest_sizes= list()
diff --git a/code/train.py b/code/train.py
index f5c6982..c4f3bf7 100644
--- a/code/train.py
+++ b/code/train.py
@@ -212,7 +212,7 @@ Command lines example for stage 3:
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
-python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing
+python code/compute_results.py --stage 3 --experiment_ids 9 12 13 --dataset_name=california_housing
 
 Command lines example for stage 4:
 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
diff --git a/code/vizualisation/csv_to_figure.py b/code/vizualisation/csv_to_figure.py
index 6f7b61c..2c5a532 100644
--- a/code/vizualisation/csv_to_figure.py
+++ b/code/vizualisation/csv_to_figure.py
@@ -15,15 +15,15 @@ lst_task_train_dev = ["coherence", "correlation"]
 
 tasks = [
     # "train_score",
-    "dev_score",
-    "test_score",
+    # "dev_score",
+    # "test_score",
     # "coherence",
-    # "correlation",
+    "train_correlation",
     # "negative-percentage",
     # "dev_strength",
     # "test_strength",
     # "dev_correlation",
-    # "test_correlation",
+    "test_correlation",
     # "dev_coherence",
     # "test_coherence",
     # "negative-percentage-test-score"
@@ -266,13 +266,14 @@ def base_figures(skip_NN=False):
                     add_trace_from_df(df_strat_wo_weights, fig, task, strat)
 
             title = "{} {}".format(task, data_name)
-            yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
-            xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
-
+            # yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
+            # xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
+            xaxis_title = "% Selected Trees"
+            yaxis_title = "Mean absolute correlation of normalized trees"
             if not skip_nn:
                 fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
             fig.update_layout(barmode='group',
-                              # title=title,
+                              title=title,
                               xaxis_title=xaxis_title,
                               yaxis_title=yaxis_title,
                               font=dict(
@@ -281,13 +282,13 @@ def base_figures(skip_NN=False):
                                   color="black"
                               ),
                                 showlegend = False,
-                                margin = dict(
-                                    l=1,
-                                    r=1,
-                                    b=1,
-                                    t=1,
-                                    # pad=4
-                                ),
+                                # margin = dict(
+                                #     l=1,
+                                #     r=1,
+                                #     b=1,
+                                #     t=1,
+                                #     # pad=4
+                                # ),
                               legend=dict(
                                   traceorder="normal",
                                   font=dict(
@@ -300,7 +301,7 @@ def base_figures(skip_NN=False):
                                   borderwidth=1,
                               )
                               )
-            # fig.show()
+            fig.show()
             if skip_NN:
                 str_no_nn = " no nn"
                 title += str_no_nn
@@ -646,7 +647,7 @@ def effect_of_weights_figure():
 if __name__ == "__main__":
 
     load_dotenv(find_dotenv('.env'))
-    dir_name = "bolsonaro_models_29-03-20_v3_2"
+    dir_name = "bolsonaro_models_29-03-20_v3"
     dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
 
     out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
@@ -658,8 +659,9 @@ if __name__ == "__main__":
     strategies = set(df_results["strategy"].values)
     subsets = set(df_results["subset"].values)
 
-    for skip_nn in [True, False]:
+    # for skip_nn in [True, False]:
+    for skip_nn in [False]:
         base_figures(skip_nn)
-    effect_of_weights_figure()
-    weights_wrt_size()
+    # effect_of_weights_figure()
+    # weights_wrt_size()
     # global_figure()
diff --git a/code/vizualisation/csv_to_table.py b/code/vizualisation/csv_to_table.py
index 0e05e33..a0eedc6 100644
--- a/code/vizualisation/csv_to_table.py
+++ b/code/vizualisation/csv_to_table.py
@@ -115,7 +115,7 @@ def get_max_from_df(df, best_fct):
 if __name__ == "__main__":
 
     load_dotenv(find_dotenv('.env'))
-    dir_name = "bolsonaro_models_29-03-20_v3_2"
+    dir_name = "bolsonaro_models_29-03-20_v3"
     dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
 
     out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
diff --git a/code/vizualisation/preds_to_viz.py b/code/vizualisation/preds_to_viz.py
new file mode 100644
index 0000000..ed4b684
--- /dev/null
+++ b/code/vizualisation/preds_to_viz.py
@@ -0,0 +1,72 @@
+from collections import defaultdict
+import plotly.graph_objects as go
+import numpy as np
+from pathlib import Path
+import os
+
+from dotenv import find_dotenv, load_dotenv
+from sklearn.decomposition import PCA
+from sklearn.manifold import TSNE, MDS, Isomap, LocallyLinearEmbedding
+from sklearn.preprocessing import normalize
+
+if __name__ == "__main__":
+    load_dotenv(find_dotenv('.env'))
+    dir_name = "results/models/predictions"
+    dir_path = Path(os.environ["project_dir"]) / dir_name
+
+    dct_dataset_true_labels = dict()
+    dct_dataset_algo_preds = defaultdict(dict)
+    for dataset_path in dir_path.glob('*'):
+        dataset_name = dataset_path.name
+        max_forest_size = np.max(list(map(lambda x : int(x.name), dataset_path.glob("*"))))
+        for forest_size_path in dataset_path.glob("*"):
+            pruned_forest_size = int(forest_size_path.name)
+            if pruned_forest_size != int(10 / 100 * max_forest_size) and pruned_forest_size != max_forest_size:
+                continue
+            for algoname_path in forest_size_path.glob("*"):
+                algoname = algoname_path.name
+                if algoname == "true_labels.npz":
+                    if dct_dataset_true_labels.get(dataset_name, None) is None:
+                        # store the true labels for the task
+                        true_labels_path = algoname_path
+                        loaded_true_labels = np.load(true_labels_path)["Y_true"]
+                        dct_dataset_true_labels[dataset_name] = loaded_true_labels
+                    else:
+                        continue
+                else:
+                    path_predictions = algoname_path / "predictions_train.npz"
+                    loaded_predictions = np.load(path_predictions)["Y_preds"]
+                    dct_dataset_algo_preds[dataset_name][algoname] = loaded_predictions
+
+    print(dct_dataset_true_labels)
+    print(dct_dataset_algo_preds)
+
+    for dataset_name in dct_dataset_algo_preds:
+        predictions_algo = dct_dataset_algo_preds[dataset_name]["NN-OMP"].T
+        try:
+            predictions_total = dct_dataset_algo_preds[dataset_name]["None"].T
+        except KeyError:
+            continue
+        real_preds = dct_dataset_true_labels[dataset_name].reshape(1, -1)
+
+        predictions_total = np.vstack([predictions_total, real_preds])
+
+        normalized_predictions_algo = normalize(predictions_algo)
+        normalized_predictions_total = normalize(predictions_total)
+        sim = normalized_predictions_algo @ normalized_predictions_total.T
+
+        sim_equals_1 = np.isclose(sim, 1)
+        bool_indices_tree_algo = np.sum(sim_equals_1, axis=0).astype(bool)
+
+        # concat = np.vstack([predictions_algo, predictions_total])
+        for perp in range(1, 20, 3):
+            # tsne_obj = TSNE(n_components=2, perplexity=perp)
+            tsne_obj = Isomap(n_components=2, n_neighbors=perp)
+            X_embedded = tsne_obj.fit_transform(predictions_total)
+            fig = go.Figure()
+            fig.add_trace(go.Scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], mode='markers', name="Base"))
+            fig.add_trace(go.Scatter(x=X_embedded[bool_indices_tree_algo, 0], y=X_embedded[bool_indices_tree_algo, 1], mode='markers', name="NN-OMP"))
+            fig.add_trace(go.Scatter(x=X_embedded[-1:, 0], y=X_embedded[-1:, 1], mode='markers', name="True labels"))
+
+            fig.update_layout(title=f"Isomap {perp}")
+            fig.show()
diff --git a/code/vizualisation/results_to_csv.py b/code/vizualisation/results_to_csv.py
index 53c7785..454c14b 100644
--- a/code/vizualisation/results_to_csv.py
+++ b/code/vizualisation/results_to_csv.py
@@ -32,7 +32,7 @@ dct_experiment_id_technique = {"1": NONE,
                                "9": OMPNN,
                                # "9": NONE,
                                # "10": Random,
-                               # "11": OMP,
+                               # "9": OMP,
                                # "12": OMP_Distillation,
                                # "13": Kmeans,
                                # "14": Zhang_Similarities,
@@ -88,7 +88,7 @@ if __name__ == "__main__":
     # dir_name = "results/bolsonaro_models_29-03-20"
     # dir_name = "results/bolsonaro_models_29-03-20_v3"
     # dir_name = "results/bolsonaro_models_29-03-20_v3"
-    dir_name = "results/bolsonaro_models_29-03-20_v3_2"
+    dir_name = "results/bolsonaro_models_29-03-20_v3"
     # dir_name = "results/bolsonaro_models_29-03-20"
     dir_path = Path(os.environ["project_dir"]) / dir_name
 
diff --git a/code/vizualisation/results_to_predictions.py b/code/vizualisation/results_to_predictions.py
new file mode 100644
index 0000000..da4eea1
--- /dev/null
+++ b/code/vizualisation/results_to_predictions.py
@@ -0,0 +1,168 @@
+import json
+from pathlib import Path
+import os
+import pandas as pd
+from pprint import pprint
+import pickle
+from collections import defaultdict
+import numpy as np
+
+from dotenv import load_dotenv, find_dotenv
+from numpy import savez
+
+from bolsonaro.data.dataset_loader import DatasetLoader
+from bolsonaro.data.dataset_parameters import DatasetParameters
+
+dct_experiment_id_subset = dict((str(idx), "train+dev/train+dev") for idx in range(1, 10))
+# dct_experiment_id_subset.update(dict((str(idx), "train/dev") for idx in range(9, 17)))
+
+NONE = 'None'
+Random = 'Random'
+OMP = 'OMP'
+OMPNN = 'NN-OMP'
+OMP_Distillation = 'OMP Distillation'
+Kmeans = 'Kmeans'
+Zhang_Similarities = 'Zhang Similarities'
+Zhang_Predictions = 'Zhang Predictions'
+Ensemble = 'Ensemble'
+dct_experiment_id_technique = {"1": NONE,
+                               "2": Random,
+                               "3": OMP,
+                               "4": OMP_Distillation,
+                               "5": Kmeans,
+                               "6": Zhang_Similarities,
+                               "7": Zhang_Predictions,
+                               "8": Ensemble,
+                               "9": OMPNN,
+                               # "9": NONE,
+                               # "10": Random,
+                               # "9": OMP,
+                               # "12": OMP_Distillation,
+                               # "13": Kmeans,
+                               # "14": Zhang_Similarities,
+                               # "15": Zhang_Predictions,
+                               # "16": Ensemble
+                               }
+
+
+dct_dataset_fancy = {
+    "boston": "Boston",
+    "breast_cancer": "Breast Cancer",
+    "california_housing": "California Housing",
+    "diabetes": "Diabetes",
+    "diamonds": "Diamonds",
+    "digits": "Digits",
+    "iris": "Iris",
+    "kin8nm": "Kin8nm",
+    "kr-vs-kp": "KR-VS-KP",
+    "olivetti_faces": "Olivetti Faces",
+    "spambase": "Spambase",
+    "steel-plates": "Steel Plates",
+    "wine": "Wine",
+    "gamma": "Gamma",
+    "lfw_pairs": "LFW Pairs"
+}
+
+dct_dataset_base_forest_size = {
+    "boston": 100,
+    "breast_cancer": 1000,
+    "california_housing": 1000,
+    "diabetes": 108,
+    "diamonds": 429,
+    "digits": 1000,
+    "iris": 1000,
+    "kin8nm": 1000,
+    "kr-vs-kp": 1000,
+    "olivetti_faces": 1000,
+    "spambase": 1000,
+    "steel-plates": 1000,
+    "wine": 1000,
+    "gamma": 100,
+    "lfw_pairs": 1000,
+}
+
+lst_attributes_tree_scores = ["dev_scores", "train_scores", "test_scores"]
+skip_attributes = ["datetime"]
+
+if __name__ == "__main__":
+
+    load_dotenv(find_dotenv('.env'))
+    # dir_name = "results/bolsonaro_models_25-03-20"
+    # dir_name = "results/bolsonaro_models_27-03-20_v2"
+    # dir_name = "results/bolsonaro_models_29-03-20"
+    # dir_name = "results/bolsonaro_models_29-03-20_v3"
+    # dir_name = "results/bolsonaro_models_29-03-20_v3"
+    dir_name = "results/models"
+    # dir_name = "results/bolsonaro_models_29-03-20"
+    dir_path = Path(os.environ["project_dir"]) / dir_name
+
+    output_dir = dir_path / "predictions"
+    # output_dir_file = dir_path / "results.csv"
+
+    dct_results = defaultdict(lambda: [])
+
+    for root, dirs, files in os.walk(dir_path, topdown=False):
+        for file_str in files:
+            if file_str != "selected_trees.pickle":
+                continue
+
+            # if file_str == "results.csv":
+            #     continue
+            path_dir = Path(root)
+            path_file = path_dir / file_str
+            print(path_file)
+            try:
+                with open(path_file, 'rb') as pickle_file:
+                    lst_selected_trees = pickle.load(pickle_file)
+            except Exception:
+                print("problem loading pickle file {}".format(path_file)); continue
+
+            path_dir_split = str(path_dir).split("/")
+
+            bool_wo_weights = "no_weights" in str(path_file)
+
+            if bool_wo_weights:
+                forest_size = int(path_dir_split[-1].split("_")[0])
+            else:
+                forest_size = int(path_dir_split[-1])
+
+            seed = int(path_dir_split[-3])
+            id_xp = str(path_dir_split[-5])
+            dataset = str(path_dir_split[-7])
+
+            dct_results["forest_size"].append(forest_size)
+            dct_results["seed"].append(seed)
+            dct_results["dataset"].append(dct_dataset_fancy[dataset])
+            dct_results["subset"].append(dct_experiment_id_subset[id_xp])
+            dct_results["strategy"].append(dct_experiment_id_technique[id_xp])
+            dct_results["wo_weights"].append(bool_wo_weights)
+            dct_results["base_forest_size"].append(dct_dataset_base_forest_size[dataset])
+            pruning_percent = forest_size / dct_dataset_base_forest_size[dataset]
+            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=2))
+            # assert len(lst_selected_trees) == forest_size
+            dct_results["actual-forest-size"].append(len(lst_selected_trees))
+
+            with open(path_dir.parent.parent / f"dataset_parameters_{id_xp}.json", 'r') as jsonparamfile:
+                dataset_parameters_dict = json.load(jsonparamfile)
+            dataset_parameters = DatasetParameters(**dataset_parameters_dict)
+            dataset = DatasetLoader.load(dataset_parameters)
+            arr_pred_selected_trees = np.array([tree.predict(dataset.X_train) for tree in lst_selected_trees]).T
+            arr_true_labels = dataset.y_train
+
+            output_dir_curr = output_dir / dct_results["dataset"][-1] / str(dct_results["forest_size"][-1]) / dct_results["strategy"][-1]
+            output_dir_curr.mkdir(parents=True, exist_ok=True)
+
+            savez(output_dir_curr / "predictions_train.npz", Y_preds=arr_pred_selected_trees)
+            savez(output_dir_curr.parent / "true_labels.npz", Y_true=arr_true_labels)
+
+            print()
+            # todo load dataset
+            # evaluate trees on data set: -> matrix of predictions
+            # store matrix of predictions
+            # store true labels
+
+
+
+    # final_df = pd.DataFrame.from_dict(dct_results)
+    # final_df.to_csv(output_dir_file)
+    # print(final_df)
-- 
GitLab