From e35b35bf24c10982c1bcc906d5bd7cc1cff2e9f1 Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Thu, 27 Aug 2020 16:09:36 +0200
Subject: [PATCH] code figures (these luc)

---
 code/vizualisation/csv_to_figure.py      | 166 +++++++-----
 code/vizualisation/csv_to_table.py       |  65 ++---
 code/vizualisation/csv_to_table_these.py | 323 +++++++++++++++++++++++
 code/vizualisation/results_to_csv.py     |   4 +-
 4 files changed, 455 insertions(+), 103 deletions(-)
 create mode 100644 code/vizualisation/csv_to_table_these.py

diff --git a/code/vizualisation/csv_to_figure.py b/code/vizualisation/csv_to_figure.py
index 25f5976..a4d0042 100644
--- a/code/vizualisation/csv_to_figure.py
+++ b/code/vizualisation/csv_to_figure.py
@@ -15,7 +15,7 @@ lst_task_train_dev = ["coherence", "correlation"]
 
 tasks = [
     # "train_score",
-    # "dev_score",
+    "dev_score",
     "test_score",
     # "coherence",
     # "correlation",
@@ -109,8 +109,8 @@ def add_trace_from_df(df, fig, task, strat, stop_on_flat=False):
     global GLOBAL_TRACE_TO_ADD_LAST
 
     df.sort_values(by="forest_size", inplace=True)
-    df_groupby_forest_size = df.groupby(['forest_size'])
-    forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)
+    df_groupby_forest_size = df.groupby(['pruning_percent'])
+    forest_sizes = list(df_groupby_forest_size["pruning_percent"].mean().values)
     mean_value = df_groupby_forest_size[task].mean().values
     std_value = df_groupby_forest_size[task].std().values
 
@@ -132,7 +132,8 @@ def add_trace_from_df(df, fig, task, strat, stop_on_flat=False):
                             width=2
                         )
                     ),
-                    showlegend=False
+                    name="Final NN-OMP",
+                    showlegend=True
                 )
 
     forest_sizes = forest_sizes[:index_flat]
@@ -169,7 +170,21 @@ dct_metric_figure = {
     "mean_squared_error": go.Figure()
 }
 
-def base_figures():
+dct_gamma_by_dataset = {
+    "Boston": 5,
+    "Breast Cancer": 5,
+    "California Housing": 5,
+    "Diabetes": 5,
+    "Diamonds": 5,
+    "Kin8nm": 5,
+    "KR-VS-KP": 5,
+    "Spambase": 5,
+    "Steel Plates": 5,
+    "Gamma": 5,
+    "LFW Pairs": 5,
+}
+
+def base_figures(skip_NN=False):
 
     for task in tasks:
         for data_name in datasets:
@@ -183,40 +198,40 @@ def base_figures():
             # all techniques #
             ##################
             for strat in strategies:
-                if strat in lst_skip_strategy:
+                if strat in lst_skip_strategy or (skip_NN and "NN-OMP" in strat):
                     continue
 
-                if task == "negative-percentage-test-score":
-                    if strat == "OMP":
-                        df_strat = df_data[df_data["strategy"] == strat]
-                        df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
-
-                        df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
-
-
-                        forest_sizes = df_groupby_forest_size["forest_size"].mean().values
-                        x_values = df_groupby_forest_size["negative-percentage"].mean().values
-                        y_values = df_groupby_forest_size["test_score"].mean().values
-                        # print(df_strat)
-                        fig.add_trace(go.Scatter(x=x_values, y=y_values,
-                                                 mode='markers',
-                                                 name=strat,
-                                                 # color=forest_sizes,
-                                                 marker=dict(
-                                                    # size=16,
-                                                    # cmax=39,
-                                                    # cmin=0,
-                                                    color=forest_sizes,
-                                                    colorbar=dict(
-                                                        title="Forest Size"
-                                                    ),
-                                                    # colorscale="Viridis"
-                                                ),
-                                                 # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
-                         ))
-
-                    continue
+                # if task == "negative-percentage-test-score":
+                #     if strat == "OMP":
+                #         df_strat = df_data[df_data["strategy"] == strat]
+                #         df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
+                #         df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+                #
+                #         df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
+                #
+                #
+                #         forest_sizes = df_groupby_forest_size["forest_size"].mean().values
+                #         x_values = df_groupby_forest_size["negative-percentage"].mean().values
+                #         y_values = df_groupby_forest_size["test_score"].mean().values
+                #         # print(df_strat)
+                #         fig.add_trace(go.Scatter(x=x_values, y=y_values,
+                #                                  mode='markers',
+                #                                  name=strat,
+                #                                  # color=forest_sizes,
+                #                                  marker=dict(
+                #                                     # size=16,
+                #                                     # cmax=39,
+                #                                     # cmin=0,
+                #                                     color=forest_sizes,
+                #                                     colorbar=dict(
+                #                                         title="Forest Size"
+                #                                     ),
+                #                                     # colorscale="Viridis"
+                #                                 ),
+                #                                  # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
+                #          ))
+                #
+                #     continue
 
 
                 df_strat = df_data[df_data["strategy"] == strat]
@@ -252,9 +267,10 @@ def base_figures():
 
             title = "{} {}".format(task, data_name)
             yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
-            xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "# Selected Trees"
+            xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
 
-            fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
+            if not skip_NN:  # use the function parameter, not the `skip_nn` loop variable of __main__
+                fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
             fig.update_layout(barmode='group',
                               # title=title,
                               xaxis_title=xaxis_title,
@@ -264,7 +280,7 @@ def base_figures():
                                   size=24,
                                   color="black"
                               ),
-                                # showlegend = False,
+                                showlegend = False,
                                 margin = dict(
                                     l=1,
                                     r=1,
@@ -285,6 +301,9 @@ def base_figures():
                               )
                               )
             # fig.show()
+            if skip_NN:
+                str_no_nn = " no nn"
+                title += str_no_nn
             sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
             filename = sanitize(title)
             output_dir = out_dir / sanitize(task)
@@ -375,14 +394,14 @@ def global_figure():
             # fig.show()
 
 def weights_wrt_size():
-    lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
-
+    # lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
+    lst_skip_data_weight_effect = ["Gamma"]
     fig = go.Figure()
 
     for data_name in datasets:
 
-        # if data_name in lst_skip_data_weight_effect:
-        #     continue
+        if data_name in lst_skip_data_weight_effect:
+            continue
         df_data = df_results[df_results["dataset"] == data_name]
         score_metric_name = df_data["score_metric"].values[0]
 
@@ -401,7 +420,7 @@ def weights_wrt_size():
         y_values = df_groupby_forest_size["negative-percentage"].mean().values
         y_values = (y_values - np.min(y_values)) / (np.max(y_values) - np.min(y_values))
 
-        x_values = np.around(df_groupby_forest_size["pruning_percent"].mean().values, decimals=1)
+        x_values = df_groupby_forest_size["pruning_percent"].mean().values
         # x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
 
         # if score_metric_name == "mean_squared_error":
@@ -410,8 +429,8 @@ def weights_wrt_size():
         lin_reg = svm.SVR(gamma=10)
         lin_reg.fit(x_values[:, np.newaxis], y_values)
 
-        xx = np.linspace(0, 1)
-        yy = lin_reg.predict(xx[:, np.newaxis])
+        # xx = np.linspace(0, 1)
+        yy = lin_reg.predict(x_values[:, np.newaxis])
 
         # print(df_strat)
         fig.add_trace(go.Scatter(x=x_values, y=y_values,
@@ -430,7 +449,7 @@ def weights_wrt_size():
                                  ),
                                  # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
                                  ))
-        fig.add_trace(go.Scatter(x=xx, y=yy,
+        fig.add_trace(go.Scatter(x=x_values, y=yy,
                                  mode='lines',
                                  name=strat,
                                  # color=forest_sizes,
@@ -452,8 +471,8 @@ def weights_wrt_size():
     title = "{}".format("weight wrt size")
 
     fig.update_layout(barmode='group',
-                      title=title,
-                      xaxis_title="Pruning percentage",
+                      # title=title,
+                      xaxis_title="% Selected Trees",
                       yaxis_title="Standardized % negative weights",
                       font=dict(
                           # family="Courier New, monospace",
@@ -464,8 +483,8 @@ def weights_wrt_size():
                       margin=dict(
                           l=1,
                           r=1,
-                          b=1,
-                          t=1,
+                          b=3,
+                          t=10,
                           # pad=4
                       ),
                       legend=dict(
@@ -488,12 +507,13 @@ def weights_wrt_size():
     fig.write_image(str((output_dir / filename).absolute()) + ".png")
 
 def effect_of_weights_figure():
-    lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
+    lst_skip_data_weight_effect = ["Gamma"]
+    # lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
 
     fig = go.Figure()
 
     for data_name in datasets:
-
+        #
         # if data_name in lst_skip_data_weight_effect:
         #     continue
         df_data = df_results[df_results["dataset"] == data_name]
@@ -506,29 +526,31 @@ def effect_of_weights_figure():
         df_strat = df_data[df_data["strategy"] == strat]
         df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
         df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
-
         df_strat_wo_weights.sort_values(by="pruning_percent", inplace=True)
 
         df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
 
         x_values = df_groupby_forest_size["negative-percentage"].mean().values
-        x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
-
         y_values = df_groupby_forest_size["test_score"].mean().values
-
         if score_metric_name == "mean_squared_error":
             y_values = 1/y_values
 
+
+        x_values = x_values[3:]
+        y_values = y_values[3:]
+
+        x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
         y_values = (y_values - np.min(y_values)) / (np.max(y_values) - np.min(y_values))
 
-        bins = np.histogram(x_values)[1]
-        indices_x_values = np.digitize(x_values, bins)-1
-        mean_val = np.empty(len(bins)-1)
-        for idx_group in range(len(bins) - 1):
-            mean_val[idx_group] = np.mean(y_values[indices_x_values == idx_group])
+        # bins = np.histogram(x_values)[1]
+        # indices_x_values = np.digitize(x_values, bins)-1
+        # mean_val = np.empty(len(bins)-1)
+        # for idx_group in range(len(bins) - 1):
+        #     mean_val[idx_group] = np.mean(y_values[indices_x_values == idx_group])
 
         # lin_reg = LinearRegression()
-        lin_reg = svm.SVR(gamma=5)
+        # lin_reg = svm.SVR(gamma=dct_gamma_by_dataset[data_name])
+        lin_reg = svm.SVR(gamma=1.)
         lin_reg.fit(x_values[:, np.newaxis], y_values)
 
         xx = np.linspace(0, 1)
@@ -540,6 +562,7 @@ def effect_of_weights_figure():
         fig.add_trace(go.Scatter(x=x_values, y=y_values,
                                  mode='markers',
                                  name=strat,
+                                 showlegend=False,
                                  # color=forest_sizes,
                                  marker=dict(
                                      # size=16,
@@ -576,15 +599,15 @@ def effect_of_weights_figure():
     title = "{}".format("negative weights effect")
 
     fig.update_layout(barmode='group',
-                      title=title,
-                      xaxis_title="Standardized % negative weights",
-                      yaxis_title="Normalized Performance",
+                      # title=title,
+                      xaxis_title="Standardized % Negative Weights",
+                      yaxis_title="Standardized Performance",
                       font=dict(
                           # family="Courier New, monospace",
                           size=24,
                           color="black"
                       ),
-                      showlegend = False,
+                      # showlegend = False,
                       margin=dict(
                           l=1,
                           r=1,
@@ -626,7 +649,8 @@ if __name__ == "__main__":
     strategies = set(df_results["strategy"].values)
     subsets = set(df_results["subset"].values)
 
-    # base_figures()
-    effect_of_weights_figure()
-    weights_wrt_size()
+    for skip_nn in [True, False]:
+        base_figures(skip_nn)
+    # effect_of_weights_figure()
+    # weights_wrt_size()
     # global_figure()
diff --git a/code/vizualisation/csv_to_table.py b/code/vizualisation/csv_to_table.py
index 440e5fc..0e05e33 100644
--- a/code/vizualisation/csv_to_table.py
+++ b/code/vizualisation/csv_to_table.py
@@ -33,18 +33,32 @@ dct_score_metric_best_fct = {
     "mean_squared_error": np.argmin
 }
 
+# dct_data_short = {
+#     "Spambase": "Spambase",
+#     "Diamonds": "Diamonds",
+#     "Diabetes": "Diabetes",
+#     "Steel Plates": "Steel P.",
+#     "KR-VS-KP": "KR-VS-KP",
+#     "Breast Cancer": "Breast C.",
+#     "Kin8nm": "Kin8nm",
+#     "LFW Pairs": "LFW P.",
+#     "Gamma": "Gamma",
+#     "California Housing": "California H.",
+#     "Boston": "Boston",
+# }
+
 dct_data_short = {
-    "Spambase": "Spambase",
-    "Diamonds": "Diamonds",
-    "Diabetes": "Diabetes",
-    "Steel Plates": "Steel P.",
-    "KR-VS-KP": "KR-VS-KP",
-    "Breast Cancer": "Breast C.",
-    "Kin8nm": "Kin8nm",
+    "Spambase": "Sp. B.",
+    "Diamonds": "Diam.",
+    "Diabetes": "Diab.",
+    "Steel Plates": "St. P.",
+    "KR-VS-KP": "KR-KP",
+    "Breast Cancer": "B. C.",
+    "Kin8nm": "Kin.",
     "LFW Pairs": "LFW P.",
-    "Gamma": "Gamma",
-    "California Housing": "California H.",
-    "Boston": "Boston",
+    "Gamma": "Gam.",
+    "California Housing": "C. H.",
+    "Boston": "Bos.",
 }
 
 dct_data_best = {
@@ -101,7 +115,7 @@ def get_max_from_df(df, best_fct):
 if __name__ == "__main__":
 
     load_dotenv(find_dotenv('.env'))
-    dir_name = "bolsonaro_models_25-03-20"
+    dir_name = "bolsonaro_models_29-03-20_v3_2"
     dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
 
     out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
@@ -155,29 +169,19 @@ if __name__ == "__main__":
 
                     if "OMP" in strat:
                         ###########################
-                        # traitement avec weights #
+                        # processing without weights #
                         ###########################
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
-                        if data_name == "Boston" and subset_name == "train+dev/train+dev":
-                            df_strat_wo_weights = df_strat_wo_weights[df_strat_wo_weights["forest_size"] < 400]
-                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
-                        if strat not in lst_strats: lst_strats.append(strat)
+                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
 
-                    if "OMP" in strat and subset_name == "train/dev":
-                        continue
-                    elif "Random" not in strat and subset_name == "train/dev":
-                        continue
+                        strat_woweights = "{} w/o weights".format(strat)
+                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
+                        if strat_woweights not in lst_strats: lst_strats.append(strat_woweights)
 
                     #################################
                     # traitement general wo_weights #
                     #################################
-                    if "Random" in strat:
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
-                    else:
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
+                    df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
 
-                    if "OMP" in strat:
-                        strat = "{} w/o weights".format(strat)
 
                     dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
                     if strat not in lst_strats: lst_strats.append(strat)
@@ -219,7 +223,8 @@ if __name__ == "__main__":
             lst_tpl_results = dct_data_lst_tpl_results[data_name]
             data_name_short = dct_data_short[data_name]
             s_data_tmp = "{}".format(data_name_short)
-            s_data_tmp += "({})".format(dct_data_metric[data_name])
+            # add metric in parenthesis
+            # s_data_tmp += "({})".format(dct_data_metric[data_name])
             # s_data_tmp = "\\texttt{{ {} }}".format(data_name_short)
             # s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name)
             s_data_tmp += " "*(nb_spaces - len(data_name_short))
@@ -292,8 +297,8 @@ if __name__ == "__main__":
                 print("\\midrule")
             if idx_lin == 6:
                 print("\\midrule")
-            if lst_data_ordered[idx_lin-1] == "Diamonds":
-                print("%", end="")
+            # if lst_data_ordered[idx_lin-1] == "Diamonds":
+            #     print("%", end="")
             line_print = " ".join(list(lin))
             line_print = line_print.rstrip(" &") + "\\\\"
             print(line_print)
diff --git a/code/vizualisation/csv_to_table_these.py b/code/vizualisation/csv_to_table_these.py
new file mode 100644
index 0000000..8d4dbee
--- /dev/null
+++ b/code/vizualisation/csv_to_table_these.py
@@ -0,0 +1,323 @@
+import copy
+
+from dotenv import load_dotenv, find_dotenv
+from pathlib import Path
+import os
+import pandas as pd
+import numpy as np
+from pprint import pprint
+import plotly.graph_objects as go
+import plotly.io as pio
+from collections import defaultdict
+
+lst_skip_strategy = ["None", "OMP Distillation", "OMP Distillation w/o weights"]
+lst_skip_task = ["correlation", "coherence"]
+# lst_skip_task = []
+lst_skip_subset = ["train/dev"]
+# lst_skip_subset = []
+
+tasks = [
+    # "train_score",
+    # "dev_score",
+    "test_score",
+    # "coherence",
+    # "correlation"
+]
+
+dct_score_metric_fancy = {
+    "accuracy_score": "% Accuracy",
+    "mean_squared_error": "MSE"
+}
+dct_score_metric_best_fct = {
+    "accuracy_score": np.argmax,
+    "mean_squared_error": np.argmin
+}
+
+# dct_data_short = {
+#     "Spambase": "Spambase",
+#     "Diamonds": "Diamonds",
+#     "Diabetes": "Diabetes",
+#     "Steel Plates": "Steel P.",
+#     "KR-VS-KP": "KR-VS-KP",
+#     "Breast Cancer": "Breast C.",
+#     "Kin8nm": "Kin8nm",
+#     "LFW Pairs": "LFW P.",
+#     "Gamma": "Gamma",
+#     "California Housing": "California H.",
+#     "Boston": "Boston",
+# }
+
+dct_data_short = {
+    "Spambase": "Sp. B.",
+    "Diamonds": "Diam.",
+    "Diabetes": "Diab.",
+    "Steel Plates": "St. P.",
+    "KR-VS-KP": "KR-KP",
+    "Breast Cancer": "B. C.",
+    "Kin8nm": "Kin.",
+    "LFW Pairs": "LFW P.",
+    "Gamma": "Gam.",
+    "California Housing": "C. H.",
+    "Boston": "Bos.",
+}
+
+dct_data_best = {
+    "Spambase": np.max,
+    "Diamonds": np.min,
+    "Diabetes": np.min,
+    "Steel Plates": np.max,
+    "KR-VS-KP": np.max,
+    "Breast Cancer": np.max,
+    "Kin8nm": np.min,
+    "LFW Pairs": np.max,
+    "Gamma": np.max,
+    "California Housing": np.min,
+    "Boston": np.min,
+}
+dct_data_metric = {
+    "Spambase": "Acc.",
+    "Diamonds": "MSE",
+    "Diabetes": "MSE",
+    "Steel Plates": "Acc.",
+    "KR-VS-KP": "Acc.",
+    "Breast Cancer": "Acc.",
+    "Kin8nm": "MSE",
+    "LFW Pairs": "Acc.",
+    "Gamma": "Acc.",
+    "California Housing": "MSE",
+    "Boston": "MSE",
+}
+
+
+
+def get_max_from_df(df, best_fct):
+    """Return (forest_size, mean, std) at the best mean `task` score among the first 10 forest-size groups, or (-1, -1, -1) if there is none."""
+    # NOTE: reads the module-level loop variables `task`, `strat`, `data_name` and `subset_name` set in __main__.
+    nb_to_consider = 10
+    df.sort_values(by="forest_size", inplace=True)
+    df_groupby_forest_size = df.groupby(['forest_size'])
+    forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)[:nb_to_consider]
+    mean_value = df_groupby_forest_size[task].mean().values[:nb_to_consider]
+    std_value = df_groupby_forest_size[task].std().values[:nb_to_consider]
+    try:
+        argmax = best_fct(mean_value)
+    except ValueError:
+        # best_fct (np.argmax / np.argmin here) raises ValueError on an empty array, i.e. no rows for this configuration.
+        print("no results", strat, data_name, task, subset_name)
+        return -1, -1, -1
+    max_mean = mean_value[argmax]
+    max_std = std_value[argmax]
+    max_forest_size = forest_sizes[argmax]
+    return max_forest_size, max_mean, max_std
+
+
+
+if __name__ == "__main__":
+
+    load_dotenv(find_dotenv('.env'))
+    dir_name = "bolsonaro_models_29-03-20_v3_2"
+    dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
+
+    out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
+
+    input_dir_file = dir_path / "results.csv"
+    df_results = pd.read_csv(input_dir_file)  # pass the path so pandas opens and closes the file itself
+
+    datasets = set(df_results["dataset"].values)
+    strategies = sorted(list(set(df_results["strategy"].values) - set(lst_skip_strategy)))
+    subsets = set(df_results["subset"].values)
+
+    r"""
+    \begin{table}[!h]
+    \centering
+    \begin{tabular}{l{}}
+    \toprule
+    \multicolumn{1}{c}{\textbf{Dataset}} & \textbf{Data dim.} $\datadim$        & \textbf{\# classes} & \textbf{Train size} $\nexamples$ & \textbf{Test size} $\nexamples'$ \\ \midrule
+    \texttt{MNIST}~\cite{lecun-mnisthandwrittendigit-2010}                      & 784    & 10        & 60 000    & 10 000               \\ %\hline
+    \texttt{Kddcup99}~\cite{Dua:2019}                                           & 116    & 23      & 4 893 431      & 5 000               \\ 
+    \bottomrule
+    \end{tabular}
+    \caption{Main features of the datasets. Discrete, unordered attributes for dataset Kddcup99 have been encoded as one-hot attributes.}
+    \label{table:data}
+    \end{table}
+    """
+
+
+    for task in tasks:
+        if task in lst_skip_task:
+            continue
+
+        dct_data_lst_tpl_results = defaultdict(lambda: [])
+
+        lst_strats = []
+        for data_name in datasets:
+            df_data = df_results[df_results["dataset"] == data_name]
+            score_metric_name = df_data["score_metric"].values[0]
+
+            for subset_name in subsets:
+                if subset_name in lst_skip_subset:
+                    continue
+                df_subset = df_data[df_data["subset"] == subset_name]
+
+                ##################
+                # all techniques #
+                ##################
+                for strat in strategies:
+                    if strat in lst_skip_strategy:
+                        continue
+                    df_strat = df_subset[df_subset["strategy"] == strat]
+
+                    if "OMP" in strat:
+                        ##############################
+                        # processing without weights #
+                        ##############################
+                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
+
+                        strat_woweights = "{} w/o weights".format(strat)
+                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
+                        if strat_woweights not in lst_strats: lst_strats.append(strat_woweights)
+
+                    ###################################################
+                    # general processing: rows with wo_weights False #
+                    ###################################################
+                    df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+
+
+                    dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
+                    if strat not in lst_strats: lst_strats.append(strat)
+
+                title = "{} {} {}".format(task, data_name, subset_name)
+
+                # fig.show()
+                sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
+                filename = sanitize(title)
+                # output_dir = out_dir / sanitize(subset_name) / sanitize(task)
+                # output_dir.mkdir(parents=True, exist_ok=True)
+                # fig.write_image(str((output_dir / filename).absolute()) + ".png")
+
+
+        # pprint(dct_data_lst_tpl_results)
+
+        lst_data_ordered = [
+            "Diamonds",
+            "Diabetes",
+            "Kin8nm",
+            "California Housing",
+            "Boston",
+            "Spambase",
+            "Steel Plates",
+            "KR-VS-KP",
+            "Breast Cancer",
+            "LFW Pairs",
+            "Gamma"
+        ]
+
+
+        arr_results_str = np.empty((len(lst_strats)+1, len(datasets) + 1 ), dtype="object")
+        nb_spaces = 25
+        dct_strat_str = defaultdict(lambda: [])
+        s_empty = "{}" + " "*(nb_spaces-2) + " & "
+        arr_results_str[0][0] = s_empty
+        # arr_results_str[0][1] = s_empty
+        for idx_data, data_name in enumerate(lst_data_ordered):
+            lst_tpl_results = dct_data_lst_tpl_results[data_name]
+            data_name_short = dct_data_short[data_name]
+            # s_data_tmp = "{}".format(data_name_short)
+            # add metric in parenthesis
+            # s_data_tmp += "({})".format(dct_data_metric[data_name])
+            # s_data_tmp = "\\texttt{{ {} }}".format(data_name_short)
+            s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name)
+            s_data_tmp += " "*(nb_spaces - len(s_data_tmp))
+            s_data_tmp += " & "
+            arr_results_str[0, idx_data + 1] = s_data_tmp
+
+
+            array_results = np.array(lst_tpl_results)
+            best_result_perf = dct_data_best[data_name](array_results[:, 1])
+            best_result_perf_indexes = np.argwhere(array_results[:, 1] == best_result_perf)
+
+            copye_array_results = copy.deepcopy(array_results)
+            if dct_data_best[data_name] is np.min:
+                copye_array_results[best_result_perf_indexes] = np.inf
+            else:
+                copye_array_results[best_result_perf_indexes] = -np.inf
+
+            best_result_perf_2 = dct_data_best[data_name](copye_array_results[:, 1])
+            best_result_perf_indexes_2 = np.argwhere(copye_array_results[:, 1] == best_result_perf_2)
+
+            best_result_prune = np.min(array_results[:, 0])
+            best_result_prune_indexes = np.argwhere(array_results[:, 0] == best_result_prune)
+
+            for idx_strat, tpl_results in enumerate(array_results):
+                str_strat = "\\texttt{{ {} }}".format(lst_strats[idx_strat])
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(lst_strats[idx_strat])
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ \\texttt{{ {} }} }} }}".format("}\\\\ \\texttt{".join(lst_strats[idx_strat].split(" ", 1)))
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ {} }} }} ".format("\\\\".join(lst_strats[idx_strat].split(" ", 1)))
+                str_strat += " " * (nb_spaces - len(str_strat)) + " & "
+                arr_results_str[idx_strat+1, 0] =  str_strat
+
+                # str_header = " {} & #tree &".format(dct_data_metric[data_name])
+                # arr_results_str[idx_strat + 1, 1] = str_header
+
+                best_forest_size = tpl_results[0]
+                best_mean = tpl_results[1]
+                best_std = tpl_results[2]
+                if dct_data_metric[data_name] == "Acc.":
+                    str_perf = "{:.2f}\\%".format(best_mean * 100)
+                else:
+                    str_perf = "{:.3E}".format(best_mean)
+
+                str_prune = "{:d}".format(int(best_forest_size))
+
+                if idx_strat in best_result_perf_indexes:
+                    # str_formating = "\\textbf{{ {} }}".format(str_result_loc)
+                    str_formating = "\\textbf[{}]"
+                    # str_formating = "\\textbf{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
+                elif idx_strat in best_result_perf_indexes_2:
+                    str_formating = "\\underline[{}]"
+                    # str_formating = "\\underline{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
+                else:
+                    str_formating = "{}"
+                    # str_formating = "{:.3E}(~{:.3E})".format(best_mean, best_std)
+
+                if idx_strat in best_result_prune_indexes:
+                    str_formating = str_formating.format("\\textit[{}]")
+                    # str_prune = " & \\textit{{ {:d} }}".format(int(best_forest_size))
+                # else:
+                #     str_prune = " & {:d}".format(int(best_forest_size))
+                str_result = str_formating.format(str_perf) + " & " + str_formating.format(str_prune)
+                str_result += " "*(nb_spaces - len(str_result))
+                str_result = str_result.replace("[", "{").replace("]", "}")
+
+                arr_results_str[idx_strat+1, idx_data+1] = str_result + " & "
+                dct_strat_str[lst_strats[idx_strat]].append(str_result)
+
+        # arr_results_str = arr_results_str.T
+
+        arr_results_str_classif = arr_results_str[:, 6:]
+        arr_results_str_classif = np.hstack([arr_results_str[:, 0:1], arr_results_str_classif])
+        arr_results_str_reg = arr_results_str[:, :6]
+
+        for arr_results_str in [arr_results_str_classif, arr_results_str_reg]:
+            print(r"\toprule")
+            for idx_lin, lin in enumerate(arr_results_str):
+                if idx_lin == 1:
+                    print("\\midrule")
+                # if idx_lin == 6:
+                #     print("\\midrule")
+                # if lst_data_ordered[idx_lin-1] == "Diamonds":
+                #     print("%", end="")
+                line_print = " ".join(list(lin))
+                line_print = line_print.rstrip(" &") + "\\\\"
+                print(line_print)
+            print(r"\bottomrule")
+        # s_data = s_data.rstrip(" &") + "\\\\"
+        # print(s_data)
+        # for strat, lst_str_results in dct_strat_str.items():
+        #     str_strat = "\\texttt{{ {} }}".format(strat)
+        #     str_strat += " "*(nb_spaces - len(str_strat))
+        #     str_strat += " & " + " & ".join(lst_str_results)
+        #     str_strat += "\\\\"
+        #     print(str_strat)
+
+                # exit()
diff --git a/code/vizualisation/results_to_csv.py b/code/vizualisation/results_to_csv.py
index db43618..53c7785 100644
--- a/code/vizualisation/results_to_csv.py
+++ b/code/vizualisation/results_to_csv.py
@@ -60,7 +60,7 @@ dct_dataset_fancy = {
 }
 
 dct_dataset_base_forest_size = {
-    "boston": 1000,
+    "boston": 100,
     "breast_cancer": 1000,
     "california_housing": 1000,
     "diabetes": 108,
@@ -132,7 +132,7 @@ if __name__ == "__main__":
             dct_results["wo_weights"].append(bool_wo_weights)
             dct_results["base_forest_size"].append(dct_dataset_base_forest_size[dataset])
             pruning_percent = forest_size / dct_dataset_base_forest_size[dataset]
-            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=1))
+            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=2))
 
 
             dct_nb_val_scores = {}
-- 
GitLab