code figures (these luc)

e35b35bf · Luc Giffon · 17fa2b1f · e35b35bf · e35b35bf · e35b35bf
Commit e35b35bf authored 4 years ago by Luc Giffon
--- a/code/vizualisation/csv_to_figure.py
+++ b/code/vizualisation/csv_to_figure.py
@@ -15,7 +15,7 @@ lst_task_train_dev = ["coherence", "correlation"]
 tasks = [
    # "train_score",
-    # "dev_score",
+    "dev_score",
    "test_score",
    # "coherence",
    # "correlation",
@@ -109,8 +109,8 @@ def add_trace_from_df(df, fig, task, strat, stop_on_flat=False):
    global GLOBAL_TRACE_TO_ADD_LAST
    df.sort_values(by="forest_size", inplace=True)
-    df_groupby_forest_size = df.groupby(['forest_size'])
+    df_groupby_forest_size = df.groupby(['pruning_percent'])
-    forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)
+    forest_sizes = list(df_groupby_forest_size["pruning_percent"].mean().values)
    mean_value = df_groupby_forest_size[task].mean().values
    std_value = df_groupby_forest_size[task].std().values
@@ -132,7 +132,8 @@ def add_trace_from_df(df, fig, task, strat, stop_on_flat=False):
                            width=2
                        )
                    ),
-                    showlegend=False
+                    name="Final NN-OMP",
+                    showlegend=True
                )
    forest_sizes = forest_sizes[:index_flat]
@@ -169,7 +170,21 @@ dct_metric_figure = {
    "mean_squared_error": go.Figure()
 }
-def base_figures():
+dct_gamma_by_dataset = {
+    "Boston": 5,
+    "Breast Cancer": 5,
+    "California Housing": 5,
+    "Diabetes": 5,
+    "Diamonds": 5,
+    "Kin8nm": 5,
+    "KR-VS-KP": 5,
+    "Spambase": 5,
+    "Steel Plates": 5,
+    "Gamma": 5,
+    "LFW Pairs": 5,
+}
+def base_figures(skip_NN=False):
    for task in tasks:
        for data_name in datasets:
@@ -183,40 +198,40 @@ def base_figures():
            # all techniques #
            ##################
            for strat in strategies:
-                if strat in lst_skip_strategy:
+                if strat in lst_skip_strategy or (skip_NN and "NN-OMP" in strat):
                    continue
-                if task == "negative-percentage-test-score":
+                # if task == "negative-percentage-test-score":
-                    if strat == "OMP":
+                #     if strat == "OMP":
-                        df_strat = df_data[df_data["strategy"] == strat]
+                #         df_strat = df_data[df_data["strategy"] == strat]
-                        df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
+                #         df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+                #         df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+                #
-                        df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
+                #         df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
+                #
+                #
-                        forest_sizes = df_groupby_forest_size["forest_size"].mean().values
+                #         forest_sizes = df_groupby_forest_size["forest_size"].mean().values
-                        x_values = df_groupby_forest_size["negative-percentage"].mean().values
+                #         x_values = df_groupby_forest_size["negative-percentage"].mean().values
-                        y_values = df_groupby_forest_size["test_score"].mean().values
+                #         y_values = df_groupby_forest_size["test_score"].mean().values
-                        # print(df_strat)
+                #         # print(df_strat)
-                        fig.add_trace(go.Scatter(x=x_values, y=y_values,
+                #         fig.add_trace(go.Scatter(x=x_values, y=y_values,
-                                                 mode='markers',
+                #                                  mode='markers',
-                                                 name=strat,
+                #                                  name=strat,
+                #                                  # color=forest_sizes,
+                #                                  marker=dict(
+                #                                     # size=16,
+                #                                     # cmax=39,
+                #                                     # cmin=0,
                #                                     color=forest_sizes,
-                                                 marker=dict(
+                #                                     colorbar=dict(
-                                                    # size=16,
+                #                                         title="Forest Size"
-                                                    # cmax=39,
+                #                                     ),
-                                                    # cmin=0,
+                #                                     # colorscale="Viridis"
-                                                    color=forest_sizes,
+                #                                 ),
-                                                    colorbar=dict(
+                #                                  # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
-                                                        title="Forest Size"
+                #          ))
-                                                    ),
+                #
-                                                    # colorscale="Viridis"
+                #     continue
-                                                ),
-                                                 # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
-                         ))
-                    continue
                df_strat = df_data[df_data["strategy"] == strat]
@@ -252,8 +267,9 @@ def base_figures():
            title = "{} {}".format(task, data_name)
            yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
-            xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "# Selected Trees"
+            xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
+            if not skip_nn:
                fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
            fig.update_layout(barmode='group',
                              # title=title,
@@ -264,7 +280,7 @@ def base_figures():
                                  size=24,
                                  color="black"
                              ),
-                                # showlegend = False,
+                                showlegend = False,
                                margin = dict(
                                    l=1,
                                    r=1,
@@ -285,6 +301,9 @@ def base_figures():
                              )
                              )
            # fig.show()
+            if skip_NN:
+                str_no_nn = " no nn"
+                title += str_no_nn
            sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
            filename = sanitize(title)
            output_dir = out_dir / sanitize(task)
@@ -375,14 +394,14 @@ def global_figure():
            # fig.show()
 def weights_wrt_size():
-    lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
+    # lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
+    lst_skip_data_weight_effect = ["Gamma"]
    fig = go.Figure()
    for data_name in datasets:
-        # if data_name in lst_skip_data_weight_effect:
+        if data_name in lst_skip_data_weight_effect:
-        #     continue
+            continue
        df_data = df_results[df_results["dataset"] == data_name]
        score_metric_name = df_data["score_metric"].values[0]
@@ -401,7 +420,7 @@ def weights_wrt_size():
        y_values = df_groupby_forest_size["negative-percentage"].mean().values
        y_values = (y_values - np.min(y_values)) / (np.max(y_values) - np.min(y_values))
-        x_values = np.around(df_groupby_forest_size["pruning_percent"].mean().values, decimals=1)
+        x_values = df_groupby_forest_size["pruning_percent"].mean().values
        # x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
        # if score_metric_name == "mean_squared_error":
@@ -410,8 +429,8 @@ def weights_wrt_size():
        lin_reg = svm.SVR(gamma=10)
        lin_reg.fit(x_values[:, np.newaxis], y_values)
-        xx = np.linspace(0, 1)
+        # xx = np.linspace(0, 1)
-        yy = lin_reg.predict(xx[:, np.newaxis])
+        yy = lin_reg.predict(x_values[:, np.newaxis])
        # print(df_strat)
        fig.add_trace(go.Scatter(x=x_values, y=y_values,
@@ -430,7 +449,7 @@ def weights_wrt_size():
                                 ),
                                 # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
                                 ))
-        fig.add_trace(go.Scatter(x=xx, y=yy,
+        fig.add_trace(go.Scatter(x=x_values, y=yy,
                                 mode='lines',
                                 name=strat,
                                 # color=forest_sizes,
@@ -452,8 +471,8 @@ def weights_wrt_size():
    title = "{}".format("weight wrt size")
    fig.update_layout(barmode='group',
-                      title=title,
+                      # title=title,
-                      xaxis_title="Pruning percentage",
+                      xaxis_title="% Selected Trees",
                      yaxis_title="Standardized % negative weights",
                      font=dict(
                          # family="Courier New, monospace",
@@ -464,8 +483,8 @@ def weights_wrt_size():
                      margin=dict(
                          l=1,
                          r=1,
-                          b=1,
+                          b=3,
-                          t=1,
+                          t=10,
                          # pad=4
                      ),
                      legend=dict(
@@ -488,12 +507,13 @@ def weights_wrt_size():
    fig.write_image(str((output_dir / filename).absolute()) + ".png")
 def effect_of_weights_figure():
-    lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
+    lst_skip_data_weight_effect = ["Gamma"]
+    # lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
    fig = go.Figure()
    for data_name in datasets:
+        #
        # if data_name in lst_skip_data_weight_effect:
        #     continue
        df_data = df_results[df_results["dataset"] == data_name]
@@ -506,29 +526,31 @@ def effect_of_weights_figure():
        df_strat = df_data[df_data["strategy"] == strat]
        df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
        df_strat_wo_weights.sort_values(by="pruning_percent", inplace=True)
        df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
        x_values = df_groupby_forest_size["negative-percentage"].mean().values
-        x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
        y_values = df_groupby_forest_size["test_score"].mean().values
        if score_metric_name == "mean_squared_error":
            y_values = 1/y_values
+        x_values = x_values[3:]
+        y_values = y_values[3:]
+        x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
        y_values = (y_values - np.min(y_values)) / (np.max(y_values) - np.min(y_values))
-        bins = np.histogram(x_values)[1]
+        # bins = np.histogram(x_values)[1]
-        indices_x_values = np.digitize(x_values, bins)-1
+        # indices_x_values = np.digitize(x_values, bins)-1
-        mean_val = np.empty(len(bins)-1)
+        # mean_val = np.empty(len(bins)-1)
-        for idx_group in range(len(bins) - 1):
+        # for idx_group in range(len(bins) - 1):
-            mean_val[idx_group] = np.mean(y_values[indices_x_values == idx_group])
+        #     mean_val[idx_group] = np.mean(y_values[indices_x_values == idx_group])
        # lin_reg = LinearRegression()
-        lin_reg = svm.SVR(gamma=5)
+        # lin_reg = svm.SVR(gamma=dct_gamma_by_dataset[data_name])
+        lin_reg = svm.SVR(gamma=1.)
        lin_reg.fit(x_values[:, np.newaxis], y_values)
        xx = np.linspace(0, 1)
@@ -540,6 +562,7 @@ def effect_of_weights_figure():
        fig.add_trace(go.Scatter(x=x_values, y=y_values,
                                 mode='markers',
                                 name=strat,
+                                 showlegend=False,
                                 # color=forest_sizes,
                                 marker=dict(
                                     # size=16,
@@ -576,15 +599,15 @@ def effect_of_weights_figure():
    title = "{}".format("negative weights effect")
    fig.update_layout(barmode='group',
-                      title=title,
+                      # title=title,
-                      xaxis_title="Standardized % negative weights",
+                      xaxis_title="Standardized % Negative Weights",
-                      yaxis_title="Normalized Performance",
+                      yaxis_title="Standardized Performance",
                      font=dict(
                          # family="Courier New, monospace",
                          size=24,
                          color="black"
                      ),
-                      showlegend = False,
+                      # showlegend = False,
                      margin=dict(
                          l=1,
                          r=1,
@@ -626,7 +649,8 @@ if __name__ == "__main__":
    strategies = set(df_results["strategy"].values)
    subsets = set(df_results["subset"].values)
-    # base_figures()
+    for skip_nn in [True, False]:
-    effect_of_weights_figure()
+        base_figures(skip_nn)
-    weights_wrt_size()
+    # effect_of_weights_figure()
+    # weights_wrt_size()
    # global_figure()
--- a/code/vizualisation/csv_to_table.py
+++ b/code/vizualisation/csv_to_table.py
@@ -33,18 +33,32 @@ dct_score_metric_best_fct = {
    "mean_squared_error": np.argmin
 }
+# dct_data_short = {
+#     "Spambase": "Spambase",
+#     "Diamonds": "Diamonds",
+#     "Diabetes": "Diabetes",
+#     "Steel Plates": "Steel P.",
+#     "KR-VS-KP": "KR-VS-KP",
+#     "Breast Cancer": "Breast C.",
+#     "Kin8nm": "Kin8nm",
+#     "LFW Pairs": "LFW P.",
+#     "Gamma": "Gamma",
+#     "California Housing": "California H.",
+#     "Boston": "Boston",
+# }
 dct_data_short = {
-    "Spambase": "Spambase",
+    "Spambase": "Sp. B.",
-    "Diamonds": "Diamonds",
+    "Diamonds": "Diam.",
-    "Diabetes": "Diabetes",
+    "Diabetes": "Diab.",
-    "Steel Plates": "Steel P.",
+    "Steel Plates": "St. P.",
-    "KR-VS-KP": "KR-VS-KP",
+    "KR-VS-KP": "KR-KP",
-    "Breast Cancer": "Breast C.",
+    "Breast Cancer": "B. C.",
-    "Kin8nm": "Kin8nm",
+    "Kin8nm": "Kin.",
    "LFW Pairs": "LFW P.",
-    "Gamma": "Gamma",
+    "Gamma": "Gam.",
-    "California Housing": "California H.",
+    "California Housing": "C. H.",
-    "Boston": "Boston",
+    "Boston": "Bos.",
 }
 dct_data_best = {
@@ -101,7 +115,7 @@ def get_max_from_df(df, best_fct):
 if __name__ == "__main__":
    load_dotenv(find_dotenv('.env'))
-    dir_name = "bolsonaro_models_25-03-20"
+    dir_name = "bolsonaro_models_29-03-20_v3_2"
    dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
    out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
@@ -155,29 +169,19 @@ if __name__ == "__main__":
                    if "OMP" in strat:
                        ###########################
-                        # traitement avec weights #
+                        # traitement without weights #
                        ###########################
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
-                        if data_name == "Boston" and subset_name == "train+dev/train+dev":
-                            df_strat_wo_weights = df_strat_wo_weights[df_strat_wo_weights["forest_size"] < 400]
-                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
-                        if strat not in lst_strats: lst_strats.append(strat)
-                    if "OMP" in strat and subset_name == "train/dev":
+                        strat_woweights = "{} w/o weights".format(strat)
-                        continue
+                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
-                    elif "Random" not in strat and subset_name == "train/dev":
+                        if strat_woweights not in lst_strats: lst_strats.append(strat_woweights)
-                        continue
                    #################################
                    # traitement general wo_weights #
                    #################################
-                    if "Random" in strat:
                    df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
-                    else:
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
-                    if "OMP" in strat:
-                        strat = "{} w/o weights".format(strat)
                    dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
                    if strat not in lst_strats: lst_strats.append(strat)
@@ -219,7 +223,8 @@ if __name__ == "__main__":
            lst_tpl_results = dct_data_lst_tpl_results[data_name]
            data_name_short = dct_data_short[data_name]
            s_data_tmp = "{}".format(data_name_short)
-            s_data_tmp += "({})".format(dct_data_metric[data_name])
+            # add metric in parenthesis
+            # s_data_tmp += "({})".format(dct_data_metric[data_name])
            # s_data_tmp = "\\texttt{{ {} }}".format(data_name_short)
            # s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name)
            s_data_tmp += " "*(nb_spaces - len(data_name_short))
@@ -292,8 +297,8 @@ if __name__ == "__main__":
                print("\\midrule")
            if idx_lin == 6:
                print("\\midrule")
-            if lst_data_ordered[idx_lin-1] == "Diamonds":
+            # if lst_data_ordered[idx_lin-1] == "Diamonds":
-                print("%", end="")
+            #     print("%", end="")
            line_print = " ".join(list(lin))
            line_print = line_print.rstrip(" &") + "\\\\"
            print(line_print)

--- a/code/vizualisation/csv_to_table_these.py
+++ b/code/vizualisation/csv_to_table_these.py
+import copy
+from dotenv import load_dotenv, find_dotenv
+from pathlib import Path
+import os
+import pandas as pd
+import numpy as np
+from pprint import pprint
+import plotly.graph_objects as go
+import plotly.io as pio
+from collections import defaultdict
+lst_skip_strategy = ["None", "OMP Distillation", "OMP Distillation w/o weights"]  # strategies left out of the table
+lst_skip_task = ["correlation", "coherence"]  # tasks ignored by the main loop
+# lst_skip_task = []
+lst_skip_subset = ["train/dev"]  # subsets ignored by the main loop
+# lst_skip_subset = []
+tasks = [  # result columns to tabulate; only "test_score" is currently enabled
+    # "train_score",
+    # "dev_score",
+    "test_score",
+    # "coherence",
+    # "correlation"
+]
+dct_score_metric_fancy = {  # display label for each score metric name
+    "accuracy_score": "% Accuracy",
+    "mean_squared_error": "MSE"
+}
+dct_score_metric_best_fct = {  # index selector per metric: highest accuracy, lowest MSE
+    "accuracy_score": np.argmax,
+    "mean_squared_error": np.argmin
+}
+# dct_data_short = {
+#     "Spambase": "Spambase",
+#     "Diamonds": "Diamonds",
+#     "Diabetes": "Diabetes",
+#     "Steel Plates": "Steel P.",
+#     "KR-VS-KP": "KR-VS-KP",
+#     "Breast Cancer": "Breast C.",
+#     "Kin8nm": "Kin8nm",
+#     "LFW Pairs": "LFW P.",
+#     "Gamma": "Gamma",
+#     "California Housing": "California H.",
+#     "Boston": "Boston",
+# }
+dct_data_short = {  # abbreviated dataset names (shorter variant of the commented-out mapping above)
+    "Spambase": "Sp. B.",
+    "Diamonds": "Diam.",
+    "Diabetes": "Diab.",
+    "Steel Plates": "St. P.",
+    "KR-VS-KP": "KR-KP",
+    "Breast Cancer": "B. C.",
+    "Kin8nm": "Kin.",
+    "LFW Pairs": "LFW P.",
+    "Gamma": "Gam.",
+    "California Housing": "C. H.",
+    "Boston": "Bos.",
+}
+dct_data_best = {  # reducer giving the best score per dataset: np.max for "Acc." datasets, np.min for "MSE" ones
+    "Spambase": np.max,
+    "Diamonds": np.min,
+    "Diabetes": np.min,
+    "Steel Plates": np.max,
+    "KR-VS-KP": np.max,
+    "Breast Cancer": np.max,
+    "Kin8nm": np.min,
+    "LFW Pairs": np.max,
+    "Gamma": np.max,
+    "California Housing": np.min,
+    "Boston": np.min,
+}
+dct_data_metric = {  # metric abbreviation per dataset; "Acc." values are printed as percentages by the table code
+    "Spambase": "Acc.",
+    "Diamonds": "MSE",
+    "Diabetes": "MSE",
+    "Steel Plates": "Acc.",
+    "KR-VS-KP": "Acc.",
+    "Breast Cancer": "Acc.",
+    "Kin8nm": "MSE",
+    "LFW Pairs": "Acc.",
+    "Gamma": "Acc.",
+    "California Housing": "MSE",
+    "Boston": "MSE",
+}
+def get_max_from_df(df, best_fct):
+    """Pick the best per-forest-size score among the smallest forests.
+
+    Rows are grouped by ``forest_size``; the mean and standard deviation of
+    the column named by the module-level ``task`` variable are computed for
+    each group, and only the first ``nb_to_consider`` groups are examined.
+
+    NOTE: relies on the module-level names ``task``, ``strat``, ``data_name``
+    and ``subset_name`` being set by the calling script.
+
+    :param df: results frame with a ``forest_size`` column and a ``task`` column.
+    :param best_fct: callable returning the index of the best entry of a 1-D
+        array of means (this file passes ``np.argmax`` / ``np.argmin``).
+    :return: ``(forest_size, mean, std)`` of the selected group, or
+        ``(-1, -1, -1)`` when no result is available.
+    """
+    nb_to_consider = 10
+    # Sort a copy: callers pass filtered slices of a larger frame, and an
+    # in-place sort would mutate their object (and may trigger pandas'
+    # SettingWithCopyWarning).
+    df_sorted = df.sort_values(by="forest_size")
+    df_groupby_forest_size = df_sorted.groupby(['forest_size'])
+    forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)[:nb_to_consider]
+    mean_value = df_groupby_forest_size[task].mean().values[:nb_to_consider]
+    std_value = df_groupby_forest_size[task].std().values[:nb_to_consider]
+    try:
+        best_idx = best_fct(mean_value)
+    except ValueError:
+        # np.argmax / np.argmin raise ValueError on an empty array, i.e. when
+        # the filtered frame has no rows for this configuration.
+        print("no results", strat, data_name, task, subset_name)
+        return -1, -1, -1
+    return forest_sizes[best_idx], mean_value[best_idx], std_value[best_idx]
+if __name__ == "__main__":  # script entry: read results.csv and print LaTeX table rows to stdout
+    load_dotenv(find_dotenv('.env'))
+    dir_name = "bolsonaro_models_29-03-20_v3_2"
+    dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
+    out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
+    input_dir_file = dir_path / "results.csv"
+    df_results = pd.read_csv(open(input_dir_file, 'rb'))  # NOTE(review): the file handle is never closed explicitly
+    datasets = set(df_results["dataset"].values)
+    strategies = sorted(list(set(df_results["strategy"].values) - set(lst_skip_strategy)))  # skipped strategies are already removed here
+    subsets = set(df_results["subset"].values)
+    r"""
+    \begin{table}[!h]
+    \centering
+    \begin{tabular}{l{}}
+    \toprule
+    \multicolumn{1}{c}{\textbf{Dataset}} & \textbf{Data dim.} $\datadim$        & \textbf{\# classes} & \textbf{Train size} $\nexamples$ & \textbf{Test size} $\nexamples'$ \\ \midrule
+    \texttt{MNIST}~\cite{lecun-mnisthandwrittendigit-2010}                      & 784    & 10        & 60 000    & 10 000               \\ %\hline
+    \texttt{Kddcup99}~\cite{Dua:2019}                                           & 116    & 23      & 4 893 431      & 5 000               \\ 
+    \bottomrule
+    \end{tabular}
+    \caption{Main features of the datasets. Discrete, unordered attributes for dataset Kddcup99 have been encoded as one-hot attributes.}
+    \label{table:data}
+    \end{table}
+    """  # bare string expression: a LaTeX table skeleton kept for reference, it has no runtime effect
+    for task in tasks:  # `task` is also read as a global by get_max_from_df
+        if task in lst_skip_task:
+            continue
+        dct_data_lst_tpl_results = defaultdict(lambda: [])  # dataset name -> list of (forest_size, mean, std) tuples
+        lst_strats = []  # row labels, in the order results are appended
+        for data_name in datasets:
+            df_data = df_results[df_results["dataset"] == data_name]
+            score_metric_name = df_data["score_metric"].values[0]
+            for subset_name in subsets:
+                if subset_name in lst_skip_subset:
+                    continue
+                df_subset = df_data[df_data["subset"] == subset_name]
+                ##################
+                # all techniques #
+                ##################
+                for strat in strategies:
+                    if strat in lst_skip_strategy:
+                        continue
+                    df_strat = df_subset[df_subset["strategy"] == strat]
+                    if "OMP" in strat:
+                        ###########################
+                        # processing without weights #
+                        ###########################
+                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
+                        strat_woweights = "{} w/o weights".format(strat)
+                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
+                        if strat_woweights not in lst_strats: lst_strats.append(strat_woweights)
+                    #################################
+                    # general processing: rows with wo_weights == False #
+                    #################################
+                    df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+                    dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
+                    if strat not in lst_strats: lst_strats.append(strat)
+                title = "{} {} {}".format(task, data_name, subset_name)
+                # fig.show()
+                sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
+                filename = sanitize(title)  # only needed by the commented-out figure export below
+                # output_dir = out_dir / sanitize(subset_name) / sanitize(task)
+                # output_dir.mkdir(parents=True, exist_ok=True)
+                # fig.write_image(str((output_dir / filename).absolute()) + ".png")
+        # pprint(dct_data_lst_tpl_results)
+        lst_data_ordered = [  # column order: the five "MSE" datasets first, then the six "Acc." ones
+            "Diamonds",
+            "Diabetes",
+            "Kin8nm",
+            "California Housing",
+            "Boston",
+            "Spambase",
+            "Steel Plates",
+            "KR-VS-KP",
+            "Breast Cancer",
+            "LFW Pairs",
+            "Gamma"
+        ]
+        arr_results_str = np.empty((len(lst_strats)+1, len(datasets) + 1 ), dtype="object")  # header row + one row per strategy; label column + one column per dataset
+        nb_spaces = 25  # cells are right-padded with spaces up to this width
+        dct_strat_str = defaultdict(lambda: [])
+        s_empty = "{}" + " "*(nb_spaces-2) + " & "
+        arr_results_str[0][0] = s_empty
+        # arr_results_str[0][1] = s_empty
+        for idx_data, data_name in enumerate(lst_data_ordered):
+            lst_tpl_results = dct_data_lst_tpl_results[data_name]
+            data_name_short = dct_data_short[data_name]
+            # s_data_tmp = "{}".format(data_name_short)
+            # add metric in parenthesis
+            # s_data_tmp += "({})".format(dct_data_metric[data_name])
+            # s_data_tmp = "\\texttt{{ {} }}".format(data_name_short)
+            s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name)
+            s_data_tmp += " "*(nb_spaces - len(s_data_tmp))
+            s_data_tmp += " & "
+            arr_results_str[0, idx_data + 1] = s_data_tmp
+            array_results = np.array(lst_tpl_results)  # one row per strategy: (forest_size, mean, std)
+            best_result_perf = dct_data_best[data_name](array_results[:, 1])
+            best_result_perf_indexes = np.argwhere(array_results[:, 1] == best_result_perf)
+            copye_array_results = copy.deepcopy(array_results)
+            if dct_data_best[data_name] is np.min:  # overwrite the best rows so the same reducer now returns the runner-up
+                copye_array_results[best_result_perf_indexes] = np.inf
+            else:
+                copye_array_results[best_result_perf_indexes] = -np.inf
+            best_result_perf_2 = dct_data_best[data_name](copye_array_results[:, 1])
+            best_result_perf_indexes_2 = np.argwhere(copye_array_results[:, 1] == best_result_perf_2)
+            best_result_prune = np.min(array_results[:, 0])  # smallest selected forest size among the strategies
+            best_result_prune_indexes = np.argwhere(array_results[:, 0] == best_result_prune)
+            for idx_strat, tpl_results in enumerate(array_results):
+                str_strat = "\\texttt{{ {} }}".format(lst_strats[idx_strat])
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(lst_strats[idx_strat])
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ \\texttt{{ {} }} }} }}".format("}\\\\ \\texttt{".join(lst_strats[idx_strat].split(" ", 1)))
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ {} }} }} ".format("\\\\".join(lst_strats[idx_strat].split(" ", 1)))
+                str_strat += " " * (nb_spaces - len(str_strat)) + " & "
+                arr_results_str[idx_strat+1, 0] =  str_strat
+                # str_header = " {} & #tree &".format(dct_data_metric[data_name])
+                # arr_results_str[idx_strat + 1, 1] = str_header
+                best_forest_size = tpl_results[0]
+                best_mean = tpl_results[1]
+                best_std = tpl_results[2]
+                if dct_data_metric[data_name] == "Acc.":
+                    str_perf = "{:.2f}\\%".format(best_mean * 100)
+                else:
+                    str_perf = "{:.3E}".format(best_mean)
+                str_prune = "{:d}".format(int(best_forest_size))
+                if idx_strat in best_result_perf_indexes:
+                    # str_formating = "\\textbf{{ {} }}".format(str_result_loc)
+                    str_formating = "\\textbf[{}]"  # square brackets stand in for LaTeX braces until the final replace below
+                    # str_formating = "\\textbf{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
+                elif idx_strat in best_result_perf_indexes_2:
+                    str_formating = "\\underline[{}]"
+                    # str_formating = "\\underline{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
+                else:
+                    str_formating = "{}"
+                    # str_formating = "{:.3E}(~{:.3E})".format(best_mean, best_std)
+                if idx_strat in best_result_prune_indexes:
+                    str_formating = str_formating.format("\\textit[{}]")  # nests \textit inside the template chosen above
+                    # str_prune = " & \\textit{{ {:d} }}".format(int(best_forest_size))
+                # else:
+                #     str_prune = " & {:d}".format(int(best_forest_size))
+                str_result = str_formating.format(str_perf) + " & " + str_formating.format(str_prune)  # the same template wraps both the score and the forest size
+                str_result += " "*(nb_spaces - len(str_result))
+                str_result = str_result.replace("[", "{").replace("]", "}")
+                arr_results_str[idx_strat+1, idx_data+1] = str_result + " & "
+                dct_strat_str[lst_strats[idx_strat]].append(str_result)
+        # arr_results_str = arr_results_str.T
+        arr_results_str_classif = arr_results_str[:, 6:]  # columns of the "Acc." datasets (positions 6+ of lst_data_ordered, offset by the label column)
+        arr_results_str_classif = np.hstack([arr_results_str[:, 0:1], arr_results_str_classif])  # put the strategy-label column back in front
+        arr_results_str_reg = arr_results_str[:, :6]  # label column plus the five "MSE" datasets
+        for arr_results_str in [arr_results_str_classif, arr_results_str_reg]:  # rebinds arr_results_str; one table is printed per sub-array
+            print(r"\toprule")
+            for idx_lin, lin in enumerate(arr_results_str):
+                if idx_lin == 1:
+                    print("\\midrule")
+                # if idx_lin == 6:
+                #     print("\\midrule")
+                # if lst_data_ordered[idx_lin-1] == "Diamonds":
+                #     print("%", end="")
+                line_print = " ".join(list(lin))
+                line_print = line_print.rstrip(" &") + "\\\\"  # strip the trailing run of spaces/ampersands, then end the LaTeX row
+                print(line_print)
+            print(r"\bottomrule")
+        # s_data = s_data.rstrip(" &") + "\\\\"
+        # print(s_data)
+        # for strat, lst_str_results in dct_strat_str.items():
+        #     str_strat = "\\texttt{{ {} }}".format(strat)
+        #     str_strat += " "*(nb_spaces - len(str_strat))
+        #     str_strat += " & " + " & ".join(lst_str_results)
+        #     str_strat += "\\\\"
+        #     print(str_strat)
+                # exit()
--- a/code/vizualisation/results_to_csv.py
+++ b/code/vizualisation/results_to_csv.py
@@ -60,7 +60,7 @@ dct_dataset_fancy = {
 }
 dct_dataset_base_forest_size = {
-    "boston": 1000,
+    "boston": 100,
    "breast_cancer": 1000,
    "california_housing": 1000,
    "diabetes": 108,
@@ -132,7 +132,7 @@ if __name__ == "__main__":
            dct_results["wo_weights"].append(bool_wo_weights)
            dct_results["base_forest_size"].append(dct_dataset_base_forest_size[dataset])
            pruning_percent = forest_size / dct_dataset_base_forest_size[dataset]
-            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=1))
+            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=2))
            dct_nb_val_scores = {}