From e35b35bf24c10982c1bcc906d5bd7cc1cff2e9f1 Mon Sep 17 00:00:00 2001
From: Luc Giffon <luc.giffon@lis-lab.fr>
Date: Thu, 27 Aug 2020 16:09:36 +0200
Subject: [PATCH] code figures (these luc)

---
 code/vizualisation/csv_to_figure.py      | 166 +++++++-----
 code/vizualisation/csv_to_table.py       |  65 ++---
 code/vizualisation/csv_to_table_these.py | 323 +++++++++++++++++++++++
 code/vizualisation/results_to_csv.py     |   4 +-
 4 files changed, 455 insertions(+), 103 deletions(-)
 create mode 100644 code/vizualisation/csv_to_table_these.py

diff --git a/code/vizualisation/csv_to_figure.py b/code/vizualisation/csv_to_figure.py
index 25f5976..a4d0042 100644
--- a/code/vizualisation/csv_to_figure.py
+++ b/code/vizualisation/csv_to_figure.py
@@ -15,7 +15,7 @@ lst_task_train_dev = ["coherence", "correlation"]
 
 tasks = [
     # "train_score",
-    # "dev_score",
+    "dev_score",
     "test_score",
     # "coherence",
     # "correlation",
@@ -109,8 +109,8 @@ def add_trace_from_df(df, fig, task, strat, stop_on_flat=False):
     global GLOBAL_TRACE_TO_ADD_LAST
 
     df.sort_values(by="forest_size", inplace=True)
-    df_groupby_forest_size = df.groupby(['forest_size'])
-    forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)
+    df_groupby_forest_size = df.groupby(['pruning_percent'])
+    forest_sizes = list(df_groupby_forest_size["pruning_percent"].mean().values)
     mean_value = df_groupby_forest_size[task].mean().values
     std_value = df_groupby_forest_size[task].std().values
 
@@ -132,7 +132,8 @@ def add_trace_from_df(df, fig, task, strat, stop_on_flat=False):
                 width=2
             )
         ),
-        showlegend=False
+        name="Final NN-OMP",
+        showlegend=True
     )
 
     forest_sizes = forest_sizes[:index_flat]
@@ -169,7 +170,21 @@ dct_metric_figure = {
     "mean_squared_error": go.Figure()
 }
 
-def base_figures():
+dct_gamma_by_dataset = {
+    "Boston": 5,
+    "Breast Cancer": 5,
+    "California Housing": 5,
+    "Diabetes": 5,
+    "Diamonds": 5,
+    "Kin8nm": 5,
+    "KR-VS-KP": 5,
+    "Spambase": 5,
+    "Steel Plates": 5,
+    "Gamma": 5,
+    "LFW Pairs": 5,
+}
+
+def base_figures(skip_NN=False):
 
     for task in tasks:
         for data_name in datasets:
@@ -183,40 +198,40 @@ def base_figures():
             # all techniques #
             ##################
             for strat in strategies:
-                if strat in lst_skip_strategy:
+                if strat in lst_skip_strategy or (skip_NN and "NN-OMP" in strat):
                     continue
-                if task == "negative-percentage-test-score":
-                    if strat == "OMP":
-                        df_strat = df_data[df_data["strategy"] == strat]
-                        df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
-                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
-
-                        df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
-
-
-                        forest_sizes = df_groupby_forest_size["forest_size"].mean().values
-                        x_values = df_groupby_forest_size["negative-percentage"].mean().values
-                        y_values = df_groupby_forest_size["test_score"].mean().values
-                        # print(df_strat)
-                        fig.add_trace(go.Scatter(x=x_values, y=y_values,
-                                                 mode='markers',
-                                                 name=strat,
-                                                 # color=forest_sizes,
-                                                 marker=dict(
-                                                     # size=16,
-                                                     # cmax=39,
-                                                     # cmin=0,
-                                                     color=forest_sizes,
-                                                     colorbar=dict(
-                                                         title="Forest Size"
-                                                     ),
-                                                     # colorscale="Viridis"
-                                                 ),
-                                                 # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
-                                                 ))
-
-                        continue
+                # if task == "negative-percentage-test-score":
+                #     if strat == "OMP":
+                #         df_strat = df_data[df_data["strategy"] == strat]
+                #         df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
+                #         df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+                #
+                #         df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
+                #
+                #
+                #         forest_sizes = df_groupby_forest_size["forest_size"].mean().values
+                #         x_values = df_groupby_forest_size["negative-percentage"].mean().values
+                #         y_values = df_groupby_forest_size["test_score"].mean().values
+                #         # print(df_strat)
+                #         fig.add_trace(go.Scatter(x=x_values, y=y_values,
+                #                                  mode='markers',
+                #                                  name=strat,
+                #                                  # color=forest_sizes,
+                #                                  marker=dict(
+                #                                      # size=16,
+                #                                      # cmax=39,
+                #                                      # cmin=0,
+                #                                      color=forest_sizes,
+                #                                      colorbar=dict(
+                #                                          title="Forest Size"
+                #                                      ),
+                #                                      # colorscale="Viridis"
+                #                                  ),
+                #                                  # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
+                #                                  ))
+                #
+                #         continue
 
                 df_strat = df_data[df_data["strategy"] == strat]
 
@@ -252,9 +267,10 @@ def base_figures():
             title = "{} {}".format(task, data_name)
 
             yaxis_title = "% negative weights" if task == "negative-percentage" else dct_score_metric_fancy[score_metric_name]
-            xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "# Selected Trees"
+            xaxis_title = "% negative weights" if task == "negative-percentage-test-score" else "% Selected Trees"
 
-            fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
+            if not skip_NN:
+                fig.add_trace(GLOBAL_TRACE_TO_ADD_LAST)
             fig.update_layout(barmode='group',
                               # title=title,
                               xaxis_title=xaxis_title,
@@ -264,7 +280,7 @@
                               size=24,
                               color="black"
                               ),
-                              # showlegend = False,
+                              showlegend = False,
                               margin = dict(
                                   l=1,
                                   r=1,
@@ -285,6 +301,9 @@
                 )
             )
             # fig.show()
+            if skip_NN:
+                str_no_nn = " no nn"
+                title += str_no_nn
             sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
             filename = sanitize(title)
             output_dir = out_dir / sanitize(task)
@@ -375,14 +394,14 @@ def global_figure():
     # fig.show()
 
 def weights_wrt_size():
-    lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
-
+    # lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
+    lst_skip_data_weight_effect = ["Gamma"]
     fig = go.Figure()
 
     for data_name in datasets:
-        # if data_name in lst_skip_data_weight_effect:
-        #     continue
+        if data_name in lst_skip_data_weight_effect:
+            continue
         df_data = df_results[df_results["dataset"] == data_name]
         score_metric_name = df_data["score_metric"].values[0]
@@ -401,7 +420,7 @@ def weights_wrt_size():
             y_values = df_groupby_forest_size["negative-percentage"].mean().values
             y_values = (y_values - np.min(y_values)) / (np.max(y_values) - np.min(y_values))
 
-            x_values = np.around(df_groupby_forest_size["pruning_percent"].mean().values, decimals=1)
+            x_values = df_groupby_forest_size["pruning_percent"].mean().values
             # x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
 
             # if score_metric_name == "mean_squared_error":
@@ -410,8 +429,8 @@ def weights_wrt_size():
             lin_reg = svm.SVR(gamma=10)
             lin_reg.fit(x_values[:, np.newaxis], y_values)
 
-            xx = np.linspace(0, 1)
-            yy = lin_reg.predict(xx[:, np.newaxis])
+            # xx = np.linspace(0, 1)
+            yy = lin_reg.predict(x_values[:, np.newaxis])
 
             # print(df_strat)
             fig.add_trace(go.Scatter(x=x_values, y=y_values,
@@ -430,7 +449,7 @@ def weights_wrt_size():
                                      ),
                                      # marker=dict(color="rgb{}".format(dct_color_by_strategy[strat]))
                                      ))
-            fig.add_trace(go.Scatter(x=xx, y=yy,
+            fig.add_trace(go.Scatter(x=x_values, y=yy,
                                      mode='lines',
                                      name=strat,
                                      # color=forest_sizes,
@@ -452,8 +471,8 @@ def weights_wrt_size():
     title = "{}".format("weight wrt size")
 
     fig.update_layout(barmode='group',
-                      title=title,
-                      xaxis_title="Pruning percentage",
+                      # title=title,
+                      xaxis_title="% Selected Trees",
                       yaxis_title="Standardized % negative weights",
                       font=dict(
                           # family="Courier New, monospace",
                           size=24,
                           color="black"
                       ),
@@ -464,8 +483,8 @@ def weights_wrt_size():
                       margin=dict(
                           l=1,
                           r=1,
-                          b=1,
-                          t=1,
+                          b=3,
+                          t=10,
                           # pad=4
                       ),
                       legend=dict(
@@ -488,12 +507,13 @@ def weights_wrt_size():
     fig.write_image(str((output_dir / filename).absolute()) + ".png")
 
 def effect_of_weights_figure():
-    lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
+    lst_skip_data_weight_effect = ["Gamma"]
+    # lst_skip_data_weight_effect = ["Gamma", "KR-VS-KP", "Steel Plates"]
 
     fig = go.Figure()
 
     for data_name in datasets:
-
+        #
         # if data_name in lst_skip_data_weight_effect:
         #     continue
         df_data = df_results[df_results["dataset"] == data_name]
@@ -506,29 +526,31 @@ def effect_of_weights_figure():
             df_strat = df_data[df_data["strategy"] == strat]
             df_strat = df_strat[df_strat["subset"] == "train+dev/train+dev"]
             df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
-            df_strat_wo_weights.sort_values(by="pruning_percent", inplace=True)
 
             df_groupby_forest_size = df_strat_wo_weights.groupby(['forest_size'])
             x_values = df_groupby_forest_size["negative-percentage"].mean().values
-            x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
-
             y_values = df_groupby_forest_size["test_score"].mean().values
-
             if score_metric_name == "mean_squared_error":
                 y_values = 1/y_values
+
+            x_values = x_values[3:]
+            y_values = y_values[3:]
+
+            x_values = (x_values - np.min(x_values)) / (np.max(x_values) - np.min(x_values))
             y_values = (y_values - np.min(y_values)) / (np.max(y_values) - np.min(y_values))
 
-            bins = np.histogram(x_values)[1]
-            indices_x_values = np.digitize(x_values, bins)-1
-            mean_val = np.empty(len(bins)-1)
-            for idx_group in range(len(bins) - 1):
-                mean_val[idx_group] = np.mean(y_values[indices_x_values == idx_group])
+            # bins = np.histogram(x_values)[1]
+            # indices_x_values = np.digitize(x_values, bins)-1
+            # mean_val = np.empty(len(bins)-1)
+            # for idx_group in range(len(bins) - 1):
+            #     mean_val[idx_group] = np.mean(y_values[indices_x_values == idx_group])
 
             # lin_reg = LinearRegression()
-            lin_reg = svm.SVR(gamma=5)
+            # lin_reg = svm.SVR(gamma=dct_gamma_by_dataset[data_name])
+            lin_reg = svm.SVR(gamma=1.)
             lin_reg.fit(x_values[:, np.newaxis], y_values)
 
             xx = np.linspace(0, 1)
@@ -540,6 +562,7 @@ def effect_of_weights_figure():
             fig.add_trace(go.Scatter(x=x_values, y=y_values,
                                      mode='markers',
                                      name=strat,
+                                     showlegend=False,
                                      # color=forest_sizes,
                                      marker=dict(
                                          # size=16,
@@ -576,15 +599,15 @@ def effect_of_weights_figure():
     title = "{}".format("negative weights effect")
 
     fig.update_layout(barmode='group',
-                      title=title,
-                      xaxis_title="Standardized % negative weights",
-                      yaxis_title="Normalized Performance",
+                      # title=title,
+                      xaxis_title="Standardized % Negative Weights",
+                      yaxis_title="Standardized Performance",
                       font=dict(
                           # family="Courier New, monospace",
                           size=24,
                           color="black"
                       ),
-                      showlegend = False,
+                      # showlegend = False,
                       margin=dict(
                           l=1,
                           r=1,
@@ -626,7 +649,8 @@ if __name__ == "__main__":
     strategies = set(df_results["strategy"].values)
     subsets = set(df_results["subset"].values)
 
-    # base_figures()
-    effect_of_weights_figure()
-    weights_wrt_size()
+    for skip_nn in [True, False]:
+        base_figures(skip_nn)
+    # effect_of_weights_figure()
+    # weights_wrt_size()
     # global_figure()
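
The weights_wrt_size() hunks above swap the linspace grid for predictions at the observed x positions, but the underlying pattern is unchanged: min-max standardize a series, fit a one-dimensional SVR as a smoother, and overlay the trend on the scatter. A minimal, self-contained sketch of that pattern — the data, column meanings and gamma value are illustrative stand-ins, not taken from the repository:

import numpy as np
import plotly.graph_objects as go
from sklearn import svm

rng = np.random.RandomState(0)
x = np.sort(rng.uniform(0.0, 1.0, 50))   # stand-in for pruning_percent
y = x ** 2 + rng.normal(0.0, 0.05, 50)   # stand-in for % negative weights

# Min-max standardization, as done for y_values in weights_wrt_size().
y = (y - np.min(y)) / (np.max(y) - np.min(y))

# SVR used purely as a smoother; gamma controls how wiggly the trend is.
smoother = svm.SVR(gamma=10)
smoother.fit(x[:, np.newaxis], y)
yy = smoother.predict(x[:, np.newaxis])  # predict at the observed x, as in the hunk

fig = go.Figure()
fig.add_trace(go.Scatter(x=x, y=y, mode='markers', name='data'))
fig.add_trace(go.Scatter(x=x, y=yy, mode='lines', name='SVR trend'))
# fig.show()

Predicting at the observed x values rather than on a fresh linspace keeps the marker and line traces aligned on the same abscissa, which is presumably why the hunk drops xx.
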
H.", + "Boston": "Bos.", } dct_data_best = { @@ -101,7 +115,7 @@ def get_max_from_df(df, best_fct): if __name__ == "__main__": load_dotenv(find_dotenv('.env')) - dir_name = "bolsonaro_models_25-03-20" + dir_name = "bolsonaro_models_29-03-20_v3_2" dir_path = Path(os.environ["project_dir"]) / "results" / dir_name out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name @@ -155,29 +169,19 @@ if __name__ == "__main__": if "OMP" in strat: ########################### - # traitement avec weights # + # traitement without weights # ########################### - df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False] - if data_name == "Boston" and subset_name == "train+dev/train+dev": - df_strat_wo_weights = df_strat_wo_weights[df_strat_wo_weights["forest_size"] < 400] - dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name])) - if strat not in lst_strats: lst_strats.append(strat) + df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True] - if "OMP" in strat and subset_name == "train/dev": - continue - elif "Random" not in strat and subset_name == "train/dev": - continue + strat_woweights = "{} w/o weights".format(strat) + dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name])) + if strat_woweights not in lst_strats: lst_strats.append(strat_woweights) ################################# # traitement general wo_weights # ################################# - if "Random" in strat: - df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False] - else: - df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True] + df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False] - if "OMP" in strat: - strat = "{} w/o weights".format(strat) dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name])) if strat not in lst_strats: lst_strats.append(strat) @@ -219,7 +223,8 @@ if __name__ == "__main__": lst_tpl_results = dct_data_lst_tpl_results[data_name] data_name_short = dct_data_short[data_name] s_data_tmp = "{}".format(data_name_short) - s_data_tmp += "({})".format(dct_data_metric[data_name]) + # add metric in parenthesis + # s_data_tmp += "({})".format(dct_data_metric[data_name]) # s_data_tmp = "\\texttt{{ {} }}".format(data_name_short) # s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name) s_data_tmp += " "*(nb_spaces - len(data_name_short)) @@ -292,8 +297,8 @@ if __name__ == "__main__": print("\\midrule") if idx_lin == 6: print("\\midrule") - if lst_data_ordered[idx_lin-1] == "Diamonds": - print("%", end="") + # if lst_data_ordered[idx_lin-1] == "Diamonds": + # print("%", end="") line_print = " ".join(list(lin)) line_print = line_print.rstrip(" &") + "\\\\" print(line_print) diff --git a/code/vizualisation/csv_to_table_these.py b/code/vizualisation/csv_to_table_these.py new file mode 100644 index 0000000..8d4dbee --- /dev/null +++ b/code/vizualisation/csv_to_table_these.py @@ -0,0 +1,323 @@ +import copy + +from dotenv import load_dotenv, find_dotenv +from pathlib import Path +import os +import pandas as pd +import numpy as np +from pprint import pprint +import plotly.graph_objects as go +import plotly.io as pio +from collections import defaultdict + +lst_skip_strategy = ["None", "OMP Distillation", "OMP Distillation w/o weights"] +lst_skip_task = ["correlation", "coherence"] +# lst_skip_task = [] +lst_skip_subset = ["train/dev"] +# 
diff --git a/code/vizualisation/csv_to_table_these.py b/code/vizualisation/csv_to_table_these.py
new file mode 100644
index 0000000..8d4dbee
--- /dev/null
+++ b/code/vizualisation/csv_to_table_these.py
@@ -0,0 +1,323 @@
+import copy
+
+from dotenv import load_dotenv, find_dotenv
+from pathlib import Path
+import os
+import pandas as pd
+import numpy as np
+from pprint import pprint
+import plotly.graph_objects as go
+import plotly.io as pio
+from collections import defaultdict
+
+lst_skip_strategy = ["None", "OMP Distillation", "OMP Distillation w/o weights"]
+lst_skip_task = ["correlation", "coherence"]
+# lst_skip_task = []
+lst_skip_subset = ["train/dev"]
+# lst_skip_subset = []
+
+tasks = [
+    # "train_score",
+    # "dev_score",
+    "test_score",
+    # "coherence",
+    # "correlation"
+]
+
+dct_score_metric_fancy = {
+    "accuracy_score": "% Accuracy",
+    "mean_squared_error": "MSE"
+}
+
+dct_score_metric_best_fct = {
+    "accuracy_score": np.argmax,
+    "mean_squared_error": np.argmin
+}
+
+# dct_data_short = {
+#     "Spambase": "Spambase",
+#     "Diamonds": "Diamonds",
+#     "Diabetes": "Diabetes",
+#     "Steel Plates": "Steel P.",
+#     "KR-VS-KP": "KR-VS-KP",
+#     "Breast Cancer": "Breast C.",
+#     "Kin8nm": "Kin8nm",
+#     "LFW Pairs": "LFW P.",
+#     "Gamma": "Gamma",
+#     "California Housing": "California H.",
+#     "Boston": "Boston",
+# }
+
+dct_data_short = {
+    "Spambase": "Sp. B.",
+    "Diamonds": "Diam.",
+    "Diabetes": "Diab.",
+    "Steel Plates": "St. P.",
+    "KR-VS-KP": "KR-KP",
+    "Breast Cancer": "B. C.",
+    "Kin8nm": "Kin.",
+    "LFW Pairs": "LFW P.",
+    "Gamma": "Gam.",
+    "California Housing": "C. H.",
+    "Boston": "Bos.",
+}
+
+dct_data_best = {
+    "Spambase": np.max,
+    "Diamonds": np.min,
+    "Diabetes": np.min,
+    "Steel Plates": np.max,
+    "KR-VS-KP": np.max,
+    "Breast Cancer": np.max,
+    "Kin8nm": np.min,
+    "LFW Pairs": np.max,
+    "Gamma": np.max,
+    "California Housing": np.min,
+    "Boston": np.min,
+}
+dct_data_metric = {
+    "Spambase": "Acc.",
+    "Diamonds": "MSE",
+    "Diabetes": "MSE",
+    "Steel Plates": "Acc.",
+    "KR-VS-KP": "Acc.",
+    "Breast Cancer": "Acc.",
+    "Kin8nm": "MSE",
+    "LFW Pairs": "Acc.",
+    "Gamma": "Acc.",
+    "California Housing": "MSE",
+    "Boston": "MSE",
+}
+
+
+def get_max_from_df(df, best_fct):
+    nb_to_consider = 10
+    df.sort_values(by="forest_size", inplace=True)
+    df_groupby_forest_size = df.groupby(['forest_size'])
+    forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)[:nb_to_consider]
+    mean_value = df_groupby_forest_size[task].mean().values[:nb_to_consider]
+    std_value = df_groupby_forest_size[task].std().values[:nb_to_consider]
+
+    try:
+        argmax = best_fct(mean_value)
+    except:
+        print("no results", strat, data_name, task, subset_name)
+        return -1, -1, -1
+
+    max_mean = mean_value[argmax]
+    max_std = std_value[argmax]
+    max_forest_size = forest_sizes[argmax]
+
+    return max_forest_size, max_mean, max_std
+
+
+if __name__ == "__main__":
+
+    load_dotenv(find_dotenv('.env'))
+    dir_name = "bolsonaro_models_29-03-20_v3_2"
+    dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
+
+    out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
+
+    input_dir_file = dir_path / "results.csv"
+    df_results = pd.read_csv(open(input_dir_file, 'rb'))
+
+    datasets = set(df_results["dataset"].values)
+    strategies = sorted(list(set(df_results["strategy"].values) - set(lst_skip_strategy)))
+    subsets = set(df_results["subset"].values)
+
+    r"""
+    \begin{table}[!h]
+    \centering
+    \begin{tabular}{l{}}
+    \toprule
+    \multicolumn{1}{c}{\textbf{Dataset}} & \textbf{Data dim.} $\datadim$ & \textbf{\# classes} & \textbf{Train size} $\nexamples$ & \textbf{Test size} $\nexamples'$ \\ \midrule
+    \texttt{MNIST}~\cite{lecun-mnisthandwrittendigit-2010} & 784 & 10 & 60 000 & 10 000 \\ %\hline
+    \texttt{Kddcup99}~\cite{Dua:2019} & 116 & 23 & 4 893 431 & 5 000 \\
+    \bottomrule
+    \end{tabular}
+    \caption{Main features of the datasets. Discrete, unordered attributes for dataset Kddcup99 have been encoded as one-hot attributes.}
+    \label{table:data}
+    \end{table}
+    """
+
+    for task in tasks:
+        if task in lst_skip_task:
+            continue
+
+        dct_data_lst_tpl_results = defaultdict(lambda: [])
+
+        lst_strats = []
+        for data_name in datasets:
+            df_data = df_results[df_results["dataset"] == data_name]
+            score_metric_name = df_data["score_metric"].values[0]
+
+            for subset_name in subsets:
+                if subset_name in lst_skip_subset:
+                    continue
+                df_subset = df_data[df_data["subset"] == subset_name]
+
+                ##################
+                # all techniques #
+                ##################
+                for strat in strategies:
+                    if strat in lst_skip_strategy:
+                        continue
+                    df_strat = df_subset[df_subset["strategy"] == strat]
+
+                    if "OMP" in strat:
+                        ###########################
+                        # processing without weights #
+                        ###########################
+                        df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
+
+                        strat_woweights = "{} w/o weights".format(strat)
+                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
+                        if strat_woweights not in lst_strats: lst_strats.append(strat_woweights)
+
+                    #################################
+                    # general processing wo_weights #
+                    #################################
+                    df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
+
+
+                    dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
+                    if strat not in lst_strats: lst_strats.append(strat)
+
+                title = "{} {} {}".format(task, data_name, subset_name)
+
+                # fig.show()
+                sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
+                filename = sanitize(title)
+                # output_dir = out_dir / sanitize(subset_name) / sanitize(task)
+                # output_dir.mkdir(parents=True, exist_ok=True)
+                # fig.write_image(str((output_dir / filename).absolute()) + ".png")
+
+
+        # pprint(dct_data_lst_tpl_results)
+
+        lst_data_ordered = [
+            "Diamonds",
+            "Diabetes",
+            "Kin8nm",
+            "California Housing",
+            "Boston",
+            "Spambase",
+            "Steel Plates",
+            "KR-VS-KP",
+            "Breast Cancer",
+            "LFW Pairs",
+            "Gamma"
+        ]
+
+
+        arr_results_str = np.empty((len(lst_strats)+1, len(datasets) + 1), dtype="object")
+        nb_spaces = 25
+        dct_strat_str = defaultdict(lambda: [])
+        s_empty = "{}" + " "*(nb_spaces-2) + " & "
+        arr_results_str[0][0] = s_empty
+        # arr_results_str[0][1] = s_empty
+        for idx_data, data_name in enumerate(lst_data_ordered):
+            lst_tpl_results = dct_data_lst_tpl_results[data_name]
+            data_name_short = dct_data_short[data_name]
+            # s_data_tmp = "{}".format(data_name_short)
+            # add metric in parentheses
+            # s_data_tmp += "({})".format(dct_data_metric[data_name])
+            # s_data_tmp = "\\texttt{{ {} }}".format(data_name_short)
+            s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name)
+            s_data_tmp += " "*(nb_spaces - len(s_data_tmp))
+            s_data_tmp += " & "
+            arr_results_str[0, idx_data + 1] = s_data_tmp
+
+
+            array_results = np.array(lst_tpl_results)
+            best_result_perf = dct_data_best[data_name](array_results[:, 1])
+            best_result_perf_indexes = np.argwhere(array_results[:, 1] == best_result_perf)
+
+            copye_array_results = copy.deepcopy(array_results)
+            if dct_data_best[data_name] is np.min:
+                copye_array_results[best_result_perf_indexes] = np.inf
+            else:
+                copye_array_results[best_result_perf_indexes] = -np.inf
+
+            best_result_perf_2 = dct_data_best[data_name](copye_array_results[:, 1])
+            best_result_perf_indexes_2 = np.argwhere(copye_array_results[:, 1] == best_result_perf_2)
+
+            best_result_prune = np.min(array_results[:, 0])
+            best_result_prune_indexes = np.argwhere(array_results[:, 0] == best_result_prune)
+
+            for idx_strat, tpl_results in enumerate(array_results):
+                str_strat = "\\texttt{{ {} }}".format(lst_strats[idx_strat])
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(lst_strats[idx_strat])
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ \\texttt{{ {} }} }} }}".format("}\\\\ \\texttt{".join(lst_strats[idx_strat].split(" ", 1)))
+                # str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ {} }} }} ".format("\\\\".join(lst_strats[idx_strat].split(" ", 1)))
+                str_strat += " " * (nb_spaces - len(str_strat)) + " & "
+                arr_results_str[idx_strat+1, 0] = str_strat
+
+                # str_header = " {} & #tree &".format(dct_data_metric[data_name])
+                # arr_results_str[idx_strat + 1, 1] = str_header
+
+                best_forest_size = tpl_results[0]
+                best_mean = tpl_results[1]
+                best_std = tpl_results[2]
+                if dct_data_metric[data_name] == "Acc.":
+                    str_perf = "{:.2f}\\%".format(best_mean * 100)
+                else:
+                    str_perf = "{:.3E}".format(best_mean)
+
+                str_prune = "{:d}".format(int(best_forest_size))
+
+                if idx_strat in best_result_perf_indexes:
+                    # str_formating = "\\textbf{{ {} }}".format(str_result_loc)
+                    str_formating = "\\textbf[{}]"
+                    # str_formating = "\\textbf{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
+                elif idx_strat in best_result_perf_indexes_2:
+                    str_formating = "\\underline[{}]"
+                    # str_formating = "\\underline{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
+                else:
+                    str_formating = "{}"
+                    # str_formating = "{:.3E}(~{:.3E})".format(best_mean, best_std)
+
+                if idx_strat in best_result_prune_indexes:
+                    str_formating = str_formating.format("\\textit[{}]")
+                    # str_prune = " & \\textit{{ {:d} }}".format(int(best_forest_size))
+                # else:
+                #     str_prune = " & {:d}".format(int(best_forest_size))
+                str_result = str_formating.format(str_perf) + " & " + str_formating.format(str_prune)
+                str_result += " "*(nb_spaces - len(str_result))
+                str_result = str_result.replace("[", "{").replace("]", "}")
+
+                arr_results_str[idx_strat+1, idx_data+1] = str_result + " & "
+                dct_strat_str[lst_strats[idx_strat]].append(str_result)
+
+        # arr_results_str = arr_results_str.T
+
+        arr_results_str_classif = arr_results_str[:, 6:]
+        arr_results_str_classif = np.hstack([arr_results_str[:, 0:1], arr_results_str_classif])
+        arr_results_str_reg = arr_results_str[:, :6]
+
+        for arr_results_str in [arr_results_str_classif, arr_results_str_reg]:
+            print(r"\toprule")
+            for idx_lin, lin in enumerate(arr_results_str):
+                if idx_lin == 1:
+                    print("\\midrule")
+                # if idx_lin == 6:
+                #     print("\\midrule")
+                # if lst_data_ordered[idx_lin-1] == "Diamonds":
+                #     print("%", end="")
+                line_print = " ".join(list(lin))
+                line_print = line_print.rstrip(" &") + "\\\\"
+                print(line_print)
+            print(r"\bottomrule")
+        # s_data = s_data.rstrip(" &") + "\\\\"
+        # print(s_data)
+        # for strat, lst_str_results in dct_strat_str.items():
+        #     str_strat = "\\texttt{{ {} }}".format(strat)
+        #     str_strat += " "*(nb_spaces - len(str_strat))
+        #     str_strat += " & " + " & ".join(lst_str_results)
+        #     str_strat += "\\\\"
+        #     print(str_strat)
+
+    # exit()
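
The cell-formatting loop in csv_to_table_these.py relies on a small trick worth spelling out: LaTeX markup is first built with square brackets (\textbf[{}]) so that str.format can be applied repeatedly without escaping LaTeX's braces, and the brackets are only swapped for braces once the cell is final. A standalone illustration with made-up cell values:

# Best mean score -> bold; also fewest selected trees -> italic nested inside.
str_perf, str_prune = "92.31\\%", "12"

str_formating = "\\textbf[{}]"
str_formating = str_formating.format("\\textit[{}]")  # -> \textbf[\textit[{}]]

cell = str_formating.format(str_perf) + " & " + str_formating.format(str_prune)
cell = cell.replace("[", "{").replace("]", "}")
print(cell)  # \textbf{\textit{92.31\%}} & \textbf{\textit{12}}
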
diff --git a/code/vizualisation/results_to_csv.py b/code/vizualisation/results_to_csv.py
index db43618..53c7785 100644
--- a/code/vizualisation/results_to_csv.py
+++ b/code/vizualisation/results_to_csv.py
@@ -60,7 +60,7 @@ dct_dataset_fancy = {
 }
 
 dct_dataset_base_forest_size = {
-    "boston": 1000,
+    "boston": 100,
     "breast_cancer": 1000,
     "california_housing": 1000,
     "diabetes": 108,
@@ -132,7 +132,7 @@ if __name__ == "__main__":
             dct_results["wo_weights"].append(bool_wo_weights)
             dct_results["base_forest_size"].append(dct_dataset_base_forest_size[dataset])
             pruning_percent = forest_size / dct_dataset_base_forest_size[dataset]
-            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=1))
+            dct_results["pruning_percent"].append(np.round(pruning_percent, decimals=2))
 
             dct_nb_val_scores = {}
--
GitLab
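
One note on the decimals change in results_to_csv.py: pruning_percent later serves as a groupby key in csv_to_figure.py, so the rounding grain decides which forest sizes share a bucket. With one decimal, nearby sizes collapse and their scores get averaged together; two decimals keeps them apart. An illustrative check — the forest sizes are made up, only the 108 base size comes from the hunk above:

import numpy as np

base_forest_size = 108  # the "diabetes" entry above
forest_sizes = np.array([8, 10, 12])
pruning_percent = forest_sizes / base_forest_size

print(np.round(pruning_percent, decimals=1))  # [0.1 0.1 0.1]    -> one merged bucket
print(np.round(pruning_percent, decimals=2))  # [0.07 0.09 0.11] -> three distinct buckets
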