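    # csv_to_table.py
    #
    # Reads the aggregated experiment results (results.csv) and prints the body of a
    # LaTeX table: for each dataset and pruning strategy, the best mean score over
    # forest sizes together with the forest size that achieved it.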
    import copy
    
    from dotenv import load_dotenv, find_dotenv
    from pathlib import Path
    import os
    import pandas as pd
    import numpy as np
    from pprint import pprint
    import plotly.graph_objects as go
    import plotly.io as pio
    from collections import defaultdict
    
    lst_skip_strategy = ["None", "OMP Distillation", "OMP Distillation w/o weights"]
    lst_skip_task = ["correlation", "coherence"]
    # lst_skip_task = []
    lst_skip_subset = ["train/dev"]
    # lst_skip_subset = []
    
    tasks = [
        # "train_score",
        # "dev_score",
        "test_score",
        # "coherence",
        # "correlation"
    ]
    
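    # Display name and arg-best function for each score metric: accuracy is
    # maximised (argmax), mean squared error is minimised (argmin).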
    dct_score_metric_fancy = {
        "accuracy_score": "% Accuracy",
        "mean_squared_error": "MSE"
    }
    dct_score_metric_best_fct = {
        "accuracy_score": np.argmax,
        "mean_squared_error": np.argmin
    }
    
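    # Abbreviated dataset names used in the table header.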
    dct_data_short = {
        "Spambase": "Spambase",
        "Diamonds": "Diamonds",
        "Diabetes": "Diabetes",
        "Steel Plates": "Steel P.",
        "KR-VS-KP": "KR-VS-KP",
        "Breast Cancer": "Breast C.",
        "Kin8nm": "Kin8nm",
        "LFW Pairs": "LFW P.",
        "Gamma": "Gamma",
        "California Housing": "California H.",
        "Boston": "Boston",
    }
    
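    # Whether a higher (np.max) or lower (np.min) score is better for each dataset.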
    dct_data_best = {
        "Spambase": np.max,
        "Diamonds": np.min,
        "Diabetes": np.min,
        "Steel Plates": np.max,
        "KR-VS-KP": np.max,
        "Breast Cancer": np.max,
        "Kin8nm": np.min,
        "LFW Pairs": np.max,
        "Gamma": np.max,
        "California Housing": np.min,
        "Boston": np.min,
    }
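    # Score metric reported for each dataset: accuracy for classification, MSE for regression.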
    dct_data_metric = {
        "Spambase": "Acc.",
        "Diamonds": "MSE",
        "Diabetes": "MSE",
        "Steel Plates": "Acc.",
        "KR-VS-KP": "Acc.",
        "Breast Cancer": "Acc.",
        "Kin8nm": "MSE",
        "LFW Pairs": "Acc.",
        "Gamma": "Acc.",
        "California Housing": "MSE",
        "Boston": "MSE",
    }
    
    
    
    def get_max_from_df(df, best_fct):
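        """Return the (forest size, mean score, std) triple of the best forest size.

        Scores of the `task` column are averaged per forest size over the runs in
        `df`, only the first `nb_to_consider` forest sizes are kept, and `best_fct`
        (np.argmax or np.argmin) picks the best mean.  Returns (-1, -1, -1) when no
        result is available.  Relies on the globals `task`, `strat`, `data_name`
        and `subset_name` set in the main loop below.
        """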
    
        nb_to_consider = 10
    
        df.sort_values(by="forest_size", inplace=True)
        df_groupby_forest_size = df.groupby(['forest_size'])
        forest_sizes = list(df_groupby_forest_size["forest_size"].mean().values)[:nb_to_consider]
        mean_value = df_groupby_forest_size[task].mean().values[:nb_to_consider]
        std_value = df_groupby_forest_size[task].std().values[:nb_to_consider]
    
        try:
            argmax = best_fct(mean_value)
        except ValueError:
            # np.argmax / np.argmin raise ValueError on an empty sequence, i.e. when no
            # run matches this strategy / dataset / subset combination.
            print("no results", strat, data_name, task, subset_name)
            return -1, -1, -1
    
        max_mean = mean_value[argmax]
        max_std = std_value[argmax]
        max_forest_size = forest_sizes[argmax]
    
        return max_forest_size, max_mean, max_std
    
    
    
    if __name__ == "__main__":
    
        load_dotenv(find_dotenv('.env'))
        dir_name = "bolsonaro_models_25-03-20"
        dir_path = Path(os.environ["project_dir"]) / "results" / dir_name
    
        out_dir = Path(os.environ["project_dir"]) / "reports/figures" / dir_name
    
        input_dir_file = dir_path / "results.csv"
        df_results = pd.read_csv(input_dir_file)
    
        datasets = set(df_results["dataset"].values)
        strategies = sorted(list(set(df_results["strategy"].values) - set(lst_skip_strategy)))
        subsets = set(df_results["subset"].values)
    
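        # Reference only: LaTeX skeleton of the target table; this string is not used
        # by the code below.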
        r"""
        \begin{table}[!h]
        \centering
        \begin{tabular}{l{}}
        \toprule
        \multicolumn{1}{c}{\textbf{Dataset}} & \textbf{Data dim.} $\datadim$        & \textbf{\# classes} & \textbf{Train size} $\nexamples$ & \textbf{Test size} $\nexamples'$ \\ \midrule
        \texttt{MNIST}~\cite{lecun-mnisthandwrittendigit-2010}                      & 784    & 10        & 60 000    & 10 000               \\ %\hline
        \texttt{Kddcup99}~\cite{Dua:2019}                                           & 116    & 23      & 4 893 431      & 5 000               \\ 
        \bottomrule
        \end{tabular}
        \caption{Main features of the datasets. Discrete, unordered attributes for dataset Kddcup99 have been encoded as one-hot attributes.}
        \label{table:data}
        \end{table}
        """
    
    
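        # One table per task: gather, for every dataset / subset / strategy combination,
        # the best mean score over forest sizes, then print the corresponding LaTeX rows.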
        for task in tasks:
            if task in lst_skip_task:
                continue
    
            # dataset name -> list of (best forest size, mean score, std) tuples, one per strategy
            dct_data_lst_tpl_results = defaultdict(list)
    
            lst_strats = []
            for data_name in datasets:
                df_data = df_results[df_results["dataset"] == data_name]
                score_metric_name = df_data["score_metric"].values[0]
    
                for subset_name in subsets:
                    if subset_name in lst_skip_subset:
                        continue
                    df_subset = df_data[df_data["subset"] == subset_name]
    
                    ##################
                    # all techniques #
                    ##################
                    for strat in strategies:
                        if strat in lst_skip_strategy:
                            continue
                        df_strat = df_subset[df_subset["strategy"] == strat]
    
                        if "OMP" in strat:
                            ###########################
                            # processing with weights #
                            ###########################
                            df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
                            if data_name == "Boston" and subset_name == "train+dev/train+dev":
                                df_strat_wo_weights = df_strat_wo_weights[df_strat_wo_weights["forest_size"] < 400]
                            dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
                            if strat not in lst_strats: lst_strats.append(strat)
    
                        if "OMP" in strat and subset_name == "train/dev":
                            continue
                        elif "Random" not in strat and subset_name == "train/dev":
                            continue
    
                        ##################################
                        # general processing w/o weights #
                        ##################################
                        if "Random" in strat:
                            df_strat_wo_weights = df_strat[df_strat["wo_weights"] == False]
                        else:
                            df_strat_wo_weights = df_strat[df_strat["wo_weights"] == True]
    
                        if "OMP" in strat:
                            strat = "{} w/o weights".format(strat)
    
                        dct_data_lst_tpl_results[data_name].append(get_max_from_df(df_strat_wo_weights, dct_score_metric_best_fct[score_metric_name]))
                        if strat not in lst_strats: lst_strats.append(strat)
    
                    title = "{} {} {}".format(task, data_name, subset_name)
    
                    # fig.show()
                    sanitize = lambda x: x.replace(" ", "_").replace("/", "_").replace("+", "_")
                    filename = sanitize(title)
                    # output_dir = out_dir / sanitize(subset_name) / sanitize(task)
                    # output_dir.mkdir(parents=True, exist_ok=True)
                    # fig.write_image(str((output_dir / filename).absolute()) + ".png")
    
    
            # pprint(dct_data_lst_tpl_results)
    
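            # Dataset order in the printed table: regression datasets (MSE) first, then
            # classification datasets (accuracy).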
            lst_data_ordered = [
                "Diamonds",
                "Diabetes",
                "Kin8nm",
                "California Housing",
                "Boston",
                "Spambase",
                "Steel Plates",
                "KR-VS-KP",
                "Breast Cancer",
                "LFW Pairs",
                "Gamma"
            ]
    
    
            # Table of LaTeX cell strings: row 0 holds the dataset header, column 0 the
            # strategy names.  nb_spaces pads each cell so the generated source stays aligned.
            arr_results_str = np.empty((len(lst_strats) + 1, len(datasets) + 1), dtype="object")
            nb_spaces = 25
            dct_strat_str = defaultdict(list)
            s_empty = "{}" + " " * (nb_spaces - 2) + " & "
            arr_results_str[0][0] = s_empty
            # arr_results_str[0][1] = s_empty
            for idx_data, data_name in enumerate(lst_data_ordered):
                lst_tpl_results = dct_data_lst_tpl_results[data_name]
                data_name_short = dct_data_short[data_name]
                s_data_tmp = "{}".format(data_name_short)
                s_data_tmp += "({})".format(dct_data_metric[data_name])
                # s_data_tmp = "\\texttt{{ {} }}".format(data_name_short)
                # s_data_tmp = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(data_name)
                s_data_tmp += " "*(nb_spaces - len(data_name_short))
                s_data_tmp += " & "
                arr_results_str[0, idx_data + 1] = s_data_tmp
    
                array_results = np.array(lst_tpl_results)
                best_result_perf = dct_data_best[data_name](array_results[:, 1])
                best_result_perf_indexes = np.argwhere(array_results[:, 1] == best_result_perf)
    
                # Mask the best value(s) so that the second-best score can then be found.
                copy_array_results = copy.deepcopy(array_results)
                if dct_data_best[data_name] is np.min:
                    copy_array_results[best_result_perf_indexes] = np.inf
                else:
                    copy_array_results[best_result_perf_indexes] = -np.inf

                best_result_perf_2 = dct_data_best[data_name](copy_array_results[:, 1])
                best_result_perf_indexes_2 = np.argwhere(copy_array_results[:, 1] == best_result_perf_2)
    
                best_result_prune = np.min(array_results[:, 0])
                best_result_prune_indexes = np.argwhere(array_results[:, 0] == best_result_prune)
    
                for idx_strat, tpl_results in enumerate(array_results):
                    # str_strat = "\\texttt{{ {} }}".format(lst_strats[idx_strat])
                    # str_strat = "\\multicolumn{{2}}{{c}}{{ \\texttt{{ {} }} }}".format(lst_strats[idx_strat])
                    # str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ \\texttt{{ {} }} }} }}".format("}\\\\ \\texttt{".join(lst_strats[idx_strat].split(" ", 1)))
                    str_strat = "\\multicolumn{{2}}{{c}}{{ \\thead{{ {} }} }} ".format("\\\\".join(lst_strats[idx_strat].split(" ", 1)))
                    str_strat += " " * (nb_spaces - len(str_strat)) + " & "
                    arr_results_str[idx_strat+1, 0] =  str_strat
    
                    # str_header = " {} & #tree &".format(dct_data_metric[data_name])
                    # arr_results_str[idx_strat + 1, 1] = str_header
    
                    best_forest_size = tpl_results[0]
                    best_mean = tpl_results[1]
                    best_std = tpl_results[2]
                    if dct_data_metric[data_name] == "Acc.":
                        str_perf = "{:.2f}\\%".format(best_mean * 100)
                    else:
                        str_perf = "{:.3E}".format(best_mean)
    
                    str_prune = "{:d}".format(int(best_forest_size))
    
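                    # Formatting rules: best score in bold, second-best underlined, and the
                    # smallest pruned forest size in italics.  Square brackets stand in for
                    # braces so that nested str.format calls stay simple; they are turned
                    # back into braces just before the cell is stored.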
                    if idx_strat in best_result_perf_indexes:
                        # str_formating = "\\textbf{{ {} }}".format(str_result_loc)
                        str_formating = "\\textbf[{}]"
                        # str_formating = "\\textbf{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
                    elif idx_strat in best_result_perf_indexes_2:
                        str_formating = "\\underline[{}]"
                        # str_formating = "\\underline{{ {:.3E} }}(\\~{:.3E})".format(best_mean, best_std)
                    else:
                        str_formating = "{}"
                        # str_formating = "{:.3E}(~{:.3E})".format(best_mean, best_std)
    
                    if idx_strat in best_result_prune_indexes:
                        str_formating = str_formating.format("\\textit[{}]")
                        # str_prune = " & \\textit{{ {:d} }}".format(int(best_forest_size))
                    # else:
                    #     str_prune = " & {:d}".format(int(best_forest_size))
                    str_result = str_formating.format(str_perf) + " & " + str_formating.format(str_prune)
                    str_result += " "*(nb_spaces - len(str_result))
                    str_result = str_result.replace("[", "{").replace("]", "}")
    
                    arr_results_str[idx_strat+1, idx_data+1] = str_result + " & "
                    dct_strat_str[lst_strats[idx_strat]].append(str_result)
    
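            # Transpose so each printed line corresponds to a dataset and each column to
            # a strategy.  A \midrule is printed after the header line and after the
            # regression block, and the Diamonds line is commented out with a leading "%".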
            arr_results_str = arr_results_str.T
            for idx_lin, lin in enumerate(arr_results_str):
                if idx_lin == 1:
                    print("\\midrule")
                if idx_lin == 6:
                    print("\\midrule")
    
                if lst_data_ordered[idx_lin-1] == "Diamonds":
                    print("%", end="")
    
                line_print = " ".join(list(lin))
                line_print = line_print.rstrip(" &") + "\\\\"
                print(line_print)
    
            # s_data = s_data.rstrip(" &") + "\\\\"
            # print(s_data)
            # for strat, lst_str_results in dct_strat_str.items():
            #     str_strat = "\\texttt{{ {} }}".format(strat)
            #     str_strat += " "*(nb_spaces - len(str_strat))
            #     str_strat += " & " + " & ".join(lst_str_results)
            #     str_strat += "\\\\"
            #     print(str_strat)
    
                    # exit()