# results_to_csv.py (5.63 KiB) — authored by Luc Giffon.
# Aggregates pickled per-experiment result files into a single results.csv.
from pathlib import Path
import os
import pandas as pd
from pprint import pprint
import pickle
from collections import defaultdict
import numpy as np
from dotenv import load_dotenv, find_dotenv
# Experiment ids 1-8 were fitted on train+dev (and scored on train+dev);
# ids 9-16 were fitted on train and scored on dev.
dct_experiment_id_subset = {str(idx): "train+dev/train+dev" for idx in range(1, 9)}
dct_experiment_id_subset.update({str(idx): "train/dev" for idx in range(9, 17)})

# Fancy display names for the pruning strategies.
NONE = 'None'
Random = 'Random'
OMP = 'OMP'
OMP_Distillation = 'OMP Distillation'
Kmeans = 'Kmeans'
Zhang_Similarities = 'Zhang Similarities'
Zhang_Predictions = 'Zhang Predictions'
Ensemble = 'Ensemble'

# Ids 1-8 and 9-16 use the same eight techniques, in the same order,
# so the mapping is the technique list repeated twice.
_ordered_techniques = [NONE, Random, OMP, OMP_Distillation,
                       Kmeans, Zhang_Similarities, Zhang_Predictions, Ensemble]
dct_experiment_id_technique = {str(idx + 1): technique
                               for idx, technique in enumerate(_ordered_techniques * 2)}

# Dataset directory name -> human-readable name used in the CSV.
dct_dataset_fancy = {
    "boston": "Boston",
    "breast_cancer": "Breast Cancer",
    "california_housing": "California Housing",
    "diabetes": "Diabetes",
    "diamonds": "Diamonds",
    "digits": "Digits",
    "iris": "Iris",
    "kin8nm": "Kin8nm",
    "kr-vs-kp": "KR-VS-KP",
    "olivetti_faces": "Olivetti Faces",
    "spambase": "Spambase",
    "steel-plates": "Steel Plates",
    "wine": "Wine",
    "gamma": "Gamma",
    "lfw_pairs": "LFW Pairs"
}

# Result attributes that must not be written to the CSV.
skip_attributes = ["datetime"]
# Experiment ids for which no coherence / correlation value was recorded.
set_no_coherence = set()
set_no_corr = set()
if __name__ == "__main__":
    # Walk the results directory, load every pickled result file, and flatten
    # everything into one DataFrame written to results.csv.
    load_dotenv(find_dotenv('.env'))
    dir_name = "results/bolsonaro_models_25-03-20"
    dir_path = Path(os.environ["project_dir"]) / dir_name
    output_dir_file = dir_path / "results.csv"

    # Column name -> list of values, one entry per result file.
    dct_results = defaultdict(lambda: [])

    for root, dirs, files in os.walk(dir_path, topdown=False):
        for file_str in files:
            if file_str == "results.csv":
                # Skip the output of a previous run of this script.
                continue
            path_dir = Path(root)
            path_file = path_dir / file_str
            print(path_file)
            try:
                with open(path_file, 'rb') as pickle_file:
                    obj_results = pickle.load(pickle_file)
            except Exception:
                # BUGFIX: the original bare `except:` fell through without
                # skipping the file, which either raised a NameError on the
                # first file or silently reused the previous obj_results.
                print("problem loading pickle file {}".format(path_file))
                continue

            # The directory layout presumably encodes the metadata as
            # .../<dataset>/<id_xp>/<?>/<seed>/<?>/<forest_size>
            # (indices below rely on that layout — confirm against the runner).
            path_dir_split = str(path_dir).split("/")

            # "no_weights" runs store the forest size as "<size>_..." in the
            # leaf directory name instead of the plain size.
            bool_wo_weights = "no_weights" in str(path_file)
            if bool_wo_weights:
                forest_size = int(path_dir_split[-1].split("_")[0])
            else:
                forest_size = int(path_dir_split[-1])
            seed = int(path_dir_split[-3])
            id_xp = str(path_dir_split[-5])
            dataset = str(path_dir_split[-6])

            dct_results["forest_size"].append(forest_size)
            dct_results["seed"].append(seed)
            dct_results["dataset"].append(dct_dataset_fancy[dataset])
            dct_results["subset"].append(dct_experiment_id_subset[id_xp])
            dct_results["strategy"].append(dct_experiment_id_technique[id_xp])
            dct_results["wo_weights"].append(bool_wo_weights)

            for key_result, val_result in obj_results.items():
                if key_result in skip_attributes:
                    continue
                if key_result == "model_weights":
                    # An empty string marks a model with no stored weights.
                    # BUGFIX: compare via isinstance so a numpy weight vector
                    # is never compared elementwise against "".
                    if isinstance(val_result, str) and val_result == "":
                        dct_results["negative-percentage"].append(None)
                    else:
                        nb_lt_zero = int(np.sum(val_result < 0))
                        nb_gt_zero = int(np.sum(val_result > 0))
                        nb_non_zero = nb_lt_zero + nb_gt_zero
                        # BUGFIX: an all-zero weight vector previously caused
                        # a division by zero; record None instead.
                        if nb_non_zero == 0:
                            dct_results["negative-percentage"].append(None)
                        else:
                            percentage_lt_zero = nb_lt_zero / nb_non_zero
                            dct_results["negative-percentage"].append(percentage_lt_zero)
                # Normalize "no value recorded" (empty string) to None.
                if isinstance(val_result, str) and val_result == "":
                    val_result = None
                # Track which experiments lack coherence/correlation values.
                if key_result == "coherence" and val_result is None:
                    set_no_coherence.add(id_xp)
                if key_result == "correlation" and val_result is None:
                    set_no_corr.add(id_xp)
                dct_results[key_result].append(val_result)

            # Example of a pickled result object, for reference:
            # {'model_weights': '',
            #  'training_time': 0.0032033920288085938,
            #  'datetime': datetime.datetime(2020, 3, 25, 0, 28, 34, 938400),
            #  'train_score': 1.0,
            #  'dev_score': 0.978021978021978,
            #  'test_score': 0.9736842105263158,
            #  'train_score_base': 1.0,
            #  'dev_score_base': 0.978021978021978,
            #  'test_score_base': 0.9736842105263158,
            #  'score_metric': 'accuracy_score',
            #  'base_score_metric': 'accuracy_score',
            #  'coherence': 0.9892031711775613,
            #  'correlation': 0.9510700193340448}

    print("coh", set_no_coherence, len(set_no_coherence))
    print("cor", set_no_corr, len(set_no_corr))

    final_df = pd.DataFrame.from_dict(dct_results)
    final_df.to_csv(output_dir_file)
    print(final_df)