"""Aggregate the pickled per-run experiment results into a single results.csv."""

from pathlib import Path
import os
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
from dotenv import load_dotenv, find_dotenv

# Experiment ids 1-8 were trained and scored on train+dev;
# ids 9-16 were trained on train and scored on dev.
dct_experiment_id_subset = {str(idx): "train+dev/train+dev" for idx in range(1, 9)}
dct_experiment_id_subset.update({str(idx): "train/dev" for idx in range(9, 17)})

NONE = 'None'
Random = 'Random'
OMP = 'OMP'
OMP_Distillation = 'OMP Distillation'
Kmeans = 'Kmeans'
Zhang_Similarities = 'Zhang Similarities'
Zhang_Predictions = 'Zhang Predictions'
Ensemble = 'Ensemble'

# Pruning technique used by each experiment id (same ordering in both subsets).
dct_experiment_id_technique = {
    "1": NONE,
    "2": Random,
    "3": OMP,
    "4": OMP_Distillation,
    "5": Kmeans,
    "6": Zhang_Similarities,
    "7": Zhang_Predictions,
    "8": Ensemble,
    "9": NONE,
    "10": Random,
    "11": OMP,
    "12": OMP_Distillation,
    "13": Kmeans,
    "14": Zhang_Similarities,
    "15": Zhang_Predictions,
    "16": Ensemble,
}

# Pretty-printed dataset names for the final table.
dct_dataset_fancy = {
    "boston": "Boston",
    "breast_cancer": "Breast Cancer",
    "california_housing": "California Housing",
    "diabetes": "Diabetes",
    "diamonds": "Diamonds",
    "digits": "Digits",
    "iris": "Iris",
    "kin8nm": "Kin8nm",
    "kr-vs-kp": "KR-VS-KP",
    "olivetti_faces": "Olivetti Faces",
    "spambase": "Spambase",
    "steel-plates": "Steel Plates",
    "wine": "Wine",
    "gamma": "Gamma",
    "lfw_pairs": "LFW Pairs",
}

skip_attributes = ["datetime"]  # result keys not copied into the CSV
set_no_coherence = set()  # experiment ids with a missing coherence value
set_no_corr = set()       # experiment ids with a missing correlation value

if __name__ == "__main__":
    load_dotenv(find_dotenv('.env'))
    dir_name = "results/bolsonaro_models_25-03-20"
    dir_path = Path(os.environ["project_dir"]) / dir_name
    output_dir_file = dir_path / "results.csv"

    dct_results = defaultdict(list)

    for root, dirs, files in os.walk(dir_path, topdown=False):
        for file_str in files:
            if file_str == "results.csv":  # skip a previously generated output
                continue
            path_dir = Path(root)
            path_file = path_dir / file_str
            print(path_file)

            # Each result file is a pickled dict, e.g.:
            # {'model_weights': '',
            #  'training_time': 0.0032033920288085938,
            #  'datetime': datetime.datetime(2020, 3, 25, 0, 28, 34, 938400),
            #  'train_score': 1.0,
            #  'dev_score': 0.978021978021978,
            #  'test_score': 0.9736842105263158,
            #  'train_score_base': 1.0,
            #  'dev_score_base': 0.978021978021978,
            #  'test_score_base': 0.9736842105263158,
            #  'score_metric': 'accuracy_score',
            #  'base_score_metric': 'accuracy_score',
            #  'coherence': 0.9892031711775613,
            #  'correlation': 0.9510700193340448}
            try:
                with open(path_file, 'rb') as pickle_file:
                    obj_results = pickle.load(pickle_file)
            except Exception:
                # Skip unreadable files rather than reusing a stale obj_results.
                print(f"problem loading pickle file {path_file}")
                continue

            # Run metadata is encoded in the directory path:
            # parts[-1] = forest size, parts[-3] = seed,
            # parts[-5] = experiment id, parts[-6] = dataset.
            path_dir_split = path_dir.parts
            bool_wo_weights = "no_weights" in str(path_file)
            if bool_wo_weights:
                forest_size = int(path_dir_split[-1].split("_")[0])
            else:
                forest_size = int(path_dir_split[-1])
            seed = int(path_dir_split[-3])
            id_xp = str(path_dir_split[-5])
            dataset = str(path_dir_split[-6])

            dct_results["forest_size"].append(forest_size)
            dct_results["seed"].append(seed)
            dct_results["dataset"].append(dct_dataset_fancy[dataset])
            dct_results["subset"].append(dct_experiment_id_subset[id_xp])
            dct_results["strategy"].append(dct_experiment_id_technique[id_xp])
            dct_results["wo_weights"].append(bool_wo_weights)

            for key_result, val_result in obj_results.items():
                if key_result in skip_attributes:
                    continue
                if key_result == "model_weights":
                    if val_result == "":
                        dct_results["negative-percentage"].append(None)
                    else:
                        # Fraction of strictly negative weights among the
                        # non-zero weights of the pruned forest.
                        nb_lt_zero = np.sum(val_result < 0)
                        nb_gt_zero = np.sum(val_result > 0)
                        percentage_lt_zero = nb_lt_zero / (nb_gt_zero + nb_lt_zero)
                        dct_results["negative-percentage"].append(percentage_lt_zero)
                if val_result == "":  # empty string marks a missing value
                    val_result = None
                if key_result == "coherence" and val_result is None:
                    set_no_coherence.add(id_xp)
                if key_result == "correlation" and val_result is None:
                    set_no_corr.add(id_xp)
                dct_results[key_result].append(val_result)

    print("coh", set_no_coherence, len(set_no_coherence))
    print("cor", set_no_corr, len(set_no_corr))

    final_df = pd.DataFrame.from_dict(dct_results)
    final_df.to_csv(output_dir_file)
    print(final_df)