From c57e67a323a26a1dd29dd82c9e6abb37d2be0db0 Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@lis-lab.fr>
Date: Wed, 22 Mar 2023 12:07:09 +0100
Subject: [PATCH] add official evaluation scripts

---
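Notes on the expected input format: both scripts read semicolon-separated
"identifier;answer" pairs, one per line, and the QA script additionally
splits the answer field on "|" so that a question can carry several correct
answers. A minimal sketch of the two file layouts (the identifiers and
answers below are made up for illustration, not taken from the dataset):

    references_classification.txt / sample_classification.txt
        q001;2
        q002;5

    references_qa.txt / sample_qa.txt
        q001;a|c
        q002;b
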
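As a quick sanity check of the QA metrics, a worked example in a Python REPL
(assuming the two helper functions from EvaluationQA.py below have been
defined; the answer letters are again illustrative):

    >>> compute_accuracy_hamming(["a", "c"], ["a", "b"])  # |{a}| / |{a, b, c}|
    0.3333333333333333
    >>> compute_accuracy_exact_match([["a", "c"]], [["a", "b"]])
    0.0
    >>> compute_accuracy_exact_match([["a", "b"]], [["b", "a"]])  # order does not matter
    1.0
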
 eval_scripts/EvaluationClassification.py | 68 +++++++++++++++++++++++
 eval_scripts/EvaluationQA.py             | 72 ++++++++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 eval_scripts/EvaluationClassification.py
 create mode 100644 eval_scripts/EvaluationQA.py

diff --git a/eval_scripts/EvaluationClassification.py b/eval_scripts/EvaluationClassification.py
new file mode 100644
index 0000000..19c65d5
--- /dev/null
+++ b/eval_scripts/EvaluationClassification.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Recipe for the evaluation of the classification system of FrenchMedMCQA.
+
+Each line of the reference and prediction files is expected to be formatted
+as "<identifier>;<answer>".
+
+> Run the evaluation script:
+    > python EvaluationClassification.py --references="./references_classification.txt" --predictions="./sample_classification.txt"
+
+Authors
+ * Yanis LABRAK 2023
+"""
+
+import sys
+import argparse
+
+from sklearn.metrics import classification_report, f1_score, accuracy_score
+
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-r", "--references", default="./references_classification.txt", help="Reference file")
+parser.add_argument("-p", "--predictions", default="./sample_classification.txt", help="Predictions file")
+args = vars(parser.parse_args())
+
+class SystemColors:
+    FAIL = '\033[91m'
+    OKGREEN = '\033[92m'
+    ENDC = '\033[0m'
+
+with open(args["references"], "r") as f_refs:
+    pairs_refs = [l.split(";") for l in f_refs.read().split("\n") if len(l) > 0]
+pairs_refs = {p[0]: p[1] for p in pairs_refs}
+
+with open(args["predictions"], "r") as f_preds:
+    pairs_preds = [l.split(";") for l in f_preds.read().split("\n") if len(l) > 0]
+pairs_preds = {p[0]: p[1] for p in pairs_preds}
+
+# Check that both files contain the same number of identifiers
+if len(pairs_refs) != len(pairs_preds):
+    print(f"{SystemColors.FAIL} The number of predictions doesn't match the number of references! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Check that every reference identifier is present in the predictions
+if not all(k in pairs_preds for k in pairs_refs):
+    print(f"{SystemColors.FAIL} A required identifier is missing! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Align references and predictions on the reference identifiers
+refs = [pairs_refs[k] for k in pairs_refs]
+preds = [pairs_preds[k] for k in pairs_refs]
+
+cr = classification_report(
+    refs,
+    preds,
+    digits=4,
+    labels=["1", "2", "3", "4", "5"],
+    zero_division=0,
+    target_names=["1", "2", "3", "4", "5"],
+)
+
+accuracy = accuracy_score(refs, preds)
+f1_macro = f1_score(refs, preds, average='macro')
+
+print("#" * 60)
+print(cr)
+print("#" * 60)
+print(f"Accuracy: {SystemColors.OKGREEN} {accuracy * 100:.2f} {SystemColors.ENDC}")
+print(f"Macro F1-Score: {SystemColors.OKGREEN} {f1_macro * 100:.2f} {SystemColors.ENDC}")
+print("#" * 60)
diff --git a/eval_scripts/EvaluationQA.py b/eval_scripts/EvaluationQA.py
new file mode 100644
index 0000000..733897f
--- /dev/null
+++ b/eval_scripts/EvaluationQA.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""Recipe for the evaluation of the question answering system of FrenchMedMCQA.
+
+Each line of the reference and prediction files is expected to be formatted
+as "<identifier>;<answer_1|answer_2|...>".
+
+> Run the evaluation script:
+    > python EvaluationQA.py --references="./references_qa.txt" --predictions="./sample_qa.txt"
+
+Authors
+ * Yanis LABRAK 2023
+"""
+
+import sys
+import argparse
+
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-r", "--references", default="./references_qa.txt", help="Reference file")
+parser.add_argument("-p", "--predictions", default="./sample_qa.txt", help="Predictions file")
+args = vars(parser.parse_args())
+
+class SystemColors:
+    FAIL = '\033[91m'
+    OKGREEN = '\033[92m'
+    ENDC = '\033[0m'
+
+with open(args["references"], "r") as f_refs:
+    pairs_refs = [l.split(";") for l in f_refs.read().split("\n") if len(l) > 0]
+pairs_refs = {p[0]: p[1].split("|") for p in pairs_refs}
+
+with open(args["predictions"], "r") as f_preds:
+    pairs_preds = [l.split(";") for l in f_preds.read().split("\n") if len(l) > 0]
+pairs_preds = {p[0]: p[1].split("|") for p in pairs_preds}
+
+# Check that both files contain the same number of identifiers
+if len(pairs_refs) != len(pairs_preds):
+    print(f"{SystemColors.FAIL} The number of predictions doesn't match the number of references! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Check that every reference identifier is present in the predictions
+if not all(k in pairs_preds for k in pairs_refs):
+    print(f"{SystemColors.FAIL} A required identifier is missing! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Align references and predictions on the reference identifiers
+refs = [pairs_refs[k] for k in pairs_refs]
+preds = [pairs_preds[k] for k in pairs_refs]
+
+def compute_accuracy_exact_match(preds, refs):
+    # A sample counts as correct only if the predicted answer set
+    # matches the reference answer set exactly, regardless of order
+    exact_score = []
+    for p, r in zip(preds, refs):
+        exact_score.append(sorted(p) == sorted(r))
+    return sum(exact_score) / len(exact_score)
+
+def compute_accuracy_hamming(preds, refs):
+    # Per-sample Hamming score: size of the intersection of the predicted
+    # and reference answer sets divided by the size of their union (Jaccard)
+    intersection = len(set(preds) & set(refs))
+    union = len(set(preds) | set(refs))
+    return intersection / union
+
+hamming_scores = [compute_accuracy_hamming(p, r) for p, r in zip(preds, refs)]
+hamming_score = sum(hamming_scores) / len(hamming_scores)
+
+exact_match = compute_accuracy_exact_match(preds, refs)
+
+print("#" * 60)
+print(f"Hamming Score: {SystemColors.OKGREEN} {hamming_score:.4f} {SystemColors.ENDC}")
+print(f"Exact Match Ratio: {SystemColors.OKGREEN} {exact_match:.4f} {SystemColors.ENDC}")
+print("#" * 60)
-- 
GitLab