From c57e67a323a26a1dd29dd82c9e6abb37d2be0db0 Mon Sep 17 00:00:00 2001
From: Benoit Favre <benoit.favre@lis-lab.fr>
Date: Wed, 22 Mar 2023 12:07:09 +0100
Subject: [PATCH] add official evaluation scripts

---
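Notes on the expected input format: both scripts read semicolon-separated
"identifier;answer" pairs, one per line, and the QA script additionally
splits the answer field on "|" so that a question can carry several correct
answers. A minimal sketch of the two file layouts (the identifiers and
answers below are made up for illustration, not taken from the dataset):

    references_classification.txt / sample_classification.txt
        q001;2
        q002;5

    references_qa.txt / sample_qa.txt
        q001;a|c
        q002;b
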
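As a quick sanity check of the QA metrics, a worked example in a Python REPL
(assuming the two helper functions from EvaluationQA.py below have been
defined; the answer letters are again illustrative):

    >>> compute_accuracy_hamming(["a", "c"], ["a", "b"])  # |{a}| / |{a, b, c}|
    0.3333333333333333
    >>> compute_accuracy_exact_match([["a", "c"]], [["a", "b"]])
    0.0
    >>> compute_accuracy_exact_match([["a", "b"]], [["b", "a"]])  # order does not matter
    1.0
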
 eval_scripts/EvaluationClassification.py | 68 +++++++++++++++++++++++
 eval_scripts/EvaluationQA.py             | 72 ++++++++++++++++++++++++
 2 files changed, 140 insertions(+)
 create mode 100644 eval_scripts/EvaluationClassification.py
 create mode 100644 eval_scripts/EvaluationQA.py

diff --git a/eval_scripts/EvaluationClassification.py b/eval_scripts/EvaluationClassification.py
new file mode 100644
index 0000000..19c65d5
--- /dev/null
+++ b/eval_scripts/EvaluationClassification.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python3
+"""Recipe for the evaluation of the classification system of FrenchMedMCQA.
+
+Each line of the reference and prediction files is expected to be formatted
+as "<identifier>;<answer>".
+
+> Run the evaluation script:
+    > python EvaluationClassification.py --references="./references_classification.txt" --predictions="./sample_classification.txt"
+
+Authors
+ * Yanis LABRAK 2023
+"""
+
+import sys
+import argparse
+
+from sklearn.metrics import classification_report, f1_score, accuracy_score
+
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-r", "--references", default="./references_classification.txt", help="Reference file")
+parser.add_argument("-p", "--predictions", default="./sample_classification.txt", help="Predictions file")
+args = vars(parser.parse_args())
+
+class SystemColors:
+    FAIL = '\033[91m'
+    OKGREEN = '\033[92m'
+    ENDC = '\033[0m'
+
+with open(args["references"], "r") as f_refs:
+    pairs_refs = [l.split(";") for l in f_refs.read().split("\n") if len(l) > 0]
+pairs_refs = {p[0]: p[1] for p in pairs_refs}
+
+with open(args["predictions"], "r") as f_preds:
+    pairs_preds = [l.split(";") for l in f_preds.read().split("\n") if len(l) > 0]
+pairs_preds = {p[0]: p[1] for p in pairs_preds}
+
+# Check that both files contain the same number of identifiers
+if len(pairs_refs) != len(pairs_preds):
+    print(f"{SystemColors.FAIL} The number of predictions doesn't match the number of references! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Check that every reference identifier is present in the predictions
+if not all(k in pairs_preds for k in pairs_refs):
+    print(f"{SystemColors.FAIL} A required identifier is missing! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Align references and predictions on the reference identifiers
+refs = [pairs_refs[k] for k in pairs_refs]
+preds = [pairs_preds[k] for k in pairs_refs]
+
+cr = classification_report(
+    refs,
+    preds,
+    digits=4,
+    labels=["1", "2", "3", "4", "5"],
+    zero_division=0,
+    target_names=["1", "2", "3", "4", "5"],
+)
+
+accuracy = accuracy_score(refs, preds)
+f1_macro = f1_score(refs, preds, average='macro')
+
+print("#" * 60)
+print(cr)
+print("#" * 60)
+print(f"Accuracy: {SystemColors.OKGREEN} {accuracy * 100:.2f} {SystemColors.ENDC}")
+print(f"Macro F1-Score: {SystemColors.OKGREEN} {f1_macro * 100:.2f} {SystemColors.ENDC}")
+print("#" * 60)
diff --git a/eval_scripts/EvaluationQA.py b/eval_scripts/EvaluationQA.py
new file mode 100644
index 0000000..733897f
--- /dev/null
+++ b/eval_scripts/EvaluationQA.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python3
+"""Recipe for the evaluation of the question answering system of FrenchMedMCQA.
+
+Each line of the reference and prediction files is expected to be formatted
+as "<identifier>;<answer_1|answer_2|...>".
+
+> Run the evaluation script:
+    > python EvaluationQA.py --references="./references_qa.txt" --predictions="./sample_qa.txt"
+
+Authors
+ * Yanis LABRAK 2023
+"""
+
+import sys
+import argparse
+
+parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter)
+parser.add_argument("-r", "--references", default="./references_qa.txt", help="Reference file")
+parser.add_argument("-p", "--predictions", default="./sample_qa.txt", help="Predictions file")
+args = vars(parser.parse_args())
+
+class SystemColors:
+    FAIL = '\033[91m'
+    OKGREEN = '\033[92m'
+    ENDC = '\033[0m'
+
+with open(args["references"], "r") as f_refs:
+    pairs_refs = [l.split(";") for l in f_refs.read().split("\n") if len(l) > 0]
+pairs_refs = {p[0]: p[1].split("|") for p in pairs_refs}
+
+with open(args["predictions"], "r") as f_preds:
+    pairs_preds = [l.split(";") for l in f_preds.read().split("\n") if len(l) > 0]
+pairs_preds = {p[0]: p[1].split("|") for p in pairs_preds}
+
+# Check that both files contain the same number of identifiers
+if len(pairs_refs) != len(pairs_preds):
+    print(f"{SystemColors.FAIL} The number of predictions doesn't match the number of references! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Check that every reference identifier is present in the predictions
+if not all(k in pairs_preds for k in pairs_refs):
+    print(f"{SystemColors.FAIL} A required identifier is missing! {SystemColors.ENDC}")
+    sys.exit(1)
+
+# Align references and predictions on the reference identifiers
+refs = [pairs_refs[k] for k in pairs_refs]
+preds = [pairs_preds[k] for k in pairs_refs]
+
+def compute_accuracy_exact_match(preds, refs):
+    # A sample counts as correct only if the predicted answer set
+    # matches the reference answer set exactly, regardless of order
+    exact_score = []
+    for p, r in zip(preds, refs):
+        exact_score.append(sorted(p) == sorted(r))
+    return sum(exact_score) / len(exact_score)
+
+def compute_accuracy_hamming(preds, refs):
+    # Per-sample Hamming score: size of the intersection of the predicted
+    # and reference answer sets divided by the size of their union (Jaccard)
+    intersection = len(set(preds) & set(refs))
+    union = len(set(preds) | set(refs))
+    return intersection / union
+
+hamming_scores = [compute_accuracy_hamming(p, r) for p, r in zip(preds, refs)]
+hamming_score = sum(hamming_scores) / len(hamming_scores)
+
+exact_match = compute_accuracy_exact_match(preds, refs)
+
+print("#" * 60)
+print(f"Hamming Score: {SystemColors.OKGREEN} {hamming_score:.4f} {SystemColors.ENDC}")
+print(f"Exact Match Ratio: {SystemColors.OKGREEN} {exact_match:.4f} {SystemColors.ENDC}")
+print("#" * 60)
-- 
GitLab