diff --git a/README.md b/README.md index 6c6b963bb50465ccee444ff9d097e2d3c6896d81..eb822822ecc01c33b79147b00740bd009aa0dd8d 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ The JSON object contains the following fields: -- `lu`: The linguistic unit (LU) in the sentence. +- `lu`: The lexical unit (LU)/trigger in the sentence. - `pos_lu`: The part of speech tag of the LU. (**corresponding to *f<sub>trigger</sub>***). - `lemma_lu`: The lemma or root form of the LU. - `frame`: The semantic frame associated with the LU. @@ -21,6 +21,7 @@ The JSON object contains the following fields: - `predictions`: A dictionary containing model predictions and corresponding ROUGE-L scores. Each model has an entry with: - `answer_pred`: The predicted answer by the model. - `rougeL`: The ROUGE-L score of the prediction. + - `HScore`: The HScore of the prediction as computed in the paper: a human annotation of "Correct" → 1, "Partiellement correct" → 0.5, and 0 otherwise. In the case of multiple annotations for the same question, the average HScore over all annotations is taken. - `human_annot`: A dictionary containing human annotations for each model's output. Each model has an entry which is a list of annotations: - `annot`: The annotation identifier. @@ -32,7 +33,7 @@ The JSON object contains the following fields: - `entropy_frame`: Entropy of the question's frame, common to all the examples of this frame. (**corresponding to *f<sub>entropy</sub>***). - `complexity_vector` : Each element corresponds to a complexity factor, 1 if it's "active" and the example therefore corresponds to the difficult group, 0 otherwise. 
Indexes correspond to the following complexity factors: - `0`: ***f<sub>LU in q</sub>*** - - `1`: ***f<sub>trigger/sub>*** + - `1`: ***f<sub>trigger</sub>*** - `2`: ***f<sub>dist</sub>*** - `3`: ***f<sub>entropy</sub>*** - `4`: ***f<sub>nb FEs</sub>*** diff --git a/calor_complexity.json b/calor_complexity.json index 7ae6330c59a3f8476f04fca061f62b3bd9e09508..553afe6433bff22349b30b42b7dd5f6008889475 100644 --- a/calor_complexity.json +++ b/calor_complexity.json @@ -26,33 +26,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "état souverain et indépendant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "état souverain et indépendant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "souverain et indépendant", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "souverain et indépendant", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "république", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "un état souverain et indépendant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un état souverain et indépendant, puis une République", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -86,12 +93,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -154,42 +155,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Calme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Calme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Baonnettes plantées aux fusils par 
quelques sections obstinées, clairons qui sonnent la charge, bonds suprêmes d' isolés héroques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Calme affecté", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "calme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "calme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le calme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -282,33 +284,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "projectiles explosibles ouchargées de matières fulminantes ou inflammables", - "rougeL": 0.8749999999999999 + "rougeL": 0.8749999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les projectiles explosibles ou chargés de matières fulminantes ou inflammables, d'un poids inférieur à 400 
grammes.", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -330,12 +339,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -410,33 +413,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "passer à l' insurrection armée.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sur la proposition de passer à l'insurrection armée", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le parti bolchevique décida de passer à l'insurrection armée.", - "rougeL": 0.46153846153846156 + "rougeL": 0.46153846153846156, + "HScore": 1.0 } }, "human_annot": { @@ -464,12 +474,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -532,33 +536,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "Camembert_baseline": { "answer_pred": "passer à l' insurrection armée.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "passer à l'insurrection armée", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "passer à l' insurrection armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "passer à l'insurrection armée", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 } }, "human_annot": { @@ -598,12 +609,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -660,33 +665,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Italie,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Italie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -708,12 +720,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -788,33 +794,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -836,12 +849,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -910,33 +917,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "preparation générale insuffisante de toute notre armée", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -970,12 +984,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -1038,33 +1046,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la paix revient.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -1098,12 +1113,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -1160,33 +1169,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la paix revient.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -1220,12 +1236,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -1288,33 +1298,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "éviter l' affrontement", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "éviter l' affrontement", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "éviter l' affrontement en mettant fin à la grève", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mettant fin à la grève le 14 novembre 1918", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "d'éviter l'affrontement en mettant fin à la grève le 14 novembre 1918", - "rougeL": 0.7368421052631577 + "rougeL": 0.7368421052631577, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mettant fin à la grève", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le comité d'Olten décidera de mettre fin à la grève", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 0.5 } }, "human_annot": { @@ -1342,12 +1359,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -1428,33 +1439,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Paul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Paul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un journal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Paul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "un journal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un journal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le journal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -1464,12 +1482,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -1556,33 +1568,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son indépendance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son indépendance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son indépendance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' indépendance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "son indépendance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "son indépendance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Belgique a perdu son indépendance.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -1616,12 +1635,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -1696,33 +1709,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Albert Barbet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"Albert Barbet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Albert Barbet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Albert Barbet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Albert Barbet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Albert Barbet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Albert Barbet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -1756,12 +1776,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -1824,33 +1838,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la seule issue possible des conflits qui mettent en péril l' existence des Etats, leur liberté, leurs intérêts vitaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'horreur", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -1866,12 +1887,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -1946,33 +1961,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "horreur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'horreur qu'elle inspire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -2012,12 +2034,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -2086,33 +2102,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le bulletin d' information de l' Etat - major suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bulletin d' information de l' Etat - major suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bulletin d' information de l' Etat - major suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bulletin d' information de l' Etat - major suisse,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bulletin d' information de l' Etat - major suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le bulletin d' information de l' Etat - major suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le bulletin d'information de l'Etat-major suisse.", - "rougeL": 0.30769230769230765 + "rougeL": 
0.30769230769230765, + "HScore": 1.0 } }, "human_annot": { @@ -2158,12 +2181,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -2214,33 +2231,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "groupes financiers importants d' Europe", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les groupes financiers importants d'Europe", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -2268,12 +2292,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -2337,33 +2355,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -2397,12 +2422,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -2466,33 +2485,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les Russes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les Russes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Allemands", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Russes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Russes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Russes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Allemands.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -2538,12 +2564,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -2595,33 +2615,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "réconciliation humaine et de recherche du bonheur social", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la liberté", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'harmonie entre les peuples", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -2649,12 +2676,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -2717,33 +2738,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "social", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "social", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "social", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -2789,12 +2817,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -2845,33 
+2867,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' auteur de l' attentat,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -2911,12 +2940,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -2973,33 +2996,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chacun des marchés de capitaux s' occupant ou étant susceptible de s' occuper des autres, d' avoir une influence sur eux, de leur rendre des services ou de leur créer des difficultés", - "rougeL": 0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "marchés de capitaux", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chacun des marchés de capitaux s' occuper ou étant susceptible de s' occuper des autres, d' avoir une influence sur eux, de leur rendre des services ou de leur créer", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chacun des marchés de capitaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les affaires financières", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les marchés de capitaux", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les événements de 1911", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -3009,12 +3039,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -3107,33 +3131,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Russie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Allemagne,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'Allemagne", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas d'information dans l'article concernant l'aide de l'Italie.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -3173,12 +3204,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -3229,33 +3254,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' 
Allemagne,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'Allemagne", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas de mention de l'Italie dans l'article, donc on ne peut pas répondre à la question posée.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -3271,12 +3303,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -3357,33 +3383,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Toute la génération", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Toute la génération", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Toute la génération", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Toute la génération", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Toute la génération", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Toute la génération", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la génération", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -3417,12 +3450,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -3485,33 +3512,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "général, le chef de l' Etat - major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "Conseil fédéral", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le général, le chef de l' Etat - major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le général, le chef de l' Etat - major,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Chambres fédérales", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le général, le chef de l' Etat - major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Chambres fédérales", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -3545,12 +3579,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -3619,33 +3647,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "300 moteurs par mois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des camions, des moteurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -3691,12 +3726,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -3741,33 +3770,40 @@ } 
], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des camions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -3795,12 +3831,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -3869,33 +3899,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "anglais et écossais", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "anglais et écossais", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "anglais et écossais", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "anglais et écossais", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "anglais et écossais", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "anglais et écossais", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "protestants anglais et écossais", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -3935,12 +3972,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -3991,33 +4022,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "protestants anglais et écossais", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "protestants anglais et écossais", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "protestants anglais et écossais", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "protestants anglais et écossais", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les protestants anglais et écossais", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "protestants anglais et écossais", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "protestants anglais et écossais", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -4057,12 +4095,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -4119,33 +4151,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": 
"Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Armée allemande de la Meuse", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -4185,12 +4224,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -4243,33 +4276,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' Armée allemande de la Meuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'Armée allemande de la Meuse", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "celui qui voulait nous attaquer", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "l'Armée allemande de la Meuse", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -4297,12 +4337,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -4373,33 +4407,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en vain l' ennemi", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "ennemi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ennemi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' ennemi.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'ennemi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' ennemi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'ennemi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -4433,12 +4474,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -4501,33 +4536,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la guerre Au Reichstag", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le SPD soutient la guerre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -4561,12 +4603,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -4623,33 +4659,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "guerre Au Reichstag", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "soutien à la guerre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 } }, "human_annot": { @@ -4689,12 +4732,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -4751,33 +4788,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -4817,12 +4861,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -4874,33 +4912,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -4940,12 +4985,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -5003,33 +5042,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "En 1918, la classe ouvrière a perdu une bataille, mais remporté une victoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "En 1918, la classe ouvrière a perdu une bataille, mais remporté une victoire. La bataille a été courte, la victoire durable", - "rougeL": 0.761904761904762 + "rougeL": 0.761904761904762, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "En 1918, la classe ouvrière a perdu une bataille, mais remporté une victoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "En 1918, la classe ouvrière a perdu une bataille, mais remporté une victoire.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "\"En 1918, la classe ouvrière a perdu une bataille, mais remporté une victoire. 
La bataille a été courte, la victoire durable \".", - "rougeL": 0.761904761904762 + "rougeL": 0.761904761904762, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "En 1918, la classe ouvrière a perdu une bataille, mais remporté une victoire. La bataille a été courte, la victoire durable", - "rougeL": 0.761904761904762 + "rougeL": 0.761904761904762, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "\"En 1918, la classe ouvrière a perdu une bataille, mais remporté une victoire.\"", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -5045,12 +5091,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -5131,42 +5171,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cette bipolarisation de l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bipolarisation de l' Europe", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cette bipolarisation de l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de l' Europe", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la Triple Alliance (Allemagne, Autriche - Hongrie, Italie) -la Triple Entente (France, Russie, Royaume-Uni)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la Triple Alliance (Allemagne, Autriche - Hongrie, Italie) -la Triple Entente (France, Russie, Royaume-Uni)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la bipolarisation de l' Europe", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur 
acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -5253,33 +5294,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bipolarisation de l' Europe", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bipolarisation de l' Europe", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cette bipolarisation de l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bipolarisation de l' Europe", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bipolarisation de l' Europe", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bipolarisation de l' Europe", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la bipolarisation de l'Europe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -5301,12 +5349,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -5393,33 +5435,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "qu' on m' envoie illico sur le front et dans le régiment de mon beau-frère, le caporal Paul Delroze", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à titre d' athlète plus que complet et de lauréat de toutes les sociétés de gymnastique et de préparation militaire, je désire qu' on m' envoie illico sur le front et dans le", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "on m' envoie illico sur le front et dans le régiment de mon beau-frère, le caporal Paul Delroze", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "qu' on m' envoie illico sur le front et dans le régiment de mon beau-frère,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "qu'on m'envoie illico sur le front et dans le régiment de mon beau-frère, le caporal Paul Delroze.", - "rougeL": 0.5185185185185185 + "rougeL": 0.5185185185185185, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "qu' on m' envoie illico sur le front et dans le régiment de mon beau-frère, le caporal Paul Delroze.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "qu'on m'envoie illico sur le front et dans le régiment+ de mon beau-frère, le caporal Paul Delroze.", - "rougeL": 0.5185185185185185 + "rougeL": 0.5185185185185185, + "HScore": 1.0 } }, "human_annot": { @@ -5441,12 +5490,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -5518,33 +5561,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Au XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Au XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "XVe siècle, XVIIIe et XIXe", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au XVIIIe et XIXe siècle", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -5572,12 +5622,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -5640,33 +5684,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Au XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "XVe siècle, XVIIIe et XIXe", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "XVIIIe et XIXe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Au XVIIIe et XIXe siècle", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -5706,12 +5757,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -5798,33 +5843,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": 
"l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Europe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -5840,12 +5892,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -5926,33 +5972,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "29 juin 1914", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.6 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "29 juin 1914", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.6 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "29 juin 1914", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.6 }, "Camembert_baseline": { "answer_pred": "sur-le-champ.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.6 }, "llama-2_lora": { "answer_pred": "sur-le-champ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.6 }, "mixtral-8x7b": { "answer_pred": "sur-le-champ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.6 }, "GPT-3.5": { "answer_pred": "fut arrêté sur-le-champ", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.55 } }, "human_annot": { @@ -5998,7 +6051,7 @@ "rating": "Correct" } ], - "llama-2-70b": [ + "llama-2_lora": [ { "annot": "annot_1", "rating": "Correct" @@ -6040,7 +6093,7 @@ "rating": "Correct" } ], - "llama-2_lora": [ + "mixtral-8x7b": [ { "annot": "annot_1", "rating": "Correct" @@ -6082,7 +6135,7 @@ "rating": "Correct" } ], - "mixtral-8x7b": [ + "ground_truth": [ { "annot": "annot_1", "rating": "Correct" @@ -6124,7 +6177,7 @@ "rating": "Correct" } ], - "ground_truth": [ + "GPT-3.5": [ { "annot": "annot_1", "rating": "Correct" @@ -6143,7 +6196,7 @@ }, { "annot": "annot_5", - "rating": "Partiellement correct" + "rating": "Erreur acceptable (\"humaine\")" }, { "annot": "annot_6", @@ 
-6166,91 +6219,49 @@ "rating": "Correct" } ], - "GPT-3.5": [ + "FLAN-T5-large": [ { "annot": "annot_1", "rating": "Correct" }, { "annot": "annot_2", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_3", - "rating": "Partiellement correct" + "rating": "Correct" }, { "annot": "annot_4", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" + "rating": "Correct" }, { "annot": "annot_6", - "rating": "Correct" + "rating": "Erreur acceptable (\"humaine\")" }, { "annot": "annot_7", - "rating": "Correct" + "rating": "Erreur acceptable (\"humaine\")" }, { "annot": "annot_8", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_9", - "rating": "Correct" + "rating": "Erreur acceptable (\"humaine\")" }, { "annot": "annot_10", - "rating": "Correct" + "rating": "Erreur acceptable (\"humaine\")" } ], - "FLAN-T5-large": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], - "MT5-large": [ + "MT5-large": [ { "annot": "annot_1", "rating": "Correct" @@ -6396,33 +6407,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le corbeau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "corbeau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "ils mangeaient gravement en criant de temps en temps", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "corbeau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les corbeaux", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les corbeaux", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le corbeau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -6462,12 +6480,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -6526,15 +6538,15 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "vers les mauvaises terres", "rougeL": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers les mauvaises terres", "rougeL": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vers les mauvaises terres", "rougeL": 1.0 }, @@ -6606,33 +6618,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "groupes financiers importants d' Europe", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -6660,12 +6679,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -6729,33 +6742,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "groupes financiers importants d' Europe", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.55 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.55 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.55 }, "Camembert_baseline": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.55 }, "llama-2_lora": { "answer_pred": "M. 
Germain", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.05 }, "mixtral-8x7b": { "answer_pred": "tous les groupes financiers importants d' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.55 }, "GPT-3.5": { "answer_pred": "tous les groupes financiers importants d' Europe.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.55 } }, "human_annot": { @@ -6927,48 +6947,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Partiellement correct" - }, - { - "annot": "annot_3", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Partiellement correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Partiellement correct" - }, - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -7137,16 +7115,6 @@ "rating": "Erreur inacceptable" } ], - "question": [ - { - "annot": "annot_5", - "rating": "Question douteuse" - }, - { - "annot": "annot_9", - "rating": "Question douteuse" - } - ], "no_answer": [ { "annot": "annot_8", @@ -7198,33 +7166,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "artilleurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cathédrale", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "artilleurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "artilleurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les artilleurs allemands", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les artilleurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les artilleurs allemands.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -7252,12 +7227,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -7323,33 +7292,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "allemands", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "allemands", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "artilleurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "allemands", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemands", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "allemands", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les artilleurs allemands.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -7377,12 +7353,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -7472,33 +7442,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa pensée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sa pensée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sa pensée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "sa pensée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, 
"llama-2_lora": { "answer_pred": "la manière de penser du général", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "sa pensée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "\"sa pensée\" est exprimée de manière directe et incisive.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -7538,12 +7515,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -7602,33 +7573,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la polémique éclate entre les deux régions linguistiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la polémique éclate entre les deux régions linguistiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la polémique éclate entre les deux régions linguistiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la polémique éclate entre les deux régions linguistiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la polémique éclate entre les deux régions linguistiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la polémique éclate entre les deux régions linguistiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la polémique éclate entre les deux régions linguistiques.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -7662,12 +7640,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -7730,33 +7702,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -7790,12 +7769,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -7852,33 +7825,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -7906,12 +7886,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -7974,33 +7948,40 @@ } 
], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -8034,12 +8015,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -8102,33 +8077,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une province allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "province allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une province allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une province allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une province allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une province allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une province allemande.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -8156,12 +8138,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -8227,33 +8203,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "allemande", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "allemande", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "allemande", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Allemagne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemagne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Allemagne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Allemagne.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -8263,12 +8246,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -8358,33 +8335,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "20 % de socialistes siégeront dès 1919", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "20 % de socialistes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "Le Conseil national.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 } }, "human_annot": { @@ -8424,12 +8408,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -8480,33 +8458,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Conseil national", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -8540,12 +8525,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -8608,33 +8587,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le doux soleil d' avril,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le doux soleil d'avril", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -8674,12 +8660,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -8730,33 +8710,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le doux soleil d' avril,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "doux soleil d' avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le doux soleil d'avril", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -8796,12 +8783,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -8858,33 +8839,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "Les magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "soldats allemands", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Les assassins", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -8906,12 +8894,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -8981,33 +8963,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "soldats allemands", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "magistrats et les fonctionnaires de la police judiciaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les magistrats et les fonctionnaires de la police judiciaire.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -9047,12 +9036,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -9110,33 +9093,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "réconciliation humaine et de recherche du bonheur social", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "de la liberté", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -9176,12 +9166,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -9232,33 +9216,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "social", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "social", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bonheur 
social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le bonheur social", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -9280,12 +9271,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -9360,33 +9345,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 } }, "human_annot": { @@ -9426,12 +9418,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -9482,33 +9468,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pedraic 
Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 } }, "human_annot": { @@ -9548,12 +9541,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -9610,33 +9597,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "président 
du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -9670,12 +9664,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -9732,33 +9720,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nommé président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pedraic Pearse est nommé président du gouvernement provisoire et commandant en chef des forces républicaines.", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 1.0 } }, "human_annot": { @@ -9798,12 +9793,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -9854,33 +9843,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "président du gouvernement provisoire", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "président du gouvernement provisoire et commandant en chef des forces républicaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pedraic Pearse est nommé président du gouvernement provisoire et commandant en chef des forces républicaines.", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 1.0 } }, "human_annot": { @@ -9908,12 +9904,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -9982,33 +9972,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "la Suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Suisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -10036,12 +10033,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -10111,33 +10102,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les principes de justice gouvernent la conscience publique.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -10165,12 +10163,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -10233,33 +10225,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "conscience publique", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la conscience publique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -10299,12 +10298,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -10361,33 +10354,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -10415,12 +10415,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -10483,33 +10477,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "en 
Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Ulster", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -10525,12 +10526,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -10623,33 +10618,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "germanophilie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "germanophilie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "germanophilie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Ces trois affaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les trois affaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "ces trois affaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les trois affaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -10695,12 +10697,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -10758,42 +10754,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' affaire des colonels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' affaire des colonels : à la fin de l' année 1915, le 
bruit se répand rapidement que deux hauts officiers de l' Etat - major se sont rendus coupables de trahison", - "rougeL": 0.2727272727272727 + "rougeL": 0.2727272727272727, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' affaire Hoffmann : Le conseiller fédéral Arthur Hoffmann et Robert Grimm (socialiste bernois et conseiller national) s' entremettent en 1917 pour obtenir l' adhésion de la Russie ", - "rougeL": 0.1212121212121212 + "rougeL": 0.1212121212121212, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "- L' affaire des colonels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "affaires vont affecter les relations entre les deux parties du pays ; en outre, elles vont affecter l' image de la neutralité de la Suisse à l' étranger.", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "La lettre au Conseil fédéral du 20 juillet 1915, L' affaire des colonels, L' affaire Hoffmann", - "rougeL": 0.375 + "rougeL": 0.375, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les affaires des colonels et l'affaire Hoffmann", - "rougeL": 0.36363636363636365 + "rougeL": 0.36363636363636365, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -10887,33 +10884,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "retrouver les traditions qui nous guident en toutes matières sauf dans nos rapports avec l' Irlande", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "retrouver les traditions qui nous guident en toutes matières sauf dans nos rapports avec l' Irlande", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "les traditions qui nous guident en toutes matières sauf dans nos rapports avec l' Irlande", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "retrouver les traditions qui nous guident en toutes matières sauf dans nos rapports avec l' Irlande", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Traditions qui nous guident en toutes matières sauf dans nos rapports avec l' Irlande", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "retrouver les traditions qui nous guident en toutes matières sauf dans nos rapports avec l' Irlande", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les traditions qui nous guident en toutes matières sauf dans nos rapports avec l' Irlande", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 } }, "human_annot": { @@ -10959,12 +10963,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -11015,33 +11013,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "munitions et les vivres à distribuer", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "munitions et les vivres à distribuer", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les munitions et les vivres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "les munitions et les vivres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "munitions et vivres", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "munitions et les vivres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les munitions et les vivres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -11087,12 +11092,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -11145,33 +11144,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -11211,12 +11217,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -11267,33 +11267,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "preparation générale insuffisante de toute notre armée", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la préparation générale insuffisante de toute notre armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -11333,12 +11340,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -11395,33 +11396,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "baïonnettes plantées aux fusils par quelques sections obstinées, clairons qui sonnent la charge, bonds suprêmes d' isolés héroïques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "officiers qui se font tuer debout, baonnettes plantées aux fusils par quelques sections obstinées, clairons qui sonnent la charge, bonds suprêmes d' isolés héro", - "rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "officiers qui se font tuer debout, baonnettes plantées aux fusils par quelques sections obstinées, clairons qui sonnent la charge, bonds suprêmes d' isolés héro", - 
"rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "baïonnettes plantées aux fusils par quelques sections obstinées, clairons qui sonnent la charge,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "tuer debout, baïonnettes plantées aux fusils par quelques sections obstinées, clairons qui sonnent la charge, bonds suprêmes d' isolés héroïques, rien n' y fait.", - "rougeL": 0.06451612903225806 + "rougeL": 0.06451612903225806, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "feu", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le feu", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -11443,12 +11451,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -11523,33 +11525,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Gallieni, à Joffre ou à Foch", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gallieni, à Joffre ou à Foch", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' Allemagne. 
Sans ces malentendus, non seulement la bataille aurait été gagnée, mais la guerre tout entière.", - "rougeL": 0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemands", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "contre l'Allemagne", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -11577,12 +11586,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -11642,33 +11645,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gallieni, à Joffre ou à Foch", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Allemagne", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 } }, "human_annot": { @@ -11708,12 +11718,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -11767,33 +11771,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des données sur les intentions militaires des Alliés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "données sur les intentions militaires des Alliés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le bulletin d'information de l'état-major suisse contient plusieurs données sur les intentions militaires des Alliés.", - "rougeL": 0.5555555555555556 + "rougeL": 0.5555555555555556, + "HScore": 1.0 } }, "human_annot": { @@ -11803,12 +11814,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -11889,33 +11894,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les intentions militaires des Alliés", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "intentions militaires des Alliés", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "intentions militaires des Alliés", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "les intentions 
militaires des Alliés", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "données sur les intentions militaires des Alliés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur les intentions militaires des Alliés.", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 } }, "human_annot": { @@ -11949,12 +11961,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -12035,33 +12041,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"ses enfants\".", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -12089,12 +12102,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -12175,78 +12182,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ses enfants", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "ses enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "llama-2_lora": { "answer_pred": "les enfants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.55 }, "mixtral-8x7b": { "answer_pred": "La voix de la patrie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.15 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"La voix de la patrie\".", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.05 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_3", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_4", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_7", - "rating": "Partiellement correct" - }, - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -12663,33 +12635,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Elle dénombrait avec fierté les pays qu' elle avait découverts et lancés dans le courant de la vie générale", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Europe,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "llama-2_lora": { "answer_pred": "l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Europe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -12735,12 +12714,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -12791,33 +12764,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "M. 
Asquith", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' ambassadeur d' Angleterre à Berlin", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'ambassadeur d'Angleterre", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -12839,12 +12819,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -12913,33 +12887,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' ambassadeur d' Angleterre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "M. Asquith", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' ambassadeur d' Angleterre à Berlin", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "M. 
Paul Cambon", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -12949,12 +12930,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -13041,33 +13016,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "répression", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "répression contre les bolcheviks", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "répression organisée contre les bolcheviks", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "répression organisée contre les bolcheviks", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mutineries dans la marine allemande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "répression organisée contre les bolcheviks", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la répression organisée contre les bolcheviks", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -13089,12 +13071,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -13163,42 +13139,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "répression", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "répression organisée contre les bolcheviks", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "répression organisée contre les bolcheviks", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "bolcheviks", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "mutineries dans la marine allemande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "répression organisée contre les bolcheviks en juillet", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la répression organisée contre les bolcheviks", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -13291,33 +13268,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -13357,12 +13341,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -13413,33 +13391,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de 1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -13467,12 +13452,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -13553,33 +13532,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "n' a pas l' air de le comprendre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il envisage, c' est de ne commencer à réunir les forces nécessaires à combattre le crime qu' au moment où il aura constaté que l' adversaire a déjà passé aux actes", - "rougeL": 0.08 + "rougeL": 0.08, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ne commencer à réunir les forces nécessaires à combattre le crime qu' au moment où il aura constaté que l' adversaire a déjà passé aux actes (...)", - "rougeL": 0.08333333333333333 + "rougeL": 0.08333333333333333, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' adversaire a déjà passé aux actes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "ne pas jusque-là", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "ne commencer à réunir les forces nécessaires à combattre le crime qu' au moment où il aura constaté que l' adversaire a déjà passé aux actes", - "rougeL": 0.08333333333333333 + "rougeL": 0.08333333333333333, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le gouvernement zurichois ne comprend pas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -13589,12 
+13575,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -13681,42 +13661,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' année 1914, sur l' ensemble de la guerre et sur la fin de la guerre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vous trouvez sur Cliotexte des textes sur le début de la guerre : l' année 1914, sur l' ensemble de la guerre et sur la fin de la guerre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "G. Pedroncini, 1917, les mutineries de l' armée française, coll. Archives Julliard - Gallimard, 1968", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "G. Pedroncini,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l' année 1914, sur l' ensemble de la guerre et sur la fin de la guerre, utiles aussi à ce chapitre.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Vous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les lecteurs/trices de Cliotexte.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -13809,33 +13790,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) 
posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -13881,12 +13869,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -13937,33 +13919,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "quelques exemplaires de l' original,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "exemplaires de l' original", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "quelques exemplaires de l'original", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + 
"HScore": 1.0 } }, "human_annot": { @@ -13997,12 +13986,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -14059,33 +14042,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' original", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' original", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' original", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' original,", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "soigneusement cachés par des Belges", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' original", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de l'original", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -14107,12 +14097,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -14205,33 +14189,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ils choisissaient d' abord les jeunes sans barbe sur les joues. 
Ils se mettaient en boule et ils commençaient à manger cette chair d' entre le nez et la bouche, puis le bord", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les jeunes sans barbe sur les joues", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "rats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les rats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les rats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les rats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -14265,12 +14256,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -14334,33 +14319,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": " Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -14400,12 +14392,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -14456,33 +14442,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Elizabeth I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -14522,12 +14515,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -14584,33 +14571,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -14620,12 +14614,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -14709,33 +14697,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -14763,12 +14758,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -14852,33 +14841,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Henry Floch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Henry Floch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Lucie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Henry Floch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Henry Floch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Henry Floch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Henry Floch et cinq de ses camarades (Durantet, Blanchard, Gay, Pettelet et Quinault)", - "rougeL": 0.45454545454545453 + "rougeL": 0.45454545454545453, + "HScore": 1.0 } }, "human_annot": { @@ -14906,12 +14902,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], 
"llama-2_lora": [ { "annot": "annot_6", @@ -14980,33 +14970,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "quelques exemplaires de l' original,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "exemplaires de l' original", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -15046,12 +15043,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -15102,33 +15093,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quelques exemplaires de l' original", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "quelques exemplaires de l' original,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "soigneusement cachés par des Belges", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, 
"mixtral-8x7b": { "answer_pred": "exemplaires de l' original", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les exemplaires de l'original", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 } }, "human_annot": { @@ -15138,12 +15136,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -15230,33 +15222,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Tous ces défauts provenaient de la préparation générale insuffisante de toute notre armée et devinrent particulièrement apparents au moment de notre première rencontre avec un adversaire plus habile que nous", - "rougeL": 0.17391304347826084 + "rougeL": 0.17391304347826084, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "reconnaissances défectueuses, de liaisons mal organisées, de mauvaises appréciations de la situation et de maints autres défauts de commandement et de manoeuvre", - "rougeL": 0.09999999999999999 + "rougeL": 0.09999999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "retours défectueuses, de liaisons mal organisées, de mauvaises appréciations de la situation et de maints autres défauts de commandement et de manoeuvre", - "rougeL": 0.10526315789473684 + "rougeL": 0.10526315789473684, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "mauvaises appréciations de la situation et de maints autres défauts de commandement et de manoeuvre", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Tous ces défauts provenaient de la préparation générale insuffisante de toute notre armée et devinrent particulièrement apparents au moment de notre première rencontre avec un adversaire plus habile que nous.", - "rougeL": 0.17391304347826084 + "rougeL": 0.17391304347826084, + "HScore": 
0.0 }, "mixtral-8x7b": { "answer_pred": "devinrent particulièrement apparents au moment de notre première rencontre avec un adversaire plus habile que nous", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les tristes résultats de reconnaissances défectueuses, de liaisons mal organisées, de mauvaises appréciations de la situation et de maints autres défauts de commandement et de manoeuvre.", - "rougeL": 0.08695652173913042 + "rougeL": 0.08695652173913042, + "HScore": 1.0 } }, "human_annot": { @@ -15302,12 +15301,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -15358,42 +15351,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) 
posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -15480,33 +15474,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions (...) 
posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la diversité la plus extrême en ce qui concerne les questions posées par ce conflit", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la diversité la plus extrême", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 } }, "human_annot": { @@ -15528,12 +15529,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -15602,33 +15597,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "notre idéal de réconciliation humaine et de recherche du bonheur social ne sombre pas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "camarades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "camarades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "notre idéal de réconciliation humaine et de recherche du bonheur social ne sombre pas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "les soldats de la liberté", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "notre idéal", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Nous (les soldats de la liberté) recherchons le bonheur social.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -15662,12 +15664,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur acceptable 
(\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -15734,33 +15730,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vikings", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vikings", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les Vikings", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les Vikings", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vikings", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les Vikings", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les Vikings", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -15800,12 +15803,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -15862,33 +15859,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la guerre servira à la défense de la patrie, de la démocratie, à la libération des peuples opprimés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vos opinions, vos sentiments, vos souffrances", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vos opinions, vos sentiments, vos souffrances", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "vos opinions, vos sentiments, vos souffrances.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "votre opinion, vos sentiments, vos souffrances", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - 
"rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "vos opinions, vos sentiments, vos souffrances", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -15934,12 +15938,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -15990,33 +15988,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "inférieur à 400 grammes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "400 grammes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "400 grammes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "400 grammes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "400 grammes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "inférieur à 400 grammes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "inférieur à 400 grammes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -16032,12 +16037,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -16130,33 +16129,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "redonnez -nous l' offensive", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "redonnez -nous l' offensive comme vous l' avez donnée à ceux qui les ont enfoncés, ne nous laissez pas succomber à la teutonisation, délivrez -nous", - "rougeL": 0.10526315789473684 + "rougeL": 0.10526315789473684, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' offensive comme 
vous l' avez donnée", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' offensive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "poing quotidien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' offensive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'offensive", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -16178,12 +16184,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -16258,33 +16258,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ulrich Wille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ulrich Wille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ulrich Wille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' Ulrich Wille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ulrich Wille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ulrich Wille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ulrich Wille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -16324,12 +16331,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -16386,33 +16387,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un matériel de plus en plus important", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un matériel de plus en plus 
important", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un matériel de plus en plus important", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "matériel de plus en plus important", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un matériel de plus en plus important", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un matériel de plus en plus important", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un matériel de plus en plus important", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -16440,12 +16448,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -16514,33 +16516,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Von Klück,", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -16568,12 +16577,6 @@ "rating": "Correct" } 
], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -16636,33 +16639,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Von Klück,", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Von Klück", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -16702,12 +16712,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -16764,42 +16768,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "juillet - août", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "en juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "août", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "en juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -16892,33 +16897,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "toute question politique ou domestique se rapportant entièrement ou en partie à leurs affaires intérieures", - "rougeL": 0.375 + "rougeL": 0.375, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "toute question politique ou domestique se rapportant entièrement ou en partie à leurs affaires intérieures", - "rougeL": 0.375 + "rougeL": 0.375, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "que toute question politique ou domestique se rapportant entièrement ou en partie à leurs affaires intérieures", - "rougeL": 0.375 + "rougeL": 0.375, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "seul ressort de la juridiction des États-Unis,", - "rougeL": 0.22727272727272727 + "rougeL": 0.22727272727272727, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "exclusif de décider quelles questions dépendent de leur juridiction intérieure", - "rougeL": 0.043478260869565216 + "rougeL": 0.043478260869565216, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "déclarent que toute question politique ou domestique se rapportant entièrement ou en partie à leurs affaires intérieures, incluant l' immigration, le travail, le trafic côtier, les tarifs, le commerce (...) 
et tout autre question domestique sont du seul ressort de la juridiction des États-Unis, et ne doivent pas, par ce traité, être soumis d' aucune façon à l' arbitrage ou à la considération du Conseil ou de l' Assemblée de la Société des Nations.", - "rougeL": 0.9873417721518987 + "rougeL": 0.9873417721518987, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les États-Unis se réservent le droit exclusif de décider quelles questions dépendent de leur juridiction intérieure.", - "rougeL": 0.07843137254901959 + "rougeL": 0.07843137254901959, + "HScore": 0.5 } }, "human_annot": { @@ -16958,12 +16970,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -17020,33 +17026,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "si, dans une guerre entre l' Allemagne et la Russie, il restera neutre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "si, dans une guerre entre l' Allemagne et la Russie, il restera neutre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "si, dans une guerre entre l' Allemagne et la Russie, il restera neutre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "remise des forteresses de Toul et Verdun", - "rougeL": 0.10526315789473685 + "rougeL": 0.10526315789473685, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "si, ce qu' il n' y a pas lieu de supposer, le gouvernement français déclare qu' il reste neutre, je prie votre Excellence de déclarer au gouvernement français que nous devons exiger comme gage de sa neutralité la remise des forteresses de Toul et Verdun, que nous occuperons et que nous restituerons après que la guerre avec la Russie est terminée.", - "rougeL": 0.20833333333333334 + "rougeL": 0.20833333333333334, + "HScore": 0.0 }, 
"mixtral-8x7b": { "answer_pred": "remise des forteresses de Toul et Verdun", - "rougeL": 0.10526315789473685 + "rougeL": 0.10526315789473685, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "La remise des forteresses de Toul et Verdun.", - "rougeL": 0.10526315789473685 + "rougeL": 0.10526315789473685, + "HScore": 0.5 } }, "human_annot": { @@ -17092,12 +17105,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -17148,33 +17155,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "On trouve de la terre partout, dans ses poches, dans son mouchoir, dans ses habits, dans ce qu' on mange.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "touche que j'ai", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "on", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le protagoniste qui retrouve de la terre partout dans ses poches, dans son mouchoir, dans ses habits, dans ce qu'on mange est mentionné dans l'article.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -17220,12 +17234,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -17276,33 +17284,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "examen des autres questions territoriales et politiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "examen des autres questions territoriales et politiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "examen des autres questions territoriales et politiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' examen des autres questions territoriales et politiques,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "diriger l'examen des autres questions territoriales et politiques", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' examen des autres questions territoriales et politiques, et notamment celles relatives à l' Arménie, aux Etats balkaniques et aux territoires faisant partie de l' ancien royaume de Pologne", - "rougeL": 0.37037037037037035 + "rougeL": 0.37037037037037035, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le même esprit d'équité et de justice commande l'examen des autres questions territoriales et politiques", - "rougeL": 0.5000000000000001 + "rougeL": 0.5000000000000001, + "HScore": 0.5 } }, "human_annot": { @@ -17324,12 +17339,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -17404,33 +17413,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "L' auteur de l' attentat,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -17458,12 +17474,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -17526,33 +17536,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cabrinovic", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -17592,12 +17609,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -17654,33 +17665,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' ennemi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ennemi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ennemi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' ennemi.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": 
"l'ennemi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' ennemi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "On cherche en vain l'ennemi.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -17696,12 +17714,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -17800,33 +17812,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fête spontanée, organisée par des pacifistes marginaux et des cercles de la jeunesse de gauche, pour fêter la victoire des bolcheviques en Russie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fête spontanée, organisée par des pacifistes marginaux et des cercles de la jeunesse de gauche, pour fêter la victoire des bolcheviques en Russie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fête spontanée, organisée par des pacifistes marginaux et des cercles de la jeunesse de gauche, pour fêter la victoire des bolcheviques en Russie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "émeute de Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "émeute de Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "émeute de Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'émeute de Zurich", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -17866,12 +17885,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -17929,33 +17942,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "Vous trouvez sur Cliotexte d' autres textes de Stephan Zweig", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Stefan Zweig", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Cliotexte", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Vous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cliotexte", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -17971,12 +17991,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -18057,15 +18071,15 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le gros des forces sur les communications principales de l' ennemi afin de l' obliger soit à abandonner rapidement ses fronts actuels, soit à accepter de nouveaux combats dans les plus mauvaises conditions", "rougeL": 0.19999999999999998 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "toutes les forces ennemies", "rougeL": 0.3333333333333333 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "toutes les forces ennemies", "rougeL": 0.3333333333333333 }, @@ -18139,33 +18153,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "confiance et la gratitude", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "spectacle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "spectacle qu' il a donné et le loyal concours qu' il nous a apporté", - 
"rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "spectacle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le spectacle qu' il a donné et le loyal concours qu' il nous a apporté", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "loyal concours", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le loyal concours", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -18199,12 +18220,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -18267,33 +18282,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1920", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -18315,12 +18337,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -18386,33 +18402,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1920", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1919 déjà", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -18440,12 +18463,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -18511,33 +18528,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pedraic Pearse", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 } }, "human_annot": { @@ -18571,12 +18595,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -18639,33 +18657,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Amérique du Sud", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -18693,12 +18718,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -18767,33 +18786,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "\" Vous, mes petites, vous ne vous marierez pas, il faudra travailler \"", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "\"Vous, mes petites, vous ne vous marierez pas, il faudra travailler\"", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -18827,12 +18853,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -18889,33 +18909,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il y avait de l' amertume dans sa voix.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Vous, mes petites, vous ne vous marierez pas, il faudra travailler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "\"Vous, mes petites, vous ne vous marierez pas, il faudra travailler\"", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -18949,12 +18976,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -19017,33 +19038,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "armée", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la mobilisation des femmes et des enfants", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à l'armée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -19083,12 +19111,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -19139,33 +19161,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "espoir qu' une prompte victoire dissipera ce cauchemar", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'armée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -19205,12 +19234,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -19267,33 +19290,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { 
+ "T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la paix revient.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -19327,12 +19357,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -19389,33 +19413,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la paix revient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la paix revient.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -19449,12 +19480,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -19517,33 +19542,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chef de 
l' Etat - major général", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de l' État", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chef de l' Etat - major général", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chef de l' Etat - major général.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chef de l' Etat - major général", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "chef de l' Etat - major général", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "chef de l'Etat-major général", - "rougeL": 0.4000000000000001 + "rougeL": 0.4000000000000001, + "HScore": 1.0 } }, "human_annot": { @@ -19571,12 +19603,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -19639,33 +19665,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Etat - major général", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' Etat", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' Etat - major général", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Etat - major général.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Etat - major général", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Etat - major général", - "rougeL": 0.888888888888889 + "rougeL": 
0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Theophil von Sprecher est nommé chef de l'Etat-major général.", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 } }, "human_annot": { @@ -19681,12 +19714,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -19767,33 +19794,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au moment où il aura constaté que l' adversaire a déjà passé aux actes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "au moment où il aura constaté que l' adversaire a déjà passé aux actes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "quand la révolte a atteint son plein développement", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au moment où il aura constaté que l'adversaire a déjà passé aux actes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -19809,12 +19843,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -19889,33 +19917,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trop tard", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "trop tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "toujours trop tard", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -19943,12 +19978,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -20017,33 +20046,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -20299,48 +20335,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - 
"rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -20463,33 +20457,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des sociétés pour la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -20511,12 +20512,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -20591,33 +20586,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -20651,12 +20653,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -20713,33 +20709,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hoffmann", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hoffmann.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -20773,12 +20776,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -20841,33 +20838,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des données sur les intentions militaires des Alliés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "données sur les intentions militaires des Alliés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le bulletin d'information de l'Etat-major suisse, qui contient plusieurs données sur les intentions militaires des Alliés.", - "rougeL": 0.5555555555555556 + "rougeL": 0.5555555555555556, + "HScore": 0.5 } }, "human_annot": { @@ -20907,12 +20911,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -20963,33 +20961,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "intentions militaires des Alliés", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les intentions militaires des Alliés", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plusieurs données sur les intentions militaires des Alliés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les intentions militaires des Alliés.", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -21029,12 +21034,6 @@ 
"rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -21091,33 +21090,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -21151,12 +21157,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -21210,33 +21210,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1908 - 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { 
@@ -21270,12 +21277,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -21335,33 +21336,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "août", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "août", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -21401,12 +21409,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -21463,33 +21465,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le professorat", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 } }, "human_annot": { @@ -21523,12 +21532,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -21587,33 +21590,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "professeur", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "professeur", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "professorat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -21653,12 +21663,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -21729,33 +21733,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' armée ne serait plus en mesure de prévenir la sédition ou de la tuer dans l' oeuf", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "armée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "révolutionnaires et pour le pouvoir", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la sédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la révolte", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": 
"la sédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la sédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -21771,12 +21782,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -21875,33 +21880,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sans trop le connaître", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le peuple, sans trop le connaître, c'est-ce que le peuple connaît, c'est-ce que le peuple connaît, c'est-ce que le peuple connaît, c'est-ce que le peuple connaît, c'est-ce que", - "rougeL": 0.05555555555555555 + "rougeL": 0.05555555555555555, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Tchèques se flattaient qu' il leur serait secourable, à cause de l' influence que sa femme, issue d' une de vieilles familles de Bohême, exerçait sur son esprit", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "son fanatisme clérical et de son avarice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le peuple ne connaît pas le prince", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "fanatisme clérical et de son avarice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le peuple connaît son fanatisme clérical et son avarice", - "rougeL": 0.9 + "rougeL": 0.9, + "HScore": 1.0 } }, "human_annot": { @@ -21917,12 +21929,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -22007,33 +22013,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "je collais à l' intérieur de cette reliure", - 
"rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "je collais à l' intérieur de cette reliure", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "je collais à l' intérieur de cette reliure", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "je collais à l' intérieur de cette reliure,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "moi, je collais à l' intérieur de cette reliure, pour cacher les bouts de l' élastique, mes petits carrés de papier rose.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "je", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la personne non spécifiée dans cet extrait de l'article", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -22049,12 +22062,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -22136,33 +22143,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -22190,12 +22204,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -22277,33 +22285,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "redonnez -nous l' offensive", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "redonnez -nous l' offensive comme vous l' avez donnée à ceux qui les ont enfoncés", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' offensive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' offensive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "poing quotidien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' offensive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'offensive", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -22343,12 +22358,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -22405,33 +22414,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Plusieurs jeunes et jolies Arméniennes liées ensemble", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Arméniennes liées ensemble", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Plusieurs jeunes et jolies Arméniennes liées ensemble", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Plusieurs jeunes et jolies Arméniennes", - "rougeL": 0.4444444444444445 + "rougeL": 
0.4444444444444445, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des jeunes et jolies Arméniennes", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "jeunes et jolies Arméniennes", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Plusieurs jeunes et jolies Arméniennes", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -22465,12 +22481,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -22534,33 +22544,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "20", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "20 obus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Le 75 tire 20 obus à la minute.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -22594,12 +22611,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -22656,33 +22667,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "20 obus à la minute", - 
"rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "20 obus à la minute", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des obus", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -22716,12 +22734,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -22784,33 +22796,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables, d' un poids inférieur à 400 grammes", - "rougeL": 0.8823529411764706 + 
"rougeL": 0.8823529411764706, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les projectiles explosibles ou chargés de matières fulminantes ou inflammables, d' un poids inférieur à 400 grammes", - "rougeL": 0.8823529411764706 + "rougeL": 0.8823529411764706, + "HScore": 1.0 } }, "human_annot": { @@ -22820,12 +22839,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -22906,33 +22919,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les projectiles explosibles ou chargés de matières fulminantes ou inflammables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -22972,12 +22992,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -23034,33 +23048,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "les poches, dans son mouchoir, dans ses habits", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "hantise, un cauchemar de terre et de boue, et vous ne sauriez avoir idée de la touche que j' ai - mon fusil a l' air d'", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une hantise, un cauchemar de terre et de boue", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "terre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de la terre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "On trouve de la terre partout, dans ses poches, dans son mouchoir, dans ses habits, dans ce qu'on mange.", - "rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 0.0 } }, "human_annot": { @@ -23082,12 +23103,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -23164,33 +23179,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les belligérants n' ont pas le droit d' utiliser son territoire comme champ de bataille ou voie de passage ; ils ne peuvent pas le soumettre à un blocus économique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "droit : les belligérants n' ont pas le droit d' utiliser son territoire comme champ de bataille ou voie de passage ; ils ne peuvent pas le soumettre à un blocus économique", - "rougeL": 0.9714285714285714 + "rougeL": 0.9714285714285714, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' Etat ne 
peut être rendu responsable de manifestations d' opinions privées de ses ressortissants", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "champ de bataille ou voie de passage", - "rougeL": 0.38095238095238093 + "rougeL": 0.38095238095238093, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "droit : les belligérants n' ont pas le droit d' utiliser son territoire comme champ de bataille ou voie de passage ; ils ne peuvent pas le soumettre à un blocus économique.", - "rougeL": 0.9714285714285714 + "rougeL": 0.9714285714285714, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "seuls les organes officiels de l' Etat peuvent être rendus responsables de manifestations d' opinion ; par contre, l' Etat ne peut être rendu responsable de manifestations d' opinions privées de ses ressortissants. On ne peut donc pas mettre d' entraves à la liberté de presse. S' il a des devoirs à accomplir, un Etat neutre a aussi des droits : les belligérants n' ont pas le droit d' utiliser son territoire comme champ de bataille ou voie de passage ; ils ne peuvent pas le soumettre à un blocus économique.", - "rougeL": 0.4722222222222222 + "rougeL": 0.4722222222222222, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "un Etat neutre a aussi des droits ", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -23236,12 +23258,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -23293,37 +23309,44 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.65 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Il y a deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Suisse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, 
"Camembert_baseline": { "answer_pred": "Il y a deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, "llama-2_lora": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.65 }, "mixtral-8x7b": { "answer_pred": "deux ans auparavant", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.8 }, "GPT-3.5": { "answer_pred": "il y a deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.75 } }, "human_annot": { - "llama-2-70b": [ + "mixtral-8x7b": [ { "annot": "annot_1", "rating": "Correct" @@ -23365,7 +23388,7 @@ "rating": "Correct" } ], - "mixtral-8x7b": [ + "MT5-large": [ { "annot": "annot_1", "rating": "Correct" @@ -23400,14 +23423,14 @@ }, { "annot": "annot_9", - "rating": "Correct" + "rating": "Partiellement correct" }, { "annot": "annot_10", - "rating": "Correct" + "rating": "Erreur acceptable (\"humaine\")" } ], - "MT5-large": [ + "llama-2_lora": [ { "annot": "annot_1", "rating": "Correct" @@ -23449,7 +23472,49 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2_lora": [ + "FLAN-T5-large": [ + { + "annot": "annot_1", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_2", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_3", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_4", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_5", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_6", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_7", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_8", + "rating": "Erreur inacceptable" + }, + { + "annot": "annot_9", + "rating": "Erreur acceptable (\"humaine\")" + }, + { + "annot": "annot_10", + "rating": "Erreur inacceptable" + } + ], + "GPT-3.5": [ { "annot": "annot_1", "rating": "Correct" @@ -23484,140 +23549,56 @@ }, { "annot": "annot_9", - "rating": "Partiellement correct" + "rating": "Correct" }, { "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" + "rating": 
"Partiellement correct" } ], - "FLAN-T5-large": [ + "Camembert_baseline": [ { "annot": "annot_1", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_2", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_3", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_4", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_5", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_6", - "rating": "Erreur inacceptable" + "rating": "Erreur acceptable (\"humaine\")" }, { "annot": "annot_7", - "rating": "Erreur inacceptable" + "rating": "Correct" }, { "annot": "annot_8", - "rating": "Erreur inacceptable" + "rating": "Erreur acceptable (\"humaine\")" }, { "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" + "rating": "Correct" }, { "annot": "annot_10", - "rating": "Erreur inacceptable" + "rating": "Correct" } ], - "GPT-3.5": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], - "Camembert_baseline": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": 
"annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], - "T5-large": [ + "T5-large": [ { "annot": "annot_1", "rating": "Correct" @@ -23758,33 +23739,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "missionnaires et ses diplomates", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Japonais", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les missionnaires et diplomates japonais.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -23800,12 +23788,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -23881,33 +23863,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -23935,12 +23924,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -24028,33 +24011,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sans trop le connaître", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Tchèques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Tchèques se flattaient qu' il leur serait secourable, à cause de l' influence que sa femme, issue d' une de vieilles familles de Bohême, exerçait sur son esprit", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "ne l' aimait pas.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le jugeant sur ce qu' on rapportait de son fanatisme clérical et de son avarice", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "son fanatisme clérical et de son avarice", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le peuple ne le connaît pas.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -24100,12 +24090,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -24156,33 +24140,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "moteurs à essence", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les moteurs à essence.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -24216,12 +24207,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -24278,33 +24263,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les moteurs à essence", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -24338,12 +24330,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -24406,33 +24392,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un nouveau mode de 
civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -24460,12 +24453,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -24529,33 +24516,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un nouveau mode de civilisation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -24589,12 +24583,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], 
"llama-2_lora": [ { "annot": "annot_7", @@ -24688,33 +24676,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Elle dénombrait avec fierté les pays qu' elle avait découverts et lancés dans le courant de la vie générale, les peuples qu' elle avait nourris de sa substance et façonnés à son image, les société", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Europe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -24748,12 +24743,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -24816,33 +24805,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tué à la guerre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { 
"answer_pred": "Il n'est pas indiqué dans l'article comment Albert Barbet a été tué.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -24882,12 +24878,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -24938,33 +24928,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à la guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas de mention de l'événement au cours duquel Albert Barbet est tué dans l'article.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -24992,12 +24989,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -25066,33 +25057,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le monstre germanique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "monstre germanique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le monstre germanique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "monstre germanique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "le monstre germanique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le monstre germanique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le monstre germanique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -25126,12 +25124,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -25194,33 +25186,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "aux affaires que le retour de la paix et la liquidation des dépenses de la guerre", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "aux affaires que le retour de la paix et la liquidation des dépenses de la guerre", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aux affaires que le retour de la paix et la liquidation des dépenses de la guerre", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "aux affaires que le retour de la paix et la liquidation des dépenses de la guerre (3) vont sans doute faire éclore en Russie.", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "affaires que le retour de la paix et la liquidation des dépenses de la guerre vont sans doute faire éclore en Russie", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "aux affaires que le retour de la paix et la liquidation des dépenses de la guerre vont sans doute faire éclore en Russie", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "aux affaires que le retour de la paix et la liquidation des 
dépenses de la guerre", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -25266,12 +25265,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -25322,33 +25315,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -25388,12 +25388,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -25444,33 +25438,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jean Jaurès", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jean Jaurès", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jean Jaurès", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le cercueil de Jean Jaurès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -25498,12 +25499,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -25572,33 +25567,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le bourrage de crâne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bourrage de crâne exactions", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bourrage de crâne exactions", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "bourrage de crâne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "bourrage de crâne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "exactions inventées de civils belges", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les exactions.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -25644,12 +25646,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -25700,15 +25696,15 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "France", "rougeL": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "France", "rougeL": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "France", "rougeL": 1.0 }, @@ -25775,33 +25771,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -25835,12 +25838,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -25904,33 +25901,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "La Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -25940,12 
+25944,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -26032,33 +26030,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des derniers temps qui précèdent l' agression nazie contre la Pologne", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des derniers temps qui précèdent l' agression nazie contre la Pologne", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "derniers temps qui précèdent l' agression nazie contre la Pologne", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "derniers temps qui précèdent l' agression nazie contre la Pologne.", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après son décès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "derniers temps qui précèdent l' agression nazie contre la Pologne", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des derniers temps qui précèdent l' agression nazie contre la Pologne", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 } }, "human_annot": { @@ -26104,12 +26109,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -26154,33 +26153,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "après son décès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "après son décès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des derniers temps qui précèdent l' agression nazie contre la Pologne", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "derniers temps qui précèdent l' agression nazie contre la Pologne.", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après son décès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "derniers temps qui précèdent l' agression nazie contre la Pologne", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le texte a été édité et une analyse du manuscrit a confirmé que celui-ci date bien des derniers temps qui précèdent l' agression nazie contre la Pologne.", - "rougeL": 0.6875000000000001 + "rougeL": 0.6875000000000001, + "HScore": 1.0 } }, "human_annot": { @@ -26190,12 +26196,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -26294,33 +26294,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ils", - "rougeL": 0 + "rougeL": 0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "armée allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "armée allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les Allemands", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemands", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ils", - "rougeL": 0 + "rougeL": 0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Aucune séquence de mots dans l'article ne répond à la question \"Qui mange de la paille ?\".", - 
"rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -26330,12 +26337,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -26434,42 +26435,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ces moribonds", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ces moribonds", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Épuisés, à bout de souffle, luttant un contre deux depuis des jours, n' ayant pas le temps de dormir, n' ayant pas le temps de manger, ne marchant que par le prodige ", - "rougeL": 0.09523809523809523 + "rougeL": 0.09523809523809523, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Ces moribonds", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les plus humbles au plus illustres", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "ces hommes-là", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les hommes/moribonds", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -26562,33 +26564,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "profiter de l' avantage du terrain et de l' occupation pour réserver ses forces", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' avantage de la défensive, qui est de profiter de l' avantage du terrain et de l' occupation pour réserver ses forces", - "rougeL": 
0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' avantage de la défensive, qui est de profiter de l' avantage du terrain et de l' occupation pour réserver ses forces", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' avantage de la défensive, qui est de profiter de l' avantage du terrain et de l' occupation pour réserver ses forces,", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'avantage de la défensive, qui est de profiter de l' avantage du terrain et de l' occupation pour réserver ses forces, se trouve perdu en partie, car on est obligé alors d' employer à la défensive passive des troupes qu' on aurait pu employer en meilleure place.", - "rougeL": 0.08333333333333333 + "rougeL": 0.08333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' avantage de la défensive", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "L'avantage de la défensive a perdu.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -26610,12 +26619,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -26717,33 +26720,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "juste pour me mettre à table", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "J'", - "rougeL": 0 + "rougeL": 0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' auteur ne raconte pas sa propre enfance, mais celle d' une petite fille à Paris pendant la Première Guerre, et qui a treize ou quatorze ans en 1918", - "rougeL": 0.5454545454545454 + 
"rougeL": 0.5454545454545454, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "J' étais entrée aux Piles électriques.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "quand je rencontrais une voiture à chevaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "J' arrivais à midi et demie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la petite fille", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -26753,12 +26763,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -26845,33 +26849,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Autriche contre la Russie", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Autriche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Autriche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Autriche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la France", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' Autriche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Autriche", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -26899,12 +26910,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -26973,33 +26978,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Allemand sérieux, grave, allant toujours au fond des choses et trop indifférent à leur apparence", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemand sérieux", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemand", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Allemand", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' Allemand sérieux, grave, allant toujours au fond des choses et trop indifférent à leur apparence.", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Allemand sérieux, grave, allant toujours au fond des choses et trop indifférent à leur apparence", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Allemand sérieux, grave, allant toujours au fond des choses et trop indifférent à leur apparence.", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 } }, "human_annot": { @@ -27015,12 +27027,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -27101,33 +27107,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Japon", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 } }, "human_annot": { @@ -27161,12 +27174,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -27229,33 +27236,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à larges coups d' ailes tranquilles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Quand l'aube n'était pas encore bien débarrassée, les corbeaux arrivaient.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.95 } }, "human_annot": { @@ -27469,48 +27483,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -27675,33 
+27647,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Quand l' aube n' était pas encore bien débarrassée,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à larges coups d' ailes tranquilles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' aube", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Quand l'aube n'était pas encore bien débarrassée", - "rougeL": 0.6250000000000001 + "rougeL": 0.6250000000000001, + "HScore": 1.0 } }, "human_annot": { @@ -27711,12 +27690,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -27803,33 +27776,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' 
infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'infanterie ennemie", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 } }, "human_annot": { @@ -27869,12 +27849,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -27925,33 +27899,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "allemande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "L' infanterie ennemie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'infanterie ennemie", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 } }, "human_annot": { @@ -27973,12 +27954,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -28053,33 +28028,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans une tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans une tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tranchée de première ligne", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans une tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans une tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -28095,12 +28077,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -28176,42 +28152,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "première ligne", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "première ligne", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans une tranchée de première ligne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -28310,33 +28287,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "statuette en bronze et or", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Une statuette en bronze et or, datant de l'époque d'Hammurabi.", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -28358,12 +28342,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -28432,33 +28410,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -28498,12 +28483,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_7", @@ -28560,33 +28539,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Howard Carter (1874 - 1939)", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 } }, "human_annot": { @@ -28620,12 +28606,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -28682,33 +28662,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Howard Carter (1874 - 1939)", - "rougeL": 0.5 
+ "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Howard Carter", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 } }, "human_annot": { @@ -28754,12 +28741,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -28840,33 +28821,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jeunesse Galba", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jeunesse Galba", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -28900,12 +28888,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -28968,33 +28950,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans le nord", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans un environnement de collines et de vallées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans un environnement de collines et de vallées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { 
"answer_pred": "dans un environnement de collines et de vallées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sud", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "dans le nord", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans le nord.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -29010,12 +28999,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -29096,33 +29079,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -29156,12 +29146,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -29224,33 +29208,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bien 
de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -29284,12 +29275,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -29346,33 +29331,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -29406,12 +29398,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -29474,33 +29460,40 @@ } ], "predictions": { - "MT5-large_260_AP0": 
{ + "MT5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jules César.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -29714,48 +29707,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -29926,33 +29877,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les jardins suspendus de la reine Sémiramis une des sept merveilles du monde", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "apogée vers le VIe siècle av. J.-C durant le règne de Nabuchodonosor II", - "rougeL": 0.21052631578947367 + "rougeL": 0.21052631578947367, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son apogée vers le VIe siècle av. J.-C. 
durant le règne de Nabuchodonosor II", - "rougeL": 0.21052631578947367 + "rougeL": 0.21052631578947367, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "jardins suspendus de la reine Sémiramis", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "apogée vers le VIe siècle av. J.-C.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Babylone", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Babylone connaît son apogée vers le VIe siècle av. J.-C.", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 0.0 } }, "human_annot": { @@ -29992,12 +29950,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -30054,33 +30006,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son père", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son père", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son père", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "son père", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "son père", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "son père", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il perd rapidement son père.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -30108,12 +30067,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -30182,33 +30135,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "conquête musulmane de la 
Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "651, lors de la conquête musulmane de la Perse", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Yazdgard III est tué lors de la conquête musulmane de la Perse.", - "rougeL": 0.5882352941176471 + "rougeL": 0.5882352941176471, + "HScore": 0.5 } }, "human_annot": { @@ -30248,12 +30208,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -30311,33 +30265,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Phéniciens", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Phéniciens", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Phéniciens", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' alphabet des Phéniciens.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' 
alphabet des Phéniciens", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "alphabet des Phéniciens", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'alphabet des Phéniciens", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -30377,12 +30338,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -30440,42 +30395,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "256 fantassins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "256 fantassins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "256 fantassins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "256 fantassins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "phalange macédonienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "fantassins", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la phalange macédonienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -30568,33 +30524,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fabrication de la bière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "la fabrication de la bière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "fabrication de la bière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ninkasi a créé la bière.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -30622,12 +30585,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -30690,33 +30647,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ninkasi a créé la bière.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -30744,12 +30708,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -30818,33 +30776,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sainte Geneviève", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "sainte Geneviève", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sainte Geneviève", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "sainte Geneviève", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "sainte Geneviève", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la population galvanisée par sainte Geneviève", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sainte Geneviève", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -30884,12 +30849,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -30947,33 +30906,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Arius", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Arius un prêtre catholique d' Alexandrie", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Arius", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Arius", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Arius", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Arius", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Arius un prêtre catholique d' Alexandrie", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + 
"HScore": 1.0 } }, "human_annot": { @@ -31013,12 +30979,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -31069,33 +31029,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "d' Arius un prêtre catholique d' Alexandrie", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "d' Arius un prêtre catholique d' Alexandrie", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Arius un prêtre catholique d' Alexandrie", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' Arius", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Arius, un prêtre catholique d' Alexandrie, d' origine libyenne, de la fin du IIe et du début du IVe siècle.", - "rougeL": 0.7586206896551725 + "rougeL": 0.7586206896551725, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Arius un prêtre catholique d' Alexandrie, d' origine libyenne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "d'Alexandrie, d'origine libyenne", - "rougeL": 0.39999999999999997 + "rougeL": 0.39999999999999997, + "HScore": 1.0 } }, "human_annot": { @@ -31105,12 +31072,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -31197,33 +31158,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jean-Baptiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jean-Baptiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": 
{ "answer_pred": "Jean-Baptiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jean-Baptiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jean-Baptiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jean-Baptiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean-Baptiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -31257,12 +31225,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -31325,33 +31287,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Enki", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Namma", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Enki", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Enki", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Enki (son fils ou son petit-fils)", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Enki", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "À Enki", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -31391,12 +31360,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -31453,33 +31416,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 0.95 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "Camembert_baseline": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "llama-2_lora": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "mixtral-8x7b": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "GPT-3.5": { "answer_pred": "le matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -31735,48 +31705,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Partiellement correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -31899,33 +31827,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "villa Giulia", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { 
"answer_pred": "matériel archéologique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le matériel archéologique a été trouvé à l'extérieur de la Ville.", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -31953,12 +31888,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -32027,33 +31956,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "celui qu' elle choisirait lors du festin auquel il convia ses hôtes grecs", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "festin", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "festin", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "festin", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le festin auquel il convia ses hôtes grecs", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les noces de Gyptis", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au festin", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 } }, "human_annot": { @@ -32081,12 +32017,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -32149,33 +32079,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "lors du festin auquel il convia ses hôtes grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"lors du festin", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lors du festin", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "lors du festin auquel il convia ses hôtes grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "durant le banquet", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "au cours du banquet", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "lors du festin", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -32215,12 +32152,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -32277,42 +32208,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Zeugma", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Shapur Ier attaque Zeugma.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -32399,33 +32331,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, 
+ "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Zeugma", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Zeugma", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Sassanides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Zeugma", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Zeugma", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -32453,12 +32392,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -32527,33 +32460,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les eaux de pluie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "eaux de pluie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "eaux de pluie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "eaux de pluie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "eaux de pluie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "eaux de pluie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'eau des deux fleuves, le Tigre et l'Euphrate.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -32593,12 +32533,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -32655,33 +32589,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "avant que Lutèce ne soit fondée par les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "IIIe siècle av. J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Avant que Lutèce ne soit fondée par les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Avant que Lutèce ne soit fondée par les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "IIIe siècle av. J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "IIIe siècle av. J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Depuis le IIIe siècle av. J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -32703,12 +32644,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -32795,33 +32730,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Mars", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Mars", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Mars, fils de Jupiter et de Junon", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dieu de la guerre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Mars est fougueux, jeune, irréfléchi : il représente la jeunesse, à la différence de Saturne ou Jupiter qui représentent l' âge mûr. 
Mars est en effet aussi le dieu de l' agriculture, de la fertilité et de la végétation.", - "rougeL": 0.07142857142857142 + "rougeL": 0.07142857142857142, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Mars (mythologie)", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Mars, fils de Jupiter et de Junon", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 } }, "human_annot": { @@ -32855,12 +32797,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -32923,33 +32859,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la plus importante collection au monde d' objets minoens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la plus importante collection au monde d' objets minoens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la plus importante collection au monde d' objets minoens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "fresques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "une grande partie des fresques trouvées dans les fouilles du palais de Cnossos", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Le musée archéologique d' Héraklion trouve la plus importante collection au monde d' objets minoens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "On y trouve la plus importante collection au monde d' objets minoens, dont une grande partie des fresques trouvées dans les fouilles du palais de Cnossos.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -32959,12 +32902,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -33055,33 +32992,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Phrygie", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Phrygie", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -33103,12 +33047,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -33177,33 +33115,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "le roi de Phrygie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -33237,12 +33182,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -33305,33 +33244,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "combats de gladiateurs et de bêtes sauvages", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -33371,12 +33317,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -33428,33 +33368,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "gladiateurs et de bêtes sauvages", - "rougeL": 0.5714285714285715 + "rougeL": 
0.5714285714285715, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "gladiateurs et de bêtes sauvages", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les gladiateurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -33488,12 +33435,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -33557,33 +33498,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chimie, la physique et l' astronomie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chimie, la physique et l' astronomie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chimie, la physique et l' astronomie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chimie, la physique et l' astronomie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Chimie, physique et astronomie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "chimie, physique et astronomie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la chimie, la physique et l' astronomie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -33611,12 +33559,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -33703,33 +33645,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sainte Foy", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sainte Foy", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Sainte Foy", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sainte Foy Sainte Foy", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Sainte Foy", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Sainte Foy", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Sainte Foy", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -33763,12 +33712,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -33832,33 +33775,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sidon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sidon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Sidon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sidon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Sidon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Sidon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Sidon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -33892,12 +33842,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -33960,33 +33904,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "colons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les Phéniciens", - "rougeL": 0.0 + "rougeL": 0.0, + 
"HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "colons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Phéniciens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "des colons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "des colons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les Grecs.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -33996,12 +33947,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -34088,33 +34033,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1551 - 1553", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -34154,12 +34106,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -34210,33 +34156,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1551 et 1553", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1551 et 1553", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1551 - 1553", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1551 - 1553", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "entre 1551 et 1553", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -34252,12 +34205,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -34350,33 +34297,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Tartare", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le Tartare", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le Tartare", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le Tartare", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tartare", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Tartare", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans le Tartare", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -34392,12 +34346,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -34508,33 +34456,40 @@ } 
], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jeunesse Galba", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jeunesse Galba", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Galba Galba", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Galba perd son père.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -34544,12 +34499,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -34636,33 +34585,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean-Baptiste fut arrêté.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -34702,12 +34658,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": 
"Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -34782,33 +34732,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Enfers", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les âmes des humains décédés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -34824,12 +34781,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -34910,33 +34861,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Tyr", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Tyr", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tyr", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tyr", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tyr", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Tyr", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Tyr", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -34970,12 +34928,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -35038,33 +34990,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le déclin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le déclin s' installe et la ville fait l' objet de multiples combats et de guerres civiles", - "rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le déclin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le déclin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "immigrations", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le déclin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le déclin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -35092,12 +35051,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -35166,33 +35119,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fleuve Léthé", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Léthé", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "l'eau du fleuve Léthé", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -35232,12 +35192,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -35288,33 +35242,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Léthé", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' eau du fleuve Léthé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'eau du fleuve Léthé", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -35336,12 +35297,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -35416,42 +35371,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Aujourd'hui, on dit encore qu' elle est de « mauvaise humeur »", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Aujourd'hui", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "on dit encore", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Aujourd'hui,", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Quatre éléments, quatre humeurs, quatre caractères selon la médecine antique.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "on", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "on dit encore", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -35544,33 +35500,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vitruve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Aqueducs Vitruve", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vitruve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vitruve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vitruve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vitruve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Vitruve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -35598,12 +35561,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -35669,33 +35626,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cette ville, située dans une grande plaine, est de forme carrée ; chacun de ses côtés a cent vingt stades de long, ce qui fait pour l' enceinte de la place quatre cent quatre-vingts", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Babylone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de Babylone : « Cette ville, située dans une grande plaine, est de forme carrée ; chacun de ses côtés a cent vingt stades de long, ce qui fait pour l' enceinte de la place quatre cent quatre-vingt", - "rougeL": 0.14814814814814814 + "rougeL": 0.14814814814814814, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Cette ville, située dans une grande plaine, est de forme carrée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Babylone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Babylone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de Babylone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -35711,12 +35675,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -35791,33 +35749,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cette ville, située dans une grande plaine, est de forme carrée ; chacun de ses côtés a cent vingt stades de long", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Babylone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cette ville, située dans une grande plaine, est de forme carrée ; chacun de ses côtés a cent vingt stades de long, ce qui fait pour l' enceinte de la place quatre cent quatre-vingts stades", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Cette ville, située dans une grande plaine, est de forme carrée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de forme carrée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Babylone", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de Babylone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -35839,12 +35804,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -35919,33 +35878,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -35979,12 +35945,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -36041,33 +36001,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Metellus Scipion et Caton d' Utique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Metellus Scipion et Caton d'Utique", - "rougeL": 0.9 + "rougeL": 0.9, + "HScore": 1.0 } }, "human_annot": { @@ -36101,12 +36068,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -36169,33 +36130,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "musée archéologique d' Héraklion et le musée national archéologique d' Athènes", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "entre le musée archéologique d' Héraklion et le musée national archéologique d' Athènes", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "entre le musée archéologique d' Héraklion et le musée national archéologique d' Athènes.", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -36241,12 +36209,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -36292,33 +36254,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "musée archéologique d' Héraklion et le musée national archéologique d' Athènes", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Héraklion et Athènes.", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cnossos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -36346,12 +36315,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -36421,33 +36384,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Grand Roi", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Grand Roi", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -36493,12 +36463,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -36543,33 +36507,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -36609,12 +36580,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -36665,33 +36630,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le « Grand Roi »", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi des 
Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -36737,12 +36709,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -36805,33 +36771,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Howard Carter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Howard Carter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Howard Carter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Howard Carter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Howard Carter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Howard Carter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Howard Carter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -36865,12 +36838,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -36933,33 +36900,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "premier record connu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le premier record connu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "95 pieds", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "premier record connu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "95 pieds", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le premier record connu", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le premier record connu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -37005,12 +36979,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -37062,33 +37030,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Darius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Darius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Darius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Darius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Darius aurait envoyé une lettre à Alexandre le Grand après la bataille d' Issos, qui demande le retrait des troupes d' Asie et la libération des prisonniers et selon Justin et Quinte en échange d' une rançon, mais Arrian ne fait aucune allusion à une rançon ; Quinte décrit que le ton de la lettre était offensant, mais ses demandes sont refusées par Alexandre.", - "rougeL": 0.12 + "rougeL": 0.12, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Darius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Darius aurait envoyé une lettre.", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -37116,12 +37091,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -37190,33 +37159,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -37244,12 +37220,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -37312,33 +37282,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "commerciaux", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "commerciaux", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des comptoirs commerciaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -37366,12 +37343,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -37440,33 +37411,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "argent", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.16666666666666666 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "argent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.16666666666666666 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "argent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.16666666666666666 }, "Camembert_baseline": { "answer_pred": "argent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.16666666666666666 }, "llama-2_lora": { "answer_pred": "argent qu' il avait promis", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.16666666666666666 }, "mixtral-8x7b": { "answer_pred": "l' argent qu' il avait promis", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.16666666666666666 }, "GPT-3.5": { "answer_pred": "l'argent", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.16666666666666666 } }, "human_annot": { @@ -37496,32 +37474,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -37704,28 +37656,6 @@ "rating": "Erreur inacceptable" } ], - "question": [ - { - "annot": "annot_3", - "rating": "Question douteuse" - }, - { - "annot": "annot_4", - "rating": "Question douteuse" - }, - { - "annot": "annot_7", - "rating": "Question douteuse" - }, - { - "annot": "annot_8", - "rating": "Question douteuse" - }, - { - "annot": "annot_10", - "rating": "Question douteuse" - } - ], "no_answer": [ { "annot": "annot_3", @@ -37788,33 +37718,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une 
dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -37848,12 +37785,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -37911,33 +37842,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une dizaine de kilos", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -37971,12 +37909,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], 
"llama-2_lora": [ { "annot": "annot_3", @@ -38040,33 +37972,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "comment ces maisons se présentaient et comment elles étaient construites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "comment ces maisons se présentaient et comment elles étaient construites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "comment ces maisons se présenteraient et comment elles étaient construites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "comment ces maisons se présentaient et comment elles étaient construites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Comment se présentait la maison principale ?", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "comment ces maisons se présentaient et comment elles étaient construites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les archéologues ont compris comment ces maisons étaient construites.", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 } }, "human_annot": { @@ -38106,12 +38045,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -38169,33 +38102,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quelques en années auparavant", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "quelques décennies auparavant.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -38229,12 +38169,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -38292,33 +38226,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quelques en années auparavant", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "quelques décennies auparavant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -38352,12 +38293,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -38421,33 +38356,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "au sud de l' île Tibérine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "île Tibérine", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au sud de l' île Tibérine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "juste au sud de l' île Tibérine", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sud de l' île Tibérine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "juste au sud de l' île Tibérine", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au sud de l'île Tibérine", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -38469,12 +38411,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -38549,33 +38485,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -38615,12 +38558,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -38677,33 +38614,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -38737,12 +38681,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -38801,33 +38739,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -38867,12 +38812,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -38931,33 +38870,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains,", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour obtenir sa libération", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.23529411764705882 + "rougeL": 0.23529411764705882, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à cause de l' encerclement fait par les Romains", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -39003,12 +38949,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -39053,42 +38993,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"Après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains,", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.6399999999999999 + "rougeL": 0.6399999999999999, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -39193,33 +39134,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "bateaux de commerce fabriqués dans du bois de cèdre et rendus étanches grâce à un calfatage à l' aide bitume", - "rougeL": 0.23529411764705882 + "rougeL": 0.23529411764705882, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "rendus étanches", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bateaux de commerce fabriqués dans du bois de cèdre et rendus étanches", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bateaux de commerce", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "bateaux de commerce fabriqués dans du bois de cèdre et rendus étanches grâce à un calfatage à l' aide bitume", - "rougeL": 0.23529411764705882 + "rougeL": 0.23529411764705882, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bateaux de commerce", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "du bois de cèdre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -39235,12 +39183,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -39333,33 +39275,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Nabuchodonosor II", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Nabuchodonosor II", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' Empire babylonien", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Nabuchodonosor II", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "VIe siècle av. 
J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Babylone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Babylone.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -39375,12 +39324,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -39462,33 +39405,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Crésus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Crésus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Crésus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Crésus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Crésus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Crésus, le riche roi de Lydie", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Crésus, le riche roi de Lydie", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -39498,12 +39448,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -39593,33 +39537,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "grec ancien.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -39659,12 +39610,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -39715,33 +39660,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "en grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en grec ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -39775,12 +39727,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -39843,33 +39789,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.75 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.75 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 0.75 }, "Camembert_baseline": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.75 }, "llama-2_lora": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.75 }, "mixtral-8x7b": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.75 }, "GPT-3.5": { "answer_pred": "les combats de gladiateurs et de bêtes sauvages", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.6 } }, "human_annot": { @@ -40041,48 +39994,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Partiellement correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -40290,33 +40201,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "combats de gladiateurs et de bêtes sauvages", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "combats de gladiateurs et de bêtes sauvages", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "combats de gladiateurs et de bêtes sauvages", - 
"rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les combats de gladiateurs et de bêtes sauvages.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -40362,12 +40280,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -40413,33 +40325,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les bêtes sauvages", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -40479,12 +40398,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -40542,33 +40455,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un oppidum où s' était établie la tribu gauloise des Parisii depuis le IIIe siècle av. J.-C.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un oppidum où s' était établie la tribu gauloise des Parisii depuis le IIIe siècle av. 
J.-C.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un oppidum où s' était établie la tribu gauloise des Parisii depuis le IIIe siècle av. J.-C.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "oppidum", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un oppidum", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un oppidum où s' était établie la tribu gauloise des Parisii depuis le IIIe siècle av. J.-C.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il existait un oppidum où s' était établie la tribu gauloise des Parisii depuis le IIIe siècle av. J.-C.", - "rougeL": 0.2727272727272727 + "rougeL": 0.2727272727272727, + "HScore": 1.0 } }, "human_annot": { @@ -40590,12 +40510,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -40670,33 +40584,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "propréteur en Bétique", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il est nommé consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 
} }, "human_annot": { @@ -40742,12 +40663,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -40792,33 +40707,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "consul", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "propréteur en Bétique", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "préteur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "préteur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -40828,12 +40750,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -40932,33 +40848,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les âmes des Justes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les âmes des Justes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "âmes des Justes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les âmes des Justes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -40980,12 +40903,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -41061,33 +40978,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "de l' affronter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "de l' affronter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de l' affronter", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "de l' affronter.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "affronter Attila", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "décide de l' affronter", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "d'affronter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -41097,12 +41021,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -41189,33 +41107,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "donne la vie à l' homme créé à partir de l' argile", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "donne la vie", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "donne la vie à l' homme", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "À donner la vie à l'homme.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -41261,12 +41186,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -41311,33 +41230,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la vie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le sang donne la vie à l'homme.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -41371,12 +41297,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -41439,33 +41359,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un nœud inextricable", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "roi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Trancher le nud gordien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 
0.0 }, "Camembert_baseline": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "roi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le symbole de la royauté et de la divinité chez les Perses, les Étrusques, les Germains et les Illyriens.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Il devient souverain.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -41511,12 +41438,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -41561,33 +41482,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "roi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -41621,12 +41549,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -41683,33 +41605,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "souverain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "souverain", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Trancher le nud gordien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Trancher le nœud gordien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "symbole de la royauté et de la divinité chez les Perses, les Étrusques, les Germains et les Illyriens.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "symbole de la royauté et de la divinité chez les Perses, les Étrusques, les Germains et les Illyriens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le symbole de la royauté et de la divinité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -41737,12 +41666,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -41811,33 +41734,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "grand pontife (chef de la religion nationale romaine)", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ 
-41847,12 +41777,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -41933,33 +41857,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "grand pontife", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -41993,12 +41924,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -42055,33 +41980,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "il devient édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "grand pontife", - "rougeL": 
0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il devient édile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il devient édile.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -42127,12 +42059,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -42183,33 +42109,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en 206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "206 av. 
J.-C.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "début de la Dynastie Han en 206.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -42243,12 +42176,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -42305,33 +42232,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "206 av. 
J.-C.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "206", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -42377,12 +42311,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -42433,33 +42361,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -42493,12 +42428,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -42555,33 +42484,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, "llama-2_lora": { "answer_pred": "Les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les Romains", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les Romains.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -42597,12 +42533,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -42683,33 +42613,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quatorze années de combats", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "632 - 651", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "après quatorze années de combats perdus contre les envahisseurs arabes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -42737,12 +42674,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -42808,42 +42739,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "lors de la conquête musulmane de la Perse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "lors de la conquête musulmane de la Perse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lors de la conquête musulmane de la Perse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "lors de la conquête musulmane de la Perse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "632 - 651", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "651", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la conquête musulmane de la Perse", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -42939,33 +42871,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'agriculture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -42975,12 +42914,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -43067,33 +43000,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "flore et faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la flore et la faune.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -43121,12 +43061,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -43189,33 +43123,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "flore et faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la flore et la faune.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -43249,12 +43190,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - 
"annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -43317,33 +43252,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Circus Maximus contient 250 000 places.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le Circus Maximus.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -43365,12 +43307,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -43439,33 +43375,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "toute la façade du Mont Palatin", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "toute la façade du Mont Palatin", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le Circus Maximus contient 250000 places. 
Il occupe toute la façade du Mont Palatin.", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le circus Maximus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -43493,12 +43436,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -43567,33 +43504,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "statuette", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Une statuette en bronze et or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -43627,12 +43571,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -43695,33 +43633,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Han", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -43767,12 +43712,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -43817,33 +43756,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "début de la Dynastie Han", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Époque hellénistique", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la Dynastie Han", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -43859,12 +43805,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -43945,33 +43885,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "peuples résidant en Gaule", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -43999,12 +43946,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -44067,33 +44008,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une 
coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -44133,12 +44081,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -44195,33 +44137,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les villes de l' Empire romain,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Rome et les villes de l' Empire romain", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -44243,12 +44192,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -44318,33 +44261,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "villes de l' Empire romain,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans les villes de l' Empire romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -44384,12 +44334,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -44447,33 +44391,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son père adoptif", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "faire construire un théâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un autre terrain proche du temple d' Apollon", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "faire construire un théâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -44507,12 +44458,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -44569,33 +44514,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un autre terrain proche du temple d' Apollon", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le projet de son père adoptif.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -44629,12 +44581,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -44709,42 +44655,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le roi Cécrops", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le roi Cécrops", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "roi Cécrops", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi Cécrops", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le roi Cécrops", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le roi Cécrops", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le roi Cécrops.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -44838,33 +44785,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en Occident", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Occident", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rome", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Occident", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en Occident", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rome", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "à Rome", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 } }, "human_annot": { @@ -44898,12 +44852,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -44979,33 +44927,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "empire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "empire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' empire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' Empire perse achéménide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' empire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' Empire perse achéménide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' Empire perse achéménide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -45021,12 +44976,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_9", @@ -45107,33 +45056,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Constantinople", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Constantinople", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Constantinople", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "capitale de l' Empire romain.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Byzance fut une ville grecque avant de devenir, sous le nom de Constantinople, la capitale de l' Empire romain.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Constantinople", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "sous le nom de Constantinople la capitale de l' Empire romain.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -45155,12 +45111,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -45230,33 +45180,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Empire romain", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Empire romain", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Empire romain", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Empire romain.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Empire romain", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": " Empire romain", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Byzance est devenue la capitale de l'Empire romain.", - "rougeL": 0.36363636363636365 + "rougeL": 0.36363636363636365, + "HScore": 1.0 } }, "human_annot": { @@ -45278,12 +45235,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -45359,33 +45310,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "flore et faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -45413,12 +45371,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -45481,33 +45433,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la 
flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la flore et la faune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la flore et la faune.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -45535,12 +45494,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -45609,33 +45562,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pour venger Pompée.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -45663,12 +45623,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -45731,33 +45685,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "venger Pompée", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -45797,12 +45758,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -45859,33 +45814,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une nuée de flèches tirées par-dessus l' épaule", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une nuée de flèches tirées par-dessus l' épaule", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nuée de flèches tirées par-dessus l' épaule", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une nuée de flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une nuée de flèches tirées par-dessus l' épaule.", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une nuée de flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une nuée de flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -45901,12 +45863,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -45988,33 +45944,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des documents administratifs et des listes qui ne nous apprennent rien sur les événements historiques", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "difficiles à interpréter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "difficiles à interpréter : ce sont des documents administratifs et des listes qui ne nous apprennent rien sur les événements historiques", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "difficiles à interpréter : ce sont des documents administratifs et des listes qui ne nous apprennent rien sur les événements historiques", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "difficiles à interpréter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "difficultes à interpréter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "difficiles à interpréter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -46048,12 +46011,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -46112,33 +46069,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "documents administratifs et des listes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "documents administratifs et des listes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "écriture se développe, mais les textes écrits à cette époque sont difficiles à interpréter : ce sont des documents administratifs et des listes qui ne nous apprennent rien sur les événements historiques", - "rougeL": 
0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des documents administratifs et des listes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "écriture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "documents administratifs et des listes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Il n'y a pas de mention des médias utilisés à cette époque dans l'article.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -46172,12 +46136,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -46242,33 +46200,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -46308,12 +46273,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -46368,33 +46327,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -46422,12 +46388,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -46500,33 +46460,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vercingétorix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vercingétorix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vercingétorix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vercingétorix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vercingétorix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vercingétorix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Vercingétorix décide", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -46554,12 +46521,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -46628,33 +46589,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -46694,12 +46662,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -46750,33 +46712,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Gaulois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -46798,12 +46767,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -46878,33 +46841,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Après un long siège, lors duquel la population gauloise d' Alésia est privée de 
nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains,", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5925925925925926 + "rougeL": 0.5925925925925926, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "après un long siège, lors duquel la population gauloise d' Alésia est privée de nourriture à cause de l' encerclement fait par les Romains", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' encerclement fait par les Romains", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -46938,12 +46908,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -47006,33 +46970,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fabrication de la bière", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "fabrication de la bière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -47048,12 +47019,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -47128,33 +47093,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la bière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -47188,12 +47160,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -47268,42 +47234,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "jardins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "jardins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "jardins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ces jardins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "jardins suspendus", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jardins suspendus de Sémiramis", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les jardins suspendus", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -47396,33 +47363,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "venger Pompée.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -47456,12 +47430,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -47524,33 +47492,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rassembler toutes les 
tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -47584,12 +47559,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -47646,33 +47615,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": 
"rassembler toutes les tribus gauloises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -47706,12 +47682,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -47774,33 +47744,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "celle de la mythologie grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -47840,12 +47817,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -47896,33 +47867,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grecque", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grecque", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "grecque", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, 
"mixtral-8x7b": { "answer_pred": "mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mythologie grecque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -47968,12 +47946,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -48036,33 +48008,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -48096,12 +48075,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -48165,33 +48138,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Grand Roi", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Grand Roi", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 
}, "mixtral-8x7b": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -48231,12 +48211,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -48287,33 +48261,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le « Grand Roi »", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi des Perses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -48341,12 +48322,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -48427,33 +48402,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les ressources indispensables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -48481,12 +48463,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -48561,33 +48537,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "métaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "métaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "métaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "métaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "métaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "métaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "métaux indispensables", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -48621,12 +48604,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -48698,33 +48675,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois saisons et une préquelle", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "série sur Spartacus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "série sur Spartacus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "série sur Spartacus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Spartacus. Fiction. Le personnage de Spartacus a été porté à l' écran, dans un péplum hollywoodien. Ce film a été réalisé par Stanley Kubrick. C' est l' acteur Kirk Douglas qui interprétait ce gladiateur rebelle. Steven S. DeKnight a aussi crée une série sur Spartacus. Elle contient 39 épisodes en trois saisons et une préquelle. Sources. https://fr.wikipedia.org/wiki/Spartacus_%28s%C3%A9rie_t%C3%A9l%C3%A9vis%C3%A9e%29. https://fr.wikipedia.org/wiki/Spartacus. http://www.cosmovisions.com/Spartacus.htm. http://www.larousse.fr/encycloped", - "rougeL": 0.07874015748031496 + "rougeL": 0.07874015748031496, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "une série sur Spartacus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La série sur Spartacus.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -48758,12 +48742,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -48844,33 +48822,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Abbas Ibn Firnas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Abbas Ibn Firnas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Abbas Ibn Firnas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Abbas Ibn Firnas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Abbas Ibn Firnas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Abbas Ibn Firnas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Abbas Ibn Firnas", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -48904,12 +48889,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -48984,42 +48963,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "femme étrusque", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -49124,42 +49104,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "titre des anciens empereurs allemands (Kaiser) et des anciens souverains de la Bulgarie (de 893 à 1422 et de 1908 à 1946) et de la Serbie (de 1346 à 1371) et de la Russi", - "rougeL": 0.0625 + "rougeL": 0.0625, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les empereurs romains", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "du titre des anciens empereurs allemands (Kaiser)", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "du titre de « César »", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -49252,33 +49233,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mort du héro", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la mort du héro", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la mort du héro", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "après la mort du héro.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -49294,12 +49282,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -49374,33 +49356,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "héro", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "héro", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Héraclès", - "rougeL": 0.28571428571428575 + 
"rougeL": 0.28571428571428575, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Après la mort du héros Héraclès.", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -49410,12 +49399,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -49502,33 +49485,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "gladiateurs et de bêtes sauvages", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "gladiateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "gladiateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "gladiateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "gladiateurs et de bêtes sauvages", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "gladiateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les combats de gladiateurs et de bêtes sauvages.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -49550,12 +49540,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -49630,42 +49614,43 @@ } ], "predictions": { - "MT5-large_260_AP0": 
{ + "MT5-large": { "answer_pred": "Pompée et du Sénat romain", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pompée et du Sénat romain", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pompée et du Sénat romain", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pompée et du Sénat romain", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Second triumvirat", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "rapprochement de Pompée et du Sénat romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le rapprochement de Pompée et du Sénat romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -49754,33 +49739,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mort de Crassus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mort de Crassus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ce triumvirat prend fin à la mort de Crassus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "rapprochement de Pompée et du Sénat romain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "mort de Crassus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la mort de Crassus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la mort de 
Crassus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -49826,12 +49818,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -49884,33 +49870,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "faire périr l' humanité en la noyant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le dieu Enki intervient à chaque fois pour sauver les hommes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "faire périr l' humanité en la noyant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "leurs créatures, les hommes deviennent trop nombreux et qu' ils font trop de bruits.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "pour faire disparaître les hommes", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Les dieux considèrent que leurs créatures, les hommes deviennent trop nombreux et qu' ils font trop de bruits.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "faire périr l'humanité en la noyant", - "rougeL": 0.7999999999999999 + "rougeL": 0.7999999999999999, + "HScore": 0.5 } }, "human_annot": { @@ -49926,12 +49919,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -50008,33 +49995,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "faire périr l' humanité en la noyant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "faire périr l' humanité en la noyant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "faire périr l' humanité en la noyant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "faire périr l' humanité en la noyant.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "périr l'humanité en la noyant", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "faire périr l' humanité en la noyant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "faire périr l'humanité en la noyant.", - "rougeL": 0.7999999999999999 + "rougeL": 0.7999999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -50074,12 +50068,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -50138,33 +50126,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "En Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "En Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "En Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "En Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Egypte et en Mésopotamie (l' Irak)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "En Egypte et en Mésopotamie (l' Irak)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En Egypte et en Mésopotamie.", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 
} }, "human_annot": { @@ -50174,12 +50169,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -50260,33 +50249,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "En Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "En Egypte et en Mésopotamie", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Egypte et en Mésopotamie (l' Irak)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Egypte et Mésopotamie (l'Irak)", - "rougeL": 0.7368421052631577 + "rougeL": 0.7368421052631577, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En Egypte et en Mésopotamie (l' Irak)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -50332,12 +50328,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -50388,42 +50378,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "César", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "César", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "César", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "62 av. 
J.-C.,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "César", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il est élu préteur", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "il", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -50528,33 +50519,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "roi Hammourabi", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le roi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -50576,12 +50574,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -50657,33 +50649,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Une coalition des peuples résidant en Gaule", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -50723,12 +50722,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -50785,33 +50778,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -50845,12 +50845,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -50907,33 +50901,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rotation", - "rougeL": 0.6666666666666666 + 
"rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "rotation", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "rotation", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "rotation", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un mouvement de rotation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -50979,12 +50980,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -51035,33 +51030,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trancher le nud gordien", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trancher le nud gordien", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { 
@@ -51083,12 +51085,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -51157,33 +51153,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trancher le nud gordien", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trancher le nud gordien", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "trancher le nœud gordien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -51205,12 +51208,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -51297,33 +51294,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "province d' Afrique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "province d' Afrique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Galba Galba", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Othon", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, 
"mixtral-8x7b": { "answer_pred": "Galba", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Néron l'a envoyé en Taraconaise.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -51339,12 +51343,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -51425,33 +51423,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "des verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une horloge à eau, une machine à couper les pierres et une sorte de métronome.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "machine à voler, verres de lecture, horloge à eau, machine à couper les pierres, métronome, verre à quartz", - "rougeL": 0.7826086956521738 + "rougeL": 0.7826086956521738, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Abbas Ibn Firnas a inventé des verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -51485,12 +51490,6 @@ "rating": 
"Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -51547,33 +51546,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "verres de lecture, horloge à eau, machine à couper les pierres, métronome, verre à quartz", - "rougeL": 0.8571428571428572 + "rougeL": 0.8571428571428572, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des verres de lecture, une horloge à eau, une machine à couper les pierres et une sorte de métronome, le verre à quartz.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 } }, "human_annot": { @@ -51583,12 +51589,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -51687,33 +51687,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les âmes des Justes", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les âmes des Justes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les âmes des Justes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les âmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -51759,12 +51766,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -51816,33 +51817,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en Anatolie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Anatolie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Anatolie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Anatolie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en Anatolie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en Anatolie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Anatolie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -51858,12 +51866,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -51945,33 +51947,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": 
{ "answer_pred": "Veni vidi vici", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Veni vidi vici", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Veni vidi vici", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Veni vidi vici", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Veni vidi vici", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Veni vidi vici", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Veni vidi vici", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -52011,12 +52020,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -52073,33 +52076,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le roi doit agir pour le bien de ses sujets. Il garantit la paix et la justice, pour cela il édicte des lois et dispose d' une armée. 
Il doit favoriser le bonheur matériel, en ordonnant les grands travaux nécessaires (surtout dans un pays dépendant de l' irrigation) ou en annulant les dettes.", - "rougeL": 0.1111111111111111 + "rougeL": 0.1111111111111111, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "agit pour le bien de ses sujets", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le roi agit pour le bien de ses sujets.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -52109,12 +52119,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -52195,33 +52199,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "pour le bien de ses sujets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -52249,12 +52260,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -52323,33 +52328,40 @@ } ], "predictions": 
{ - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' agriculture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "plateaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une région de plateaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'agriculture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -52395,12 +52407,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -52451,42 +52457,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "il choisit ses soldats et ne les achète pas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Il refuse de donner l' argent qu' il avait promis aux prétoriens en disant qu' il choisit ses soldats et ne les achète pas", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il choisit ses soldats et ne les achète pas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il choisit ses soldats et ne les achète pas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il les achète", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "qu' il choisit ses soldats et ne les achète pas", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il choisit ses soldats et ne les achète pas.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -52580,33 +52587,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -52646,12 +52660,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -52702,33 +52710,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vinius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Vinius", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -52762,12 +52777,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -52830,33 +52839,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -52890,12 +52906,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -52952,33 +52962,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "héro", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": 
"Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Héraclès", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -53000,12 +53017,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -53092,33 +53103,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "titre des anciens souverains de la Bulgarie (de 893 à 1422 et de 1908 à 1946) et de la Serbie (de 1346 à 1371) et de la Russie (de 1472 à 1917)", - "rougeL": 0.0689655172413793 + "rougeL": 0.0689655172413793, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Les empereurs romains", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "César", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le titre des anciens empereurs allemands (Kaiser)", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 0.5 } }, "human_annot": { @@ -53158,12 +53176,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -53226,42 +53238,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "titre des anciens souverains de la Bulgarie (de 893 à 1422 et de 1908 à 1946) de la 
Serbie (de 1346 à 1371) et de la Russie (de 1472 à 1917)", - "rougeL": 0.0689655172413793 + "rougeL": 0.0689655172413793, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les empereurs romains portèrent le titre de « César »", - "rougeL": 0.631578947368421 + "rougeL": 0.631578947368421, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "(Kaiser)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Kaiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "César", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le titre des anciens empereurs allemands est à l'origine du titre des empereurs romains qui portaient le titre de « César ».", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -53354,33 +53367,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Tigellin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Tigellin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tigellin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tigellin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tigellin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Tigellin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Tigellin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -53414,12 
+53434,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -53483,33 +53497,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trouver une solution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trouver une solution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de trouver une solution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de trouver une solution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une solution", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "de trouver une solution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Elle demande à Enki de trouver une solution.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -53537,12 +53558,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -53605,33 +53620,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son fils ou son petit-fils", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une solution", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une solution", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une solution", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une 
solution", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une solution", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une solution.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -53665,12 +53687,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -53751,33 +53767,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son verdict", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "verdict", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son verdict", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' énergie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "s'il lève le pouce, le blessé a la vie sauve. 
s' il l' abaisse, c' est la mort.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "son verdict", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son verdict.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -53799,12 +53822,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -53879,42 +53896,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jules César", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -54019,33 +54037,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rideau principal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "aulaeum", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aulaeum", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "rideaux de scène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": 
"aulaeum", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Aulaeum", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le rideau principal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -54085,12 +54110,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -54147,33 +54166,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quatre fluides", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quatre fluides", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "humeurs", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "humeurs", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "quatre fluides", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "quatre fluides", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les humeurs", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -54207,12 +54233,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -54269,33 +54289,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quatre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quatre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quatre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "quatre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 
}, "llama-2_lora": { "answer_pred": "quatre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "quatre fluides", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "quatre fluides", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -54335,12 +54362,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -54397,33 +54418,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "avant la construction d' un grand barrage sur l' Euphrate", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au cours des années 1990", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au cours des années 1990, juste avant la construction d' un grand barrage sur l' Euphrate", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "juste avant la construction d' un grand barrage sur l' Euphrate,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "au cours des années 1990", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "au cours des années 1990", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "juste avant la construction d'un grand barrage sur l'Euphrate", - "rougeL": 0.631578947368421 + "rougeL": 0.631578947368421, + "HScore": 1.0 } }, "human_annot": { @@ -54433,12 +54461,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -54520,33 +54542,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "construction d' un grand barrage sur l' 
Euphrate", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "construction d' un grand barrage sur l' Euphrate", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "construction d' un grand barrage sur l' Euphrate", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "construction d' un grand barrage sur l' Euphrate,", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "construction d' un grand barrage sur l' Euphrate", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la construction d' un grand barrage sur l' Euphrate", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la construction d'un grand barrage sur l'Euphrate", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 } }, "human_annot": { @@ -54580,12 +54609,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -54661,33 +54684,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ils sont valets d' armes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Spartiates", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les citoyens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "L' hilote", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Spartiates", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": 
{ "answer_pred": "Il est très exceptionnel qu' on leur demande de combattre (cela se fera pendant la guerre du Péloponnèse qui oppose Sparte à Athènes dans la seconde moitié du Ve siècle av. J.-C.)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "on leur demande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -54715,12 +54745,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -54789,33 +54813,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jésus partit", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une colombe descendit du ciel", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Plus tard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean-Baptiste fut arrêté.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -54855,12 +54886,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -54917,33 +54942,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "après la mort du 
héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "après la mort du héro.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -54977,12 +55009,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -55045,33 +55071,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une des épreuves des Jeux olympiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "épreuves des Jeux olympiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une des épreuves des Jeux olympiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une des épreuves des Jeux olympiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une épreuve des Jeux olympiques", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' une des épreuves des Jeux olympiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une des épreuves des Jeux olympiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -55105,12 +55138,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -55173,33 +55200,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un morceau de bois d' environ 1,60 m.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -55245,12 +55279,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -55295,33 +55323,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un morceau de bois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "un morceau de bois d' environ 1,60 m.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -55361,12 +55396,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -55423,33 +55452,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "encercle sur un oppidum (un camp militaire sur colline), à Alésia", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "armées de Jules César", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son père", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "une victoire importante", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une victoire importante contre les Romains lors de la bataille de Gergovie", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "bataille de Gergovie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Vercingétorix perd contre les Romains.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -55477,12 +55513,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -55553,33 +55583,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hammurabi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de l' époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, "GPT-3.5": { "answer_pred": "époque d'Hammurabi", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -55607,12 +55644,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -55675,33 +55706,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "de l' époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de l' époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de l' époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "d' Hammurabi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "époque d' Hammurabi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "époque d'Hammurabi", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -55741,12 +55779,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -55803,33 +55835,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les « îles Cassitérides »", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 
0.5 }, "Camembert_baseline": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "les îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "aux « îles Cassitérides » (probablement les îles Scilly au large de la Cornouaille britannique)", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -55869,12 +55908,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -55922,33 +55955,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cassitérides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.95 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.95 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.95 }, "Camembert_baseline": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.95 }, "llama-2_lora": { "answer_pred": "les îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.95 }, "mixtral-8x7b": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.95 }, "GPT-3.5": { "answer_pred": "les îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.95 } }, "human_annot": { @@ -56162,48 +56202,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - 
{ - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -56365,33 +56363,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les « îles Cassitérides »", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les « îles Cassitérides", - "rougeL": 0.14285714285714288 + "rougeL": 0.14285714285714288, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les îles Cassitérides (probablement les îles Scilly au large de la Cornouaille britannique)", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 } }, "human_annot": { @@ -56437,12 +56442,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -56502,33 +56501,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fortes variations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "très fortes pluies et de sécheresses prolongées", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "alternance de très fortes pluies et de sécheresses prolongées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "changements climatiques. À cette époque, les sédiments montrent une alternance de très fortes pluies et de sécheresses prolongées.", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par des variations climatiques", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "forces variations", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "par des variations climatiques", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -56544,12 +56550,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -56630,33 +56630,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "3500 ans avant Jésus-Christ", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "3500 ans avant Jésus-Christ", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "environ 3500 ans avant Jésus-Christ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "3500 ans avant Jésus-Christ,", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, 
+ "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "environ 3500 ans avant Jésus-Christ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "environ 3500 ans avant Jésus-Christ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "environ 3500 ans avant Jésus-Christ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -56672,12 +56679,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -56753,33 +56754,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "3500 ans avant Jésus-Christ", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "3500 ans avant Jésus-Christ", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "3500 ans avant Jésus-Christ", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "3500 ans avant Jésus-Christ,", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "environ 3500 ans avant Jésus-Christ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "environ 3500 ans avant Jésus-Christ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "environ 3500 ans avant Jésus-Christ", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -56813,12 +56821,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -56894,33 +56896,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' alphabet étrusque est dérivé de l' alphabet grec", 
- "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "alphabet étrusque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' alphabet étrusque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' alphabet étrusque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' alphabet grec", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l'alphabet étrusque", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'alphabet étrusque", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -56930,12 +56939,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -57035,33 +57038,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "arme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "arme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "arme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' arme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le javelot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' arme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'arme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -57107,12 +57117,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_3", @@ -57175,33 +57179,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les cavaliers", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les cavaliers", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "les cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -57217,12 +57228,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -57312,33 +57317,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les cavaliers", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les cavaliers parthes l' attaquent grâce à une nuée de flèches tirées par-dessus l' épaule", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "parthes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "parthes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": 
"les cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -57384,12 +57396,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -57443,33 +57449,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -57503,12 +57516,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -57565,33 +57572,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "venger Pompée", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "venger Pompée.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -57625,12 +57639,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -57693,33 +57701,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -57753,12 +57768,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -57815,33 +57824,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": 
"Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jules César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -57875,12 +57891,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -57943,33 +57953,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -57979,12 +57996,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -58065,33 +58076,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une bibliothèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -58101,12 +58119,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -58193,33 +58205,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "comme un oiseau afin de battre des ailes", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "comme un oiseau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "comme un oiseau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "comme un oiseau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "comme un oiseau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "comme un oiseau afin de battre des ailes", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "comme un oiseau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -58265,12 +58284,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -58316,33 +58329,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des ailes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bouger ses bras comme un oiseau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bouger", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "des ailes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "comme un oiseau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "comme un oiseau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "comme un oiseau.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -58352,12 +58372,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -58445,33 +58459,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à la même date que la mort de Crassus en 53 av. J.C", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à la même date", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -58505,12 +58526,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -58568,33 +58583,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "53 av. 
J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 53 av. J.C", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -58622,12 +58644,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -58697,33 +58713,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Auguste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Auguste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Auguste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Auguste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Auguste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Auguste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Auguste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -58757,12 +58780,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -58837,33 +58854,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sant Martí d' Empúries", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "Paléopolis", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ancien îlot est en totalité occupé par le village de Sant Mart d' Empries", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La Paléopolis", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Paléopolis", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' ancien îlot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "à Sant Martí d' Empúries", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 } }, "human_annot": { @@ -58909,12 +58933,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -58966,33 +58984,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un passages latéraux couverts", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "deux passages latéraux couverts (un aditus maximus)", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 } }, "human_annot": { @@ -59032,12 +59057,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -59088,33 +59107,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux passages latéraux couverts", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "on utilise deux passages latéraux couverts", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "par deux passages latéraux couverts (un aditus maximus) situés de part et d'autre de l' orchestra.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "deux passages latéraux couverts (un aditus maximus) situés de part et d' autre de l' orchestra", - "rougeL": 0.4210526315789474 + "rougeL": 0.4210526315789474, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "on utilise deux passages latéraux couverts.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 } }, "human_annot": { @@ -59136,12 +59162,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -59228,33 +59248,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Nymphidius Sabinus", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Nymphidius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": 
"Nymphidius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Nymphidius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Nymphidius Sabinus", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Nymphidius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Nymphidius Sabinus", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 } }, "human_annot": { @@ -59276,12 +59303,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -59356,33 +59377,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "caractère monumental", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un caractère monumental", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un caractère monumental", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un caractère monumental", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un caractère monumental", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un caractère monumental", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un caractère monumental", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -59416,12 +59444,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -59478,33 +59500,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "monumental", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "monumental", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "monumental", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "monumental", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "monumental", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "monumental", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "monumental", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -59544,12 +59573,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -59606,33 +59629,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour ses sujets", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -59660,12 +59690,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -59734,33 +59758,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans le courant du - IIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "- IIe siècle", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "- IIe siècle", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "courant du - IIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "- IIe siècle", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "- IIe siècle", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "dans le courant du - IIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -59800,12 +59831,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -59862,33 +59887,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cassitérides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les îles Cassitérides", - "rougeL": 
0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les \"îles Cassitérides\" (probablement les îles Scilly au large de la Cornouaille britannique).", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 } }, "human_annot": { @@ -59928,12 +59960,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -59987,42 +60013,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Péloponnèse. 
Ils ont dû quitter le Péloponnèse", - "rougeL": 0.5882352941176471 + "rougeL": 0.5882352941176471, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "du Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ils ont dû quitter le Péloponnèse après la mort du héros.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -60109,33 +60136,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Péloponnèse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -60175,12 +60209,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -60237,33 +60265,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les « îles Cassitérides »", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 
0.15384615384615385, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "océan Atlantique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "les îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les « îles Cassitérides » (probablement les îles Scilly au large de la Cornouaille britannique)", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -60297,12 +60332,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -60356,33 +60385,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cassitérides", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les « îles Cassitérides", - "rougeL": 0.14285714285714288 + "rougeL": 0.14285714285714288, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "îles Cassitérides", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les \"îles Cassitérides\" (probablement 
les îles Scilly au large de la Cornouaille britannique)", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 } }, "human_annot": { @@ -60410,12 +60446,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -60481,33 +60511,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Arthur Evans (1851 - 1941)", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' archéologue britannique Arthur Evans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'archéologue britannique Arthur Evans", - "rougeL": 0.8888888888888888 + "rougeL": 0.8888888888888888, + "HScore": 1.0 } }, "human_annot": { @@ -60547,12 +60584,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -60603,33 +60634,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 
1.0 }, "llama-2_lora": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Arthur Evans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -60669,12 +60707,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -60731,33 +60763,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "après la mort du héro", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -60791,12 +60830,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -60871,33 +60904,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les cavaliers", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les cavaliers", - "rougeL": 0.4 + "rougeL": 0.4, + 
"HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "les cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -60925,12 +60965,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -61008,33 +61042,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les cavaliers", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les cavaliers parthes l' attaquent grâce à une nuée de flèches tirées par-dessus l' épaule", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "parthes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les cavaliers parthes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -61044,12 +61085,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -61139,33 +61174,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "On a longtemps pensé que les vestiges", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "archéologue", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "On n'a pas continué les fouilles de l'Odéon.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -61187,12 +61229,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -61267,33 +61303,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son père adoptif", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jules César envisage de faire construire un théâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "faire construire un théâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le projet de son père adoptif", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -61333,12 +61376,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_7", @@ -61407,33 +61444,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "métaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Les ressources indispensables.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -61473,12 +61517,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -61535,33 +61573,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Rome.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -61595,12 +61640,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -61658,33 +61697,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Galba est arrivé à Rome.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -61718,12 +61764,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -61792,33 +61832,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la maison de Muzalar", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "maison de Muzalar", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "site dit de la maison de Muzalar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "site dit de la maison de Muzalar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans les fouilles du site dit de la maison de Muzalar", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "site dit de la maison de Muzalar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "site dit de la 
maison de Muzalar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -61840,12 +61887,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -61920,33 +61961,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Dans la grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -61986,12 +62034,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -62043,33 +62085,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Denisova", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Denisova", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Denisova", - "rougeL": 
0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la grotte de Denisova", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -62109,12 +62158,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -62172,33 +62215,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -62232,12 +62282,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -62296,33 +62340,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "S. 
Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -62362,12 +62413,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -62426,33 +62471,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "50300 ans (à 2200 ans près)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"datant de 50300 ans\".", - "rougeL": 0.39999999999999997 + "rougeL": 0.39999999999999997, + "HScore": 0.5 } }, "human_annot": { @@ -62486,12 +62538,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -62548,33 +62594,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "50300 ans", - "rougeL": 0.6 + 
"rougeL": 0.6, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "50300 ans", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "50300 ans (à 2200 ans près)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "datant de 50300 ans", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 } }, "human_annot": { @@ -62620,12 +62673,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -62676,33 +62723,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "presque toujours", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "presque toujours", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "toujours", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "presque toujours", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "depuis les sociétés agricoles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Testart propose de considérer la piste de l' ostentation, nous faisant découvrir par là que la richesse n' est pas l' apanage des sociétés agricoles.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la richesse existe presque toujours", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + 
"HScore": 1.0 } }, "human_annot": { @@ -62718,12 +62772,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -62804,33 +62852,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Maximin Ier le Thrace", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Maximin Ier le Thrace", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Maximin Ier le Thrace", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' empire romain)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Maximin Ier le Thrace", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Maximin Ier le Thrace", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Alexandre Sévère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -62852,12 +62907,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -62929,33 +62978,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Maximin Ier le Thrace", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Alexandre Sévère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 
0.0 }, "llama-2_lora": { "answer_pred": "une colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -62965,12 +63021,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -63054,33 +63104,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chasseurs - cueilleurs en voie de sédentarisation", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des chasseurs - cueilleurs en voie de sédentarisation", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "se fixer dans cette plaine froide et aride entourée de montagnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de se fixer dans cette plaine froide et aride entourée de montagnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chasseurs - cueilleurs en voie de sédentarisation", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "se fixer dans cette plaine froide et aride entourée de montagnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "se fixer", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 0.5 } }, "human_annot": { @@ -63090,12 +63147,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -63182,33 +63233,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "connaissance, roule contre un éboulis et meurt", - "rougeL": 
0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "connaissance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "connaissance, roule contre un éboulis et meurt", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le tigre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "connaissance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "conscience", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "conscience", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -63236,12 +63294,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -63310,33 +63362,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "culture, l' économie et la vie des Européens.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le passage des populations préhistoriques de la prédation à la production a influencé la culture, l'économie et la 
vie des Européens", - "rougeL": 0.31578947368421056 + "rougeL": 0.31578947368421056, + "HScore": 1.0 } }, "human_annot": { @@ -63358,12 +63417,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -63432,33 +63485,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "culture, l' économie et la vie des Européens.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "culture, l' économie et la vie des Européens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la culture, l'économie et la vie des Européens", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -63504,12 +63564,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -63560,33 +63614,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fondation britannique Leverhulme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Thibaut 
Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "fondation britannique Leverhulme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la fondation britannique Leverhulme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -63602,12 +63663,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -63683,33 +63738,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Thibaut Devièse", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -63743,12 +63805,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - 
"rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -63824,42 +63880,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ils ont ainsi pu chasser une plus grande variété d' animaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "membres de la lignée humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "membres de la lignée humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "membres de la lignée humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les membres de la lignée humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ils", - "rougeL": 0 + "rougeL": 0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les membres de la lignée humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -63953,33 +64010,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa famille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sa famille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sa famille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "sa famille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sa famille, qui est aisée", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "sa famille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "sa famille", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -64019,12 +64083,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -64087,33 +64145,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1630.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à partir de 1630", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -64153,12 +64218,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -64209,33 +64268,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1630.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1630", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à partir de 1630", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à partir de 1630", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -64275,12 +64341,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -64349,33 +64409,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "il perd connaissance, roule contre un éboulis et meurt", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Australopithèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' Australopithèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Australopithèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l' Australopithèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' Australopithèque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'Australopithèque", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -64409,12 +64476,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -64477,33 +64538,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "au milieu du ve siècle avant notre ère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour venir s' installer dans le Sud de la Botte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour venir s' installer dans le Sud de la Botte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une partie des Samnites ont quitté l' Italie centrale pour venir s' installer dans le Sud 
de la Botte", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "parce qu' au milieu du ve siècle avant notre ère, une partie des Samnites ont quitté l' Italie centrale pour venir s' installer dans le Sud de la Botte", - "rougeL": 0.34782608695652173 + "rougeL": 0.34782608695652173, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "pour venir s' installer dans le Sud de la Botte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour venir s'installer dans le Sud de la Botte.", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -64525,12 +64593,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -64599,33 +64661,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "au milieu du ve siècle avant notre ère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour venir s' installer dans le Sud de la Botte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour venir s' installer dans le Sud de la Botte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une partie des Samnites ont quitté l' Italie centrale pour venir s' installer dans le Sud de la Botte", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour venir s'installer dans le Sud de la Botte", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour venir s' installer dans le Sud de la Botte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour venir s'installer dans le Sud de la Botte", - "rougeL": 
0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -64659,12 +64728,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -64727,33 +64790,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "carbone à l'état de trace", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -64781,12 +64851,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -64855,33 +64919,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "poissons et coquillages", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "protéines animales", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "poissons et coquillages", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "(poissons et coquillages compris)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "poissons et coquillages", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, 
"mixtral-8x7b": { "answer_pred": "poissons et coquillages", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les restes laissés par les grands carnivores", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -64897,12 +64968,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -64983,33 +65048,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tout le contraire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Tout le contraire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tout le contraire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tout le contraire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tout le contraire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Tout le contraire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Tout le contraire.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -65037,12 +65109,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -65108,33 +65174,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "enfants", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les enfants", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les enfants", - "rougeL": 0.6666666666666666 + "rougeL": 
0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les enfants", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les enfants", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les enfants", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les enfants", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -65162,12 +65235,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -65236,42 +65303,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "59,3 tonnes,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "59.3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -65358,42 +65426,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "59,3", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "59,3", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, 
- "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "59,3 tonnes,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "59.3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "59,3 tonnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -65486,33 +65555,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trop d' énergie et de nourriture pour sa mère qui l' élève seule", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trop d' énergie et de nourriture pour sa mère qui l' élève seule", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trop d' énergie et de nourriture pour sa mère qui l' élève seule", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "trop d'énergie et de nourriture", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -65528,12 +65604,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], 
"llama-2_lora": [ { "annot": "annot_4", @@ -65614,33 +65684,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -65662,12 +65739,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -65737,33 +65808,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -65809,12 +65887,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -65866,33 +65938,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "autres membres du clan", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les autres membres du clan", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les chasseurs tuent les animaux.", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -65920,12 +65999,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -65988,33 +66061,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à piéger, à tuer ou à charogner de gros animaux", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "piéger, à tuer ou à charogner de gros animaux", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "gros 
animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "gros animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -66036,12 +66116,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -66116,42 +66190,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les femmes et les vieillards", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -66246,33 +66321,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { 
+ "FLAN-T5-large": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -66282,12 +66364,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -66389,33 +66465,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "maisons en terre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les premières maisons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "premières maisons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "premières maisons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "maisons en terre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "les premières maisons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les premières maisons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -66443,12 +66526,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -66518,33 +66595,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "restes d' animaux", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des lances en bois et des restes d'animaux", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 1.0 } }, "human_annot": { @@ -66584,12 +66668,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -66642,33 +66720,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "lances en bois et des restes d' animaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "des lances en bois et des restes d'animaux", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 1.0 } }, "human_annot": { @@ -66678,12 +66763,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -66772,33 +66851,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "troisième", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "troisième", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "troisième", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "troisième", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "troisième", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "troisième", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le troisième vase carolingien", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 } }, "human_annot": { @@ -66838,12 +66924,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -66901,33 +66981,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' 
Europe.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour peupler l'Asie du Sud-Est et l'Europe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -66961,12 +67048,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -67023,33 +67104,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "peupler l' Asie du Sud-Est et l' Europe.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour peupler l'Asie du Sud-Est et l'Europe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour peupler l'Asie du Sud-Est et l'Europe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -67059,12 +67147,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -67151,33 +67233,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plus haut dans la vallée de l' Hérault", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vallée de l' Hérault", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus haut dans la vallée de l' Hérault", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "vallée de l' Hérault,", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "val de Gellone", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "un lieu entouré de spectaculaires falaises et surmonté d' un château fort perché sur un piton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "dans le val de Gellone", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -67217,12 +67306,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -67279,33 +67362,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "en Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Grotte Chauvet", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "en Europe, en Espagne, en 
Ardèche (Grotte Chauvet), sur l'île de Sulewesi", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -67315,12 +67405,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -67401,33 +67485,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Ardèche", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "En France (Ardèche) et en Espagne.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -67437,12 +67528,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -67529,33 +67614,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -67595,12 +67687,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -67653,33 +67739,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "moléculaires", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des marqueurs moléculaires fabriqués à partir d' ADN mitochondrial humain", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "marqueurs moléculaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des marqueurs moléculaires fabriqués à partir d'ADN mitochondrial humain.", - "rougeL": 0.4210526315789474 + "rougeL": 0.4210526315789474, + "HScore": 0.5 } }, "human_annot": { @@ -67725,12 +67818,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -67783,33 +67870,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tambo", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tambo", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -67837,12 +67931,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -67905,33 +67993,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tambo", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tambo", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -67977,12 +68072,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": 
"annot_5", @@ -68027,33 +68116,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tambo", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Tambo", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le projet européen Tambo.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -68081,12 +68177,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -68155,42 +68245,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "différemment", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "différemment", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "différemment", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "différemment", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ils agiront différemment", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les mêmes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "ils agiront différemment.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { 
- "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -68283,33 +68374,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par l' archerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "expression ostentatoire du statut", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans l' expression ostentatoire", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -68343,12 +68441,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -68405,33 +68497,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ostentatoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' expression ostentatoire de leur statut", - "rougeL": 0.7499999999999999 + 
"rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans l' expression ostentatoire", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -68471,12 +68570,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -68533,33 +68626,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une analyse de l'os coxal.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -68773,48 +68873,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": 
"Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -68979,33 +69037,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "os coxal", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "os coxal", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de l' os coxal", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "analyse de l' os coxal", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une analyse de l'os coxal", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -69033,12 +69098,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -69107,33 +69166,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petits thermes privés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "thermes privés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "petits thermes privés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "petits thermes privés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des thermes privés", - 
"rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "petits thermes privés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les petits thermes privés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -69167,12 +69233,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -69235,33 +69295,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "connaissance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "connaissance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "connaissance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les doigts de sa main droite se crispent autour du pouce", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "connaissance", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "conscience", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le australopithèque a perdu connaissance", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -69307,12 +69374,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -69363,33 +69424,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "notre devoir d' en assumer la responsabilité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Nous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Nous", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Nous pouvons faire des choix concernant notre comportement.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "nous pouvons faire des choix concernant notre comportement", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "nous, Homo sapiens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -69399,12 +69467,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -69505,33 +69567,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ces somptueux textiles servaient à envelopper les corps de défunts notables", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ces somptueux textiles servaient à envelopper les corps de défunts notables", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "textiles", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sommptueux textiles", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -69559,12 +69628,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_4", @@ -69643,33 +69706,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ces somptueux textiles servaient à envelopper les corps de défunts notables", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ces somptueux textiles servaient à envelopper les corps de défunts notables", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ces somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "somptueux textiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -69685,12 +69755,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -69775,33 +69839,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Guillaume", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Guillaume", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Guillaume", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Guillaume", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Guillaume", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Guillaume", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, "GPT-3.5": { "answer_pred": "Guillaume", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -69835,12 +69906,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -69903,33 +69968,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Université de Wroclaw", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "projet européen Tambo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Józef Szykulski dirige le projet européen Tambo.", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -69957,12 +70029,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -70031,33 +70097,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sémite d' éleveurs nomades", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "éleveurs nomades", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "éleveurs nomades", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": 
"peuple sémite d' éleveurs nomades,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "amorrite", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "un peuple sémite d' éleveurs nomades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les éleveurs nomades", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -70091,12 +70164,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -70153,33 +70220,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "peuple sémite d' éleveurs nomades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "éleveurs nomades", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "éleveurs nomades", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "peuple sémite d' éleveurs nomades,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "un peuple sémite d' éleveurs nomades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un peuple sémite d'éleveurs nomades", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un peuple sémite d' éleveurs nomades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -70195,12 +70269,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -70281,33 +70349,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sous l' eau", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sous l' eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sous l' eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sous l' eau.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "eaux profondes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' eau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sous l'eau.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -70347,12 +70422,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -70403,33 +70472,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sous l' eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sous l' eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sous l' eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sous l' eau.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sous l'eau", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sous l' eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sous l'eau.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -70463,12 +70539,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -70531,33 +70601,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "une météorite", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "météorite", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "météorite", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "météorite", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "environ 100 km de diamètre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "une météorite", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une météorite", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -70597,12 +70674,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -70659,33 +70730,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une partie des Samnites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une partie des Samnites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une partie des Samnites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une partie des Samnites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une partie des Samnites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une partie des Samnites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une partie des Samnites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -70719,12 +70797,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" 
- } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -70781,33 +70853,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Samnites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Samnites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Samnites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Samnites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Samnites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Samnites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Samnites", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -70817,12 +70896,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -70909,33 +70982,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "S. 
Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "S. Venault et ses collègues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -70975,12 +71055,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -71039,33 +71113,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "génome néandertalien moyen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un génome néandertalien moyen obtenu à partir de plusieurs individus", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un génome néandertalien moyen obtenu à partir de plusieurs individus", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un génome néandertalien moyen obtenu à partir de plusieurs individus", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un génome néandertalien moyen obtenu à partir de plusieurs individus", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un génome néandertalien moyen obtenu à partir de plusieurs individus", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un génome néandertalien moyen obtenue à partir de plusieurs individus", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -71105,12 +71186,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - 
], "llama-2_lora": [ { "annot": "annot_3", @@ -71162,33 +71237,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "génome néandertalien moyen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "néandertalien moyen", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "néandertalien moyen", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "génome néandertalien moyen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un génome néandertalien moyen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "génome néandertalien moyen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un génome néandertalien moyen obtenue à partir de plusieurs individus", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -71234,12 +71316,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -71303,42 +71379,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la consommation d' aliments caloriques a favorisé la croissance du cerveau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le basculement de nos ancêtres vers une alimentation carnée a favorisé la croissance du cerveau", - "rougeL": 0.7777777777777778 + "rougeL": 0.7777777777777778, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "basculement de nos ancêtres vers une alimentation carnée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le basculement de nos 
ancêtres vers une alimentation carnée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la consommation d' aliments caloriques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "consommation d' aliments caloriques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la consommation d'aliments caloriques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -71431,33 +71508,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -71491,12 +71575,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -71554,33 +71632,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "anciens colorants andins", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 } }, "human_annot": { @@ -71626,12 +71711,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -71683,33 +71762,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "oscillations climatiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "oscillations climatiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l' amplitude des oscillations a augmenté depuis environ 1,4 million d' années", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "plus nos connaissances sur ces oscillations climatiques progressent, plus nous 
réalisons à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 0.6896551724137931 + "rougeL": 0.6896551724137931, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "nous réalisons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -71755,12 +71841,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -71805,33 +71885,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "instables", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ont dû être instables", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les anciens environnements de nos ancêtres ont dû être instables.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -71841,12 +71928,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -71933,33 +72014,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "endroits qu' elle nous aurait conduits à désigner comme prometteurs", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "endroits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sites ayant produit des découvertes importantes, tel Tim's Confession", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "endroits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sites ayant produit des découvertes importantes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "signatures spectrales", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les sites ayant produit des découvertes importantes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -72005,12 +72093,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -72062,33 +72144,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, 
+ "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -72116,12 +72205,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -72184,33 +72267,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -72256,12 +72346,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -72312,33 +72396,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "coévolution des gènes et de la culture", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "coévolution des gènes et de la culture", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "coévolution des gènes et de la culture", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "coévolution des gènes et de la culture", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "coévolution des gènes et de la culture", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "coévolution des gènes et de la culture", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sous le nom de coévolution des gènes et de la culture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -72366,12 +72457,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -72440,33 +72525,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Quel est le nombre d'années ?", - "rougeL": 0.14285714285714288 + "rougeL": 0.14285714285714288, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Nous réalisons à quel point les anciens environnements de nos ancêtres ont dû être instables.", - "rougeL": 0.9523809523809523 + "rougeL": 
0.9523809523809523, + "HScore": 1.0 } }, "human_annot": { @@ -72494,12 +72586,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -72568,33 +72654,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -72628,12 +72721,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -72691,33 +72778,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur le fond sédimentaire du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -72751,12 +72845,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -72820,33 +72908,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -72886,12 +72981,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -72943,33 +73032,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, 
+ "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "outils en pierre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en pierre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "outils primitifs en pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Des outils primitifs en pierre.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -73015,12 +73111,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -73072,33 +73162,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -73108,12 +73205,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -73194,33 +73285,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "laitière", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "laitière", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' exploitation laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -73230,12 +73328,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -73334,33 +73426,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Scandinavie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Scandinavie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Scandinavie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Scandinavie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Scandinavie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Scandinavie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Scandinavie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -73388,12 +73487,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ 
-73462,33 +73555,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1 à 4 % de matériel génétique néandertalien.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -73522,12 +73622,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -73584,33 +73678,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matériel génétique néandertalien", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1 à 4 % de matériel génétique néandertalien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -73644,12 +73745,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -73712,33 +73807,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Totolapa au Chiapas", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "formation de Totolapa", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Totolapa", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "formation de Totolapa", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 } }, "human_annot": { @@ -73748,12 +73850,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -73834,33 +73930,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans la formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "formation de Totolapa au Chiapas", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la formation de Totolapa au Chiapas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au Chiapas", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 } }, "human_annot": { @@ -73894,12 +73997,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -73962,33 +74059,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "étudier les anciens colorants andins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "étudier les anciens colorants andins.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -74022,12 +74126,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], 
"llama-2_lora": [ { "annot": "annot_8", @@ -74091,33 +74189,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tombes néolithiques d' une vaste zone allant de la vallée du Rhône à la Slovaquie", - "rougeL": 0.5217391304347826 + "rougeL": 0.5217391304347826, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tombes néolithiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tombes néolithiques d' une vaste zone allant de la vallée du Rhône à la Slovaquie", - "rougeL": 0.5217391304347826 + "rougeL": 0.5217391304347826, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tombes néolithiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tombes néolithiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les tombes néolithiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les tombes néolithiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -74163,12 +74268,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -74216,33 +74315,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tombes néolithiques d' une vaste zone allant de la vallée du Rhône à la Slovaquie", - "rougeL": 0.5217391304347826 + "rougeL": 0.5217391304347826, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tombes néolithiques d' une vaste zone allant de la vallée du Rhône à la Slovaquie", - "rougeL": 0.5217391304347826 + "rougeL": 0.5217391304347826, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tombes néolithiques d' une vaste zone allant de la vallée du Rhône à la Slovaquie", - "rougeL": 0.5217391304347826 + "rougeL": 
0.5217391304347826, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "néolithiques", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "de la vallée du Rhône à la Slovaquie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "tombes néolithiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les tombes néolithiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -74276,12 +74382,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -74347,33 +74447,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nouvelles techniques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "nouvelles techniques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nouvelles techniques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "nouvelles techniques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des techniques", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nouvelles techniques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de nouvelles techniques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -74401,12 +74508,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -74475,33 +74576,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "comportement", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "notre comportement", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "comportement", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "notre comportement", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "nous pouvons faire des choix concernant notre comportement", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "comportement", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "notre comportement", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -74517,12 +74625,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -74604,33 +74706,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les produits non disponibles sur place", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les produits non disponibles sur place", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les produits non disponibles sur place", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les produits non disponibles sur place", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "produits non disponibles sur place", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "produits non disponibles sur place, tel du silex de bonne qualité pour confectionner des outils", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "du silex de bonne qualité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 
} }, "human_annot": { @@ -74670,12 +74779,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -74726,33 +74829,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "silex de bonne qualité pour confectionner des outils", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "silex de bonne qualité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "silex de bonne qualité pour confectionner des outils", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "silex de bonne qualité pour confectionner des outils", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "du silex de bonne qualité pour confectionner des outils", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "du silex de bonne qualité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "du silex de bonne qualité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -74786,12 +74896,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -74854,33 +74958,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour la reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "pour la reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour la reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -74926,12 +75037,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -74982,33 +75087,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Des aperçus de quelques sites à fossiles clefs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Des aperçus de quelques sites à fossiles clefs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Des aperçus de quelques sites à fossiles clefs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Des aperçus de quelques sites à fossiles clefs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des sites à fossiles clefs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "quelques sites à fossiles clefs", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "quelques sites à fossiles clefs", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -75054,12 +75166,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -75110,33 +75216,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Italie centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Italie centrale", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Italie centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Italie centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Italie centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Italie centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de l'Italie centrale", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -75170,12 +75283,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -75238,33 +75345,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "haplotypes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "haplotypes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "haplotypes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "haplotypes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "humains actuels", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "haplotypes (groupes de gènes transmis ensemble)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les haplotypes provenant des Néandertaliens", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 } }, "human_annot": { @@ -75310,12 +75424,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_3", @@ -75369,33 +75477,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1991", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1991", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1991", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1991", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1991", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1991", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1991", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -75423,12 +75538,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -75497,33 +75606,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "matières organiques", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matières organiques", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à partir de matières organiques", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "matières organiques, tels le bois ou la peau.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à partir de matières organiques, tels le bois ou la peau.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "matières organiques, tels le bois ou la peau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "à partir de matières organiques, tels le bois ou la peau.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -75533,12 +75649,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -75622,33 +75732,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bois ou la peau", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matières organiques", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matières organiques, tels le bois ou la peau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bois ou la peau", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "matières organiques, tels le bois ou la peau.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bois ou la peau", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le bois ou la peau", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 } }, "human_annot": { @@ -75694,12 +75811,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -75753,33 +75864,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des bijoux égarés ou des effets 
militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "des bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des bijoux égarés ou des effets militaires abandonnés lors des derniers conflits.", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.5 } }, "human_annot": { @@ -75789,12 +75907,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -75875,33 +75987,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des bijoux égarés ou des effets militaires abandonnés lors des derniers 
conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "bijoux égarés ou des effets militaires abandonnés lors des derniers conflits", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 1.0 } }, "human_annot": { @@ -75935,12 +76054,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -76003,33 +76116,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "systèmes militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - 
"rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la vie sociale par des systèmes militaires et religieux", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -76075,12 +76195,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -76125,33 +76239,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "par des systèmes militaires (pour se défendre des concurrents) et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Les systèmes militaires et religieux.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -76167,12 +76288,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -76253,33 +76368,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "primates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "primates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "primates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "primates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "primates, fossiles ou modernes", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "primates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les primates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -76319,12 +76441,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -76384,33 +76500,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Amérique", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Amérique", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Amérique", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Amériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Eurasiatiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Amérique", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Amérique", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -76444,12 +76567,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -76512,33 +76629,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans le IIe millénaire", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "du début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -76578,12 +76702,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -76635,33 +76753,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "IIe millénaire", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "IIe millénaire", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "IIe millénaire", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "du début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "début du IIe millénaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -76701,12 +76826,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": 
"Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -76764,33 +76883,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "23 sites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "23 sites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "23 sites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "résidus laitiers dans plus de 2200 tessons de poteries", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "23 sites datés entre le VIIe et le Ve millénaire", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "les poteries provenant de 23 sites", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les sites du Nord-Ouest de l' Anatolie", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 } }, "human_annot": { @@ -76818,12 +76944,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -76892,33 +77012,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "réaliser une analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "réaliser une analyse stratigraphique 
poussée du lit de mort de Little Foot", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "R. Clarke demande à Laurent Bruxelles de réaliser une analyse stratigraphique poussée du lit de mort de Little Foot.", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { @@ -76958,12 +77085,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -77014,33 +77135,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "analyse stratigraphique poussée du lit de mort de 
Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une analyse stratigraphique poussée du lit de mort de Little Foot", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 } }, "human_annot": { @@ -77068,12 +77196,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -77142,33 +77264,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -77202,12 +77331,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -77264,33 +77387,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "l' au-delà", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' au-delà", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -77300,12 +77430,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -77392,33 +77516,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "équipe d' archéologues de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "archéologues de l' INRAP", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "équipe d' archéologues de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "équipe d' archéologues de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "INRAP", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une équipe d' archéologues de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean-François Pasty dirige l'équipe d'archéologues de l'INRAP.", - "rougeL": 0.11111111111111112 + "rougeL": 0.11111111111111112, + "HScore": 1.0 } }, "human_annot": { @@ -77458,12 +77589,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -77515,33 +77640,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "INRAP", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "archéologues de l' INRAP", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "INRAP", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "équipe d' archéologues de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "INRAP", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "une équipe d' archéologues de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'équipe d'archéologues de l'INRAP", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 } }, "human_annot": { @@ -77551,12 +77683,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -77656,42 +77782,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petites populations isolées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les pressions environnementales et la culture matérielle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les pressions environnementales et la culture matérielle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les pressions environnementales et la culture matérielle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les pressions environnementales et la culture matérielle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": 
{ "answer_pred": "pressions environnementales et la culture matérielle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les ingrédients agissent différemment.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -77784,33 +77911,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "daims et bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les daims et les bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -77856,12 +77990,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -77906,33 +78034,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 
1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "daims et bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "daims et de bisons", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les loups tuent les daims et les bisons.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -77960,12 +78095,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -78034,42 +78163,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "systèmes militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des 
groupes)", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -78156,33 +78286,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes) et religieux (pour maintenir la cohésion des groupes) et religieux (pour maintenir la cohésion", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -78210,12 +78347,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -78284,33 +78415,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "armoise et autres hautes herbes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "armoise et autres hautes herbes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 } }, "human_annot": { @@ -78344,12 +78482,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -78406,33 +78538,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "armoise et autres hautes herbes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "armoise et autres hautes herbes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "arbrisseaux d' armoise et autres hautes herbes", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les arbrisseaux d'armoise et autres hautes herbes", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -78454,12 +78593,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -78534,33 +78667,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ADN récupéré d' un fragment d' os de doigt datant de 40000 ans et trouvé dans une grotte sibérienne", - "rougeL": 0.967741935483871 + "rougeL": 0.967741935483871, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "via l' ADN", - "rougeL": 0.3157894736842105 + "rougeL": 0.3157894736842105, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' ADN récupéré d' un fragment d' os de doigt datant de 40000 ans et trouvé dans une grotte sibérienne", - "rougeL": 0.967741935483871 + "rougeL": 0.967741935483871, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' ADN récupéré d' un fragment d' os de doigt", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par l' ADN récupéré d' un fragment d' os de doigt datant de 40000 ans et trouvé dans une grotte sibérienne.", - "rougeL": 0.967741935483871 + "rougeL": 0.967741935483871, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ADN récupéré d' un fragment d' os de doigt", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "via l'ADN récupéré d'un fragment d'os de doigt", - "rougeL": 0.38461538461538464 
+ "rougeL": 0.38461538461538464, + "HScore": 1.0 } }, "human_annot": { @@ -78570,12 +78710,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -78656,33 +78790,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ADN récupéré d' un fragment d' os de doigt datant de 40000 ans et trouvé dans une grotte sibérienne", - "rougeL": 0.967741935483871 + "rougeL": 0.967741935483871, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "via l' ADN récupéré d' un fragment d' os de doigt datant de 40000 ans et trouvé dans une grotte sibérienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "via l' ADN récupéré d' un fragment d' os de doigt", - "rougeL": 0.6086956521739131 + "rougeL": 0.6086956521739131, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "via l' ADN récupéré d' un fragment d' os de doigt", - "rougeL": 0.6086956521739131 + "rougeL": 0.6086956521739131, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "via l' ADN récupéré d' un fragment d' os de doigt datant de 40000 ans et trouvé dans une grotte sibérienne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ADN récupéré d' un fragment d' os de doigt datant de 40000 ans et trouvé dans une grotte sibérienne", - "rougeL": 0.967741935483871 + "rougeL": 0.967741935483871, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "via l'ADN récupéré d'un fragment d'os de doigt", - "rougeL": 0.38461538461538464 + "rougeL": 0.38461538461538464, + "HScore": 1.0 } }, "human_annot": { @@ -78728,12 +78869,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -78784,33 +78919,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "véritable fouille subaquatique méthodique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une véritable fouille subaquatique méthodique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une véritable fouille subaquatique méthodique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fouille subaquatique méthodique", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une fouille subaquatique méthodique", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "réalisé une véritable fouille subaquatique méthodique", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "réalisé une véritable fouille subaquatique méthodique.", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -78838,12 +78980,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -78906,33 +79042,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "subaquatique méthodique", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "subaquatique méthodique", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fouille subaquatique méthodique", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fouille subaquatique méthodique", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "subaquatique méthodique", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "fouille subaquatique méthodique", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une véritable fouille subaquatique méthodique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -78978,12 +79121,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -79034,42 +79171,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "oppidum", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "oppidum", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "oppidum", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "oppidum", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "forteresse gauloise", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "oppidum (forteresse gauloise)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "On utilisait la forteresse au mont Cavalier à l'âge du Fer.", - "rougeL": 0.15384615384615383 + "rougeL": 0.15384615384615383, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -79164,42 +79302,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sépulture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": 
{ + "T5-large": { "answer_pred": "sépulture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sépulture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sépulture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sépulture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une sépulture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sépulture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -79286,33 +79425,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Kébara en Israël", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Kébara en Isral", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Kébara en Isral", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Kébara en Israël", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Kébara en Israël", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une sépulture contenant un squelette authentiquement néandertalien", - "rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "à Kébara en Israël", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -79328,12 +79474,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -79414,33 +79554,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les reptiles mammaliens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -79456,12 +79603,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -79539,33 +79680,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "j' en aie le cœur net", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "j' en aie le cur net", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "j' en aie le cur net", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Ch. 
Emerson", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le narrateur", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "je décide", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Je", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -79599,12 +79747,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -79679,33 +79821,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tous les membres du groupe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "tous les membres du groupe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "tous les membres du groupe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 } }, "human_annot": { @@ -79745,12 +79894,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -79813,33 +79956,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "tous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "tous les membres du groupe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "tous les membres du groupe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "tous les membres du groupe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -79873,12 +80023,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -79953,42 +80097,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' atmosphère vierge de toute pollution lumineuse du Great Divide Basin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' atmosphère vierge de toute pollution lumineuse du Great Divide Basin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' atmosphère vierge de toute pollution lumineuse du Great Divide Basin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' atmosphère vierge de toute pollution lumineuse du Great Divide Basin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' atmosphère vierge de toute pollution lumineuse du Great Divide Basin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' atmosphère vierge de toute pollution lumineuse du Great Divide Basin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'atmosphère vierge de toute pollution lumineuse du Great Divide Basin.", - "rougeL": 0.8571428571428572 + "rougeL": 0.8571428571428572, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", 
- "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -80093,33 +80238,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "gènes néandertaliens", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "gènes néandertaliens", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "gènes participant à la création des tissus des testicules", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -80129,12 +80281,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -80221,33 +80367,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bâtisseurs de cathédrales", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les bâtisseurs de cathédrales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -80293,12 +80446,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -80350,33 +80497,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -80410,12 +80564,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -80472,33 +80620,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "jour de la mort de leur maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le jour de la mort de leur maître.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -80508,12 +80663,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -80600,33 +80749,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -80654,12 +80810,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -80723,33 +80873,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "en Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "en Moselle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -80789,12 +80946,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -80852,33 +81003,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -80918,12 +81076,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -80980,33 +81132,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mâchoires et de ses dents", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mâchoires et de ses dents de 
grandes tailles", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses mâchoires et de ses dents de grandes tailles", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dents", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "mâchoires et dents de grandes tailles", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mâchoires et de ses dents", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ses mâchoires et ses dents de grandes tailles.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -81022,12 +81181,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -81102,33 +81255,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mâchoires et de ses dents", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mâchoires et de ses dents de grandes tailles", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses mâchoires et de ses dents", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dents", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "mâchoires et dents de grandes tailles", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mâchoires et de ses dents", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ses mâchoires et ses dents de grandes tailles.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ 
-81174,12 +81334,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -81230,33 +81384,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sous les 25 mètres d' eau qui ont envahi la grotte depuis la déglaciation il y au moins 4000 ans", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sous les 25 mètres d' eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sous les 25 mètres d' eau qui ont envahi la grotte depuis la déglaciation il y au moins 4000 ans", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sous les 25 mètres d' eau qui ont envahi la grotte", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "25 mètres d'eau", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.45 }, "mixtral-8x7b": { "answer_pred": "sous les 25 mètres d' eau qui ont envahi la grotte depuis la déglaciation il y au moins 4000 ans", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sous les 25 mètres d'eau", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 0.8 } }, "human_annot": { @@ -81554,48 +81715,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - 
}, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -81682,33 +81801,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes).", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -81730,12 +81856,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -81804,33 +81924,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour 
maintenir la cohésion des groupes) et religieux (pour maintenir la cohésion des groupes) et religieux (pour maintenir la", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la vie sociale par des systèmes militaires (pour se défendre des concurrents) et religieux (pour maintenir la cohésion des groupes)", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "militaires et religieux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la vie sociale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -81858,12 +81985,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -81932,33 +82053,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "explication de l' évolution rapide des hominines à l' époque des glaciations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "explication de l' évolution rapide des hominines à l' époque des glaciations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' explication de l' évolution rapide des hominines à l' époque des glaciations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' explication de l' évolution rapide des hominines à l' époque des glaciations", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "explication de l' évolution rapide des hominines à l' époque des glaciations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' explication de l' évolution rapide des hominines à l' époque des glaciations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'explication de l'évolution rapide des hominines à l'époque des glaciations.", - "rougeL": 0.608695652173913 + "rougeL": 0.608695652173913, + "HScore": 1.0 } }, "human_annot": { @@ -81998,12 +82126,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -82061,42 +82183,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Sibérie occidentale", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "près du village d' Ust' - Ishim", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -82186,33 +82309,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Sibérie occidentale", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -82240,12 +82370,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -82335,33 +82459,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "carcasses de gnous et d' autres grands mammifères pour les découper et les manger", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "carcasses de gnous et d' autres grands mammifères", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des carcasses de gnous et d' autres grands mammifères", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "gnous et d' autres grands mammifères", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des carcasses de gnous et d' autres grands mammifères", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "carcasses de gnous et d' autres grands mammifères", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les carcasses de gnous et d'autres grands mammifères pour les découper et les manger.", - "rougeL": 0.7826086956521738 + "rougeL": 0.7826086956521738, + "HScore": 
1.0 } }, "human_annot": { @@ -82377,12 +82508,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -82463,33 +82588,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cavalerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "équipe d' archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' équipe d' archéologues qui fouille le mont Castel", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' équipe d' archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une équipe d' archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' équipe d' archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'équipe d'archéologues", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -82511,12 +82643,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -82591,33 +82717,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les aborigènes et une autre fois par les prédécesseurs des Vikings", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les aborigènes et une autre fois par les prédécesseurs des Vikings", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les aborigènes et une autre fois par les prédécesseurs des Vikings", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "aborigènes et une autre fois par les prédécesseurs des Vikings", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "aborigènes et les prédécesseurs des Vikings", - "rougeL": 0.7692307692307693 + "rougeL": 0.7692307692307693, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les aborigènes et les prédécesseurs des Vikings", - "rougeL": 0.7692307692307693 + "rougeL": 0.7692307692307693, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les aborigènes et les prédécesseurs des Vikings", - "rougeL": 0.7692307692307693 + "rougeL": 0.7692307692307693, + "HScore": 1.0 } }, "human_annot": { @@ -82663,12 +82796,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -82719,33 +82846,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "parmi deux possibles", - "rougeL": 0.20689655172413793 + "rougeL": 0.20689655172413793, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "soit une femme vue de face, les bras au-dessus de la tête, associée à un addax (une grosse antilope du désert), soit un arbre associé à une peau animale tendue sur des bâtons croisés", - "rougeL": 0.9387755102040816 + "rougeL": 0.9387755102040816, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "parmi deux possibles", - "rougeL": 0.20689655172413793 + "rougeL": 0.20689655172413793, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "soit", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "sous forme de femme vue de face et d'addax", - "rougeL": 0.17142857142857146 + "rougeL": 0.17142857142857146, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "autour d' un couple d' éléments, choisi parmi deux possibles", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + 
"HScore": 0.0 }, "GPT-3.5": { "answer_pred": "choisi parmi", - "rougeL": 0.07142857142857144 + "rougeL": 0.07142857142857144, + "HScore": 0.0 } }, "human_annot": { @@ -82773,12 +82907,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -82841,33 +82969,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux", - "rougeL": 0.07407407407407407 + "rougeL": 0.07407407407407407, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux", - "rougeL": 0.07407407407407407 + "rougeL": 0.07407407407407407, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "soit une femme vue de face, les bras au-dessus de la tête, associée à un addax", - "rougeL": 0.5945945945945945 + "rougeL": 0.5945945945945945, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "soit", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "une femme vue de face, les bras au-dessus de la tête, associée à un addax (une grosse antilope du désert)", - "rougeL": 0.7317073170731707 + "rougeL": 0.7317073170731707, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une femme vue de face, les bras au-dessus de la tête, associée à un addax (une grosse antilope du désert), soit un arbre associé à une peau animale tendue sur des bâtons croisés", - "rougeL": 0.9387755102040816 + "rougeL": 0.9387755102040816, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Parmi deux possibles.", - "rougeL": 0.20689655172413793 + "rougeL": 0.20689655172413793, + "HScore": 0.0 } }, "human_annot": { @@ -82895,12 +83030,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -82969,33 +83098,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"Alexandre Sévère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Maximin Ier le Thrace", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' empereur Maximin Ier le Thrace", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Alexandre Sévère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "une colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une colonne romaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -83011,12 +83147,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -83088,33 +83218,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Plus nos connaissances sur ces oscillations climatiques progressent, plus nous réalisons à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Plus nos connaissances sur ces oscillations climatiques progressent, plus nous réalisons à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Des carottes extraites des calottes glaciaires et des boues du fond océanique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "instables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { 
"answer_pred": "Plus nos connaissances sur ces oscillations climatiques progressent", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "nous réalisons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -83136,12 +83273,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -83216,33 +83347,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Plus nos connaissances sur ces oscillations climatiques progressent, plus nous réalisons à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Plus nos connaissances sur ces oscillations climatiques progressent, plus nous réalisons à quel point les anciens environnements de nos ancêtres ont dû être instables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Plus nos connaissances sur ces oscillations climatiques progressent", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "des carottes extraites des calottes glaciaires et des boues du fond océanique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "nous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "nous réalisons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -83264,12 +83402,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -83344,33 +83476,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "aux archives", 
- "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "aux archives", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aux archives", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "archives", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "aux archives", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "aux archives", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "aux archives", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -83398,12 +83537,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -83472,33 +83605,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Une scène de chasse au taureau datant du VIe millénaire avant notre ère", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Une scène de chasse au taureau datant du VIe millénaire avant notre ère", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "scène de chasse au taureau", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "scène de chasse au taureau", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une scène de chasse au taureau", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "scène de chasse au taureau", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Une scène de chasse au taureau.", - 
"rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 } }, "human_annot": { @@ -83532,12 +83672,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -83594,33 +83728,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "scène de chasse au taureau datant du VIe millénaire avant notre ère", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chasse au taureau", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chasse au taureau", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "chasse au taureau", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "chasse au taureau", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "scène de chasse au taureau", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Une scène de chasse au taureau.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -83666,12 +83807,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -83722,33 +83857,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "si les ossements sont séparés en deux groupes, c' est parce que la brèche qui les contenait tous initialement (résultant de l' éboulis sur lequel est mort Little Foot) s' est effondrée,", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "si les ossements sont séparés en deux groupes, c' est parce 
que la brèche qui les contenait tous initialement (résultant de l' éboulis sur lequel est mort Little Foot) s' est eff", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la brèche qui les contenait tous initialement (résultant de l' éboulis sur lequel est mort Little Foot) s' est effondrée", - "rougeL": 0.30769230769230765 + "rougeL": 0.30769230769230765, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ils comprennent que si les ossements sont séparés en deux groupes, c' est parce que la brèche qui les contenait tous initialement", - "rougeL": 0.9600000000000001 + "rougeL": 0.9600000000000001, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "qu'ils sont séparés en deux groupes", - "rougeL": 0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "séparés en deux groupes", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les chercheurs apprennent que les ossements sont séparés en deux groupes", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -83764,12 +83906,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -83851,33 +83987,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "construction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "construction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "renforts métalliques de la cathédrale de Beauvais", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "début de la construction", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "construction de la cathédrale de Beauvais", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' usage de tirants et de chaînages en acier", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la construction de la cathédrale de Beauvais", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -83899,12 +84042,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -83991,33 +84128,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ian Hodder", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ian Hodder", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ian Hodder", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ian Hodder", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ian Hodder, de l' Université Stanford", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ian Hodder", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ian Hodder", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -84051,12 +84195,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -84131,33 +84269,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pastoralisme et de l' agriculture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "laitière", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "industrie laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "culture des vases à entonnoir", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l'industrie laitière", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' industrie laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' industrie laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -84173,12 +84318,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -84265,33 +84404,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pastoralisme et de l' agriculture sous la forme de la culture des vases à entonnoir", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "industrie laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "industrie laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "culture des vases à entonnoir", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'industrie laitière", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' industrie laitière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'industrie laitière", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -84307,12 +84453,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -84393,33 +84533,40 
@@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -84453,12 +84600,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -84515,33 +84656,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -84551,12 +84699,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -84643,33 +84785,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la plus vieille œuvre figurative de la grotte Chauvet – un rhinocéros – date de 35000 à 38000 ans", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 } }, "human_annot": { @@ -84841,48 +84990,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -85089,33 +85196,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "35000 à 38000 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots dans l'article qui répond à la question \"De quand date la plus vieille oeuvre figurative de la grotte Chauvet ?\" est \"de 35000 à 38000 ans\".", - "rougeL": 0.37037037037037035 + "rougeL": 0.37037037037037035, + "HScore": 1.0 } }, "human_annot": { @@ -85143,12 +85257,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -85217,33 +85325,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un chef de milice gauloise", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -85253,12 +85368,6 @@ 
"rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -85339,33 +85448,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -85399,12 +85515,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -85467,33 +85577,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le plafond d' une salle souterraine s' écroule en partie et crée un puits", - "rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { 
"answer_pred": "puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "un puits", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Un puits.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -85503,12 +85620,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -85597,33 +85708,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sépulture près de la tombe de Sobekhotep Ier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une sépulture près de la tombe de Sobekhotep Ier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une sépulture près de la tombe de Sobekhotep Ier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une sépulture près de la tombe de Sobekhotep Ier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sépulture près de la tombe de Sobekhotep Ier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une sépulture près de la tombe de Sobekhotep Ier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une sépulture près de la tombe de Sobekhotep Ier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -85663,12 +85781,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" 
- } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -85725,33 +85837,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Un scribe qui ne sait pas le sumérien, mais quel scribe est -ce là?", - "rougeL": 0.4285714285714285 + "rougeL": 0.4285714285714285, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sumérien", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mésopotamienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "sumérien", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Sumer", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Sumer", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Un proverbe sumérien.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -85761,12 +85880,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -85853,33 +85966,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -85913,12 +86033,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -85975,33 +86089,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au-delà", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' au-delà", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "vers l' au-delà", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers l'au-delà", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -86035,12 +86156,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -86103,33 +86218,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petits qui n' étaient pas les siens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les petits qui n' étaient pas les siens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "petits qui n' étaient pas les siens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "les petits", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les petits qui n' étaient pas les siens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les petits qui n' étaient pas les siens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les petits", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -86163,12 +86285,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -86231,33 +86347,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lors de fouilles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "2007.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en 2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -86297,12 +86420,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -86353,33 +86470,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "2007.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "2007", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "2007.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -86413,12 +86537,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -86481,33 +86599,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un chef de milice gauloise", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -86535,12 +86660,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -86609,42 
+86728,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "telle un chaudron débordant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Afrique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chaudron débordant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' Afrique a émis des vagues successives d' homonidés.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "chaudron débordant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la planète", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour s'installer sur la planète.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -86749,33 +86869,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -86815,12 +86942,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur 
acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -86883,33 +87004,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le fromage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -86919,12 +87047,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -87011,33 +87133,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "3000 ans environ", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "3000 ans environ", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "3000 ans", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "3000 ans", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "3000 ans environ", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "3000 ans environ", - "rougeL": 0.8571428571428571 + 
"rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "depuis 3000 ans environ.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -87053,12 +87182,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -87139,33 +87262,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "équipe de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une équipe de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Une équipe de l'INRAP.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -87205,12 +87335,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -87262,33 +87386,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "équipe de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "INRAP", - "rougeL": 0.8 + "rougeL": 0.8, + 
"HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "une équipe de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une équipe de l' INRAP", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -87298,12 +87429,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -87391,33 +87516,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "49 espèces de petite taille", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -87457,12 +87589,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -87513,33 +87639,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "29 espèces de carnivores de grande taille et 49 
espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "49 espèces de petite taille", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "29 espèces de carnivores de grande taille et 49 espèces de petite taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -87573,12 +87706,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -87653,33 +87780,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ossements néandertaliens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "gènes néandertaliens", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les gènes des Homo sapiens favorisant la fertilité", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, 
"mixtral-8x7b": { "answer_pred": "gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les gènes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -87719,12 +87853,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -87781,33 +87909,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'eau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'eau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -87847,12 +87982,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -87903,33 +88032,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "eau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'eau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' eau", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'eau par évaporation", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -87963,12 +88099,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -88031,33 +88161,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "nourriture et d' autres soins", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "trop d' énergie et de nourriture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de l'énergie et de la nourriture", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -88085,12 +88222,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -88159,33 +88290,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "utilisé des images satellitaires", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Berhane Asfaw", - "rougeL": 0.16666666666666666 + "rougeL": 0.16666666666666666, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des images 
satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Berhane Asfaw, du Service de recherche de la Vallée du Rift à Addis-Abeba", - "rougeL": 0.0909090909090909 + "rougeL": 0.0909090909090909, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "utilisé des images satellitaires", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les collègues ont utilisé des images satellitaires.", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -88201,12 +88339,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -88282,33 +88414,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des images satellitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -88348,12 +88487,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -88411,33 +88544,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "2000 objets", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -88471,12 +88611,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -88533,33 +88667,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "archéologiquement stérile", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "stérile", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "2 mètres de côté", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "un niveau archéologiquement stérile", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { 
"answer_pred": "un niveau archéologiquement stérile.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -88587,12 +88728,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -88661,33 +88796,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Min Zhu", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Les collègues chinois et suédois travaillent sur la découverte et l'étude d'un fossile de poisson nommé Entelognathus.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -88709,12 +88851,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -88785,33 +88921,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' 
Entelognathus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "poisson à plaques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "poisson à plaques, mais doté d' une mâchoire osseuse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "phylogénie des poissons", - "rougeL": 0.1818181818181818 + "rougeL": 0.1818181818181818, + "HScore": 0.5 } }, "human_annot": { @@ -88827,12 +88970,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -88927,33 +89064,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les chasseurs - cueilleurs Hadza, en Tanzanie, et par les San, au Botswana", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les San", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cueilleurs Hadza, en Tanzanie, et par les San, au Botswana", - "rougeL": 0.23529411764705882 + "rougeL": 0.23529411764705882, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "les San", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les chasseurs - cueilleurs Hadza, en Tanzanie, et par les San, au Botswana", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hadza, en Tanzanie, et par les San, au Botswana", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les chasseurs - cueilleurs Hadza, en Tanzanie, et par les San, au Botswana,", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 } }, "human_annot": { @@ -88999,12 +89143,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - 
{ - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -89055,33 +89193,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chasses bien plus anciennes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chasses bien plus anciennes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chasses bien plus anciennes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chasses bien plus anciennes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chasses bien plus anciennes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "chasses bien plus anciennes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' existence de chasses bien plus anciennes", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -89121,12 +89266,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -89177,33 +89316,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "existence de chasses bien plus anciennes", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "existence de chasses bien plus anciennes", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "existence de chasses bien plus anciennes", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' existence de chasses bien plus anciennes.", - "rougeL": 0.9090909090909091 
+ "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'existence de chasses bien plus anciennes", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' existence de chasses bien plus anciennes", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'existence de chasses bien plus anciennes.", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -89237,12 +89383,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -89305,33 +89445,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "par évaporation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "par évaporation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "par évaporation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "par évaporation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "llama-2_lora": { "answer_pred": "par évaporation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "mixtral-8x7b": { "answer_pred": "par évaporation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "GPT-3.5": { "answer_pred": "par évaporation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 } }, "human_annot": { @@ -89545,48 +89692,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": 
"annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -89758,33 +89863,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "presque 16 mètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "presque 16 mètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -89794,12 +89906,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -89881,33 +89987,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "16 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "presque 16 mètres", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "presque 16 mètres de large", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -89941,12 +90054,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -90010,33 +90117,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la forme symbolique d' une croix)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le transept donne au bâtiment la forme symbolique d'une croix", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 1.0 } }, "human_annot": { @@ -90070,12 +90184,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -90132,33 +90240,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la forme symbolique 
d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "donne au bâtiment la forme symbolique d' une croix)", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la forme symbolique d' une croix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le transept donne au bâtiment la forme symbolique d'une croix", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 1.0 } }, "human_annot": { @@ -90192,12 +90307,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -90260,33 +90369,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dents servent d' armes dans leurs combats pour la reproduction", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "ces dents servent d' armes dans leurs combats pour la reproduction.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour se reproduire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "combats pour la reproduction", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour la reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -90320,12 +90436,6 @@ "rating": "Partiellement correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -90382,33 +90492,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "combats pour la reproduction", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "pour la reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "pour la reproduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "combats pour la reproduction", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans leurs combats pour la reproduction.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -90454,12 +90571,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -90510,33 +90621,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les archéologues", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les archéologues.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -90576,12 +90694,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -90635,33 +90747,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "150 mètres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "150 mètres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "150 mètres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "50 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "50 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "environ 50 mètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "150 mètres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 } }, "human_annot": { @@ -90689,12 +90808,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -90757,33 +90870,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "150 mètres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "150 mètres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "150 mètres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "50 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "50 mètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 
}, "mixtral-8x7b": { "answer_pred": "environ 50 mètres or 150 mètres", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "150 mètres.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 } }, "human_annot": { @@ -90829,12 +90949,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -90885,33 +90999,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fouilleurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fouilleurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fouilleurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fossiles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "des fouilleurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les fouilleurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "on a par exemple trouvé", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -90933,12 +91054,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -91014,42 +91129,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme", - "rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "que le métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme", - 
"rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "que le métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme", - "rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "le métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme,", - "rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l'arrivée de gènes mal tolérés dans le patrimoine génétique de l'homme", - "rougeL": 0.36363636363636365 + "rougeL": 0.36363636363636365, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "le métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme, de sorte que la sélection naturelle les a éliminés de régions entières du génome", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "suggère que le métissage entre Homo sapiens et Néandertaliens a provoqué l'arrivée de gènes mal tolérés dans le patrimoine génétique de l'homme, de sorte que la sélection naturelle les a éliminés de régions entières du génome.", - "rougeL": 0.7241379310344828 + "rougeL": 0.7241379310344828, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -91136,33 +91252,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme", - "rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.75 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le 
métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme", - "rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.75 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme", - "rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.75 }, "Camembert_baseline": { "answer_pred": "métissage entre Homo sapiens et Néandertaliens a provoqué l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme,", - "rougeL": 0.9777777777777777 + "rougeL": 0.9777777777777777, + "HScore": 0.75 }, "llama-2_lora": { "answer_pred": "ils suggèrent que le métissage entre Homo sapiens et Néandertaliens a provoqué l'arrivée de gènes mal tolérés dans le patrimoine génétique de l'homme, de sorte que la sélection naturelle les a éliminés de régions entières du génome.", - "rougeL": 0.689655172413793 + "rougeL": 0.689655172413793, + "HScore": 0.95 }, "mixtral-8x7b": { "answer_pred": "l' arrivée de gènes mal tolérés dans le patrimoine génétique de l' homme, de sorte que la sélection naturelle les a éliminés de régions entières du génome", - "rougeL": 0.3902439024390244 + "rougeL": 0.3902439024390244, + "HScore": 0.8 }, "GPT-3.5": { "answer_pred": "les déserts d'hérédité néandertalienne suggèrent que le métissage entre Homo sapiens et Néandertaliens a provoqué l'arrivée de gènes mal tolérés dans le patrimoine génétique de l'homme, de sorte que la sélection naturelle les a éliminés de régions entières du génome.", - "rougeL": 0.5797101449275363 + "rougeL": 0.5797101449275363, + "HScore": 0.95 } }, "human_annot": { @@ -91460,48 +91583,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Partiellement correct" - }, - { - "annot": "annot_3", - 
"rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -91588,33 +91669,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des résidus laitiers", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des résidus laitiers", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des résidus laitiers", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "résidus laitiers", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "résidus laitiers", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des résidus laitiers dans plus de 2200 tessons de poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "résidus laitiers", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 } }, "human_annot": { @@ -91654,12 +91742,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -91716,33 +91798,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les femmes et les vieillards", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les vieillards et les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -91788,12 +91877,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -91846,33 +91929,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois squelettes partiels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dinosaure qui y ressemble, sur la base de trois squelettes partiels mis au jour", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un dinosaure qui y ressemble, sur la base de trois squelettes partiels mis au jour", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "trois squelettes partiels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "un poulet géant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "un dinosaure qui ressemble à un poulet géant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un dinosaure", - "rougeL": 0.0 + "rougeL": 0.0, + 
"HScore": 1.0 } }, "human_annot": { @@ -91918,12 +92008,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -91970,33 +92054,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois squelettes partiels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Anzu wyliei", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un dinosaure qui y ressemble, sur la base de trois squelettes partiels mis au jour dans le Dakota", - "rougeL": 0.5555555555555556 + "rougeL": 0.5555555555555556, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un dinosaure omnivore ressemblant à un poulet géant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un poulet géant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "un dinosaure qui ressemble à un poulet géant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un dinosaure qui y ressemble", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -92012,12 +92103,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -92100,33 +92185,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, 
"Camembert_baseline": { "answer_pred": "pour peupler l' Asie du Sud-Est et l' Europe.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "pour peupler l'Asie du Sud-Est et l'Europe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "pour peupler l'Asie du Sud-Est et l'Europe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -92136,12 +92228,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -92228,33 +92314,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un chef de milice gauloise", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -92294,12 +92387,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": 
"annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -92350,33 +92437,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chef de milice gauloise de la cité des Médiomatriques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un chef de milice gauloise", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "chef de milice gauloise", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le chef de milice gauloise des Médiomatriques", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -92410,12 +92504,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -92483,33 +92571,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hervé le Treut.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -92543,12 +92638,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -92624,33 +92713,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Robert Debré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Robert Debré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Robert Debré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Robert Debré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Robert Debré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Robert Debré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Robert Debré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -92684,12 +92780,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -92752,33 +92842,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 
0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -92794,12 +92891,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -92874,33 +92965,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'Institut photovoltaïque d'Île de France.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -92910,12 +93008,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -93002,33 +93094,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "médicaments, résidus de l' agriculture ou de l' industrie chimique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "molécules, médicaments, résidus de l' agriculture ou de l' industrie chimique", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -93044,12 +93143,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -93131,33 +93224,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "lasers intenses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "lasers intenses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lasers intenses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "lasers intenses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un rayon laser hyperpuissant", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "rayon 
laser hyperpuissant", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "un rayon laser hyperpuissant", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.5 } }, "human_annot": { @@ -93203,12 +93303,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -93253,33 +93347,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "lasers intenses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "lasers intenses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "intenses", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "lasers intenses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un rayon laser hyperpuissant", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "rayon laser hyperpuissant", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un rayon laser hyperpuissant", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 } }, "human_annot": { @@ -93289,12 +93390,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -93381,33 +93476,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "maladies génétiques qui affectent les nerfs et les muscles", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -93453,12 +93555,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -93503,33 +93599,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -93563,12 +93666,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -93631,33 +93728,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les mystérieuses « matière noire » et « énergie sombre 
»", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matière noire » et « énergie sombre »", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les mystérieuses « matière noire » et « énergie sombre »", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les mystérieuses « matière noire » et « énergie sombre »", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la quantité de matière noire et énergie sombre", - "rougeL": 0.5714285714285714 + "rougeL": 0.5714285714285714, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "matière noire et énergie sombre", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les mystérieuses « matière noire » et « énergie sombre »", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 } }, "human_annot": { @@ -93703,12 +93807,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -93771,42 +93869,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ses très officielles fonctions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ses très officielles fonctions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses très officielles fonctions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "ses très officielles fonctions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les fonctions officielles", - "rougeL": 0.4 + "rougeL": 0.4, 
+ "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "globules et des plaquettes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "ses très officielles fonctions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -93905,33 +94004,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "professeur de médecine, chercheur, spécialiste des virus, directeur de l' institut national de la transfusion sanguine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "professeur de médecine, chercheur, spécialiste des virus, directeur de l' institut national de la transfusion sanguine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "très officielles fonctions", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "ses très officielles fonctions", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "détective littéraire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "très officielles fonctions", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "ses très officielles fonctions", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -93953,12 +94059,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -94030,33 +94130,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "GIEC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "GIEC", - "rougeL": 0.8 + 
"rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "GIEC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hervé le Treut participe activement aux travaux du GIEC.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -94084,12 +94191,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -94152,33 +94253,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Académie des Sciences", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hervé le Treut participe activement aux travaux du GIEC.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -94212,12 +94320,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -94280,33 +94382,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "spéculations hasardeuses en observations plus ou moins fiables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "spéculations hasardeuses en observations plus ou moins fiables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aller de spéculations hasardeuses en observations plus ou moins fiables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la cosmologie théorique dut aller de spéculations hasardeuses en observations plus ou moins fiables", - "rougeL": 0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par des spéculations hasardeuses en observations plus ou moins fiables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "spéculations hasardeuses en observations plus ou moins fiables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "observations plus ou moins fiables", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -94352,12 +94461,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -94405,33 +94508,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "théorique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "théorique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la cosmologie théorique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la cosmologie théorique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la cosmologie théorique", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cosmologie théorique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la cosmologie théorique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -94465,12 +94575,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -94536,33 +94640,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des plats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des plats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des plats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -94578,12 +94689,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -94664,33 +94769,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "son laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "son laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -94706,12 +94818,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -94793,33 +94899,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "neuronaux de la locomotion", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "neuronaux de la locomotion", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "neuronaux de la locomotion", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "certains mécanismes neuronaux de la locomotion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "neuronaux de la locomotion", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mécanismes neuronaux de la locomotion", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "certains mécanismes neuronaux de la locomotion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -94841,12 +94954,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -94921,33 +95028,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quantité de matière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "matière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matière noire » et « énergie sombre »", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "matière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "quantité de matière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Univers", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "il contient de la matière.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -94975,12 +95089,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -95049,33 +95157,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les ouvriers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mines d' or", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mines d' or", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "ouvriers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "dans le désert Égyptien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les ouvriers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les ouvriers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -95091,12 +95206,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ 
-95177,33 +95286,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -95237,12 +95353,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -95300,33 +95410,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -95360,12 +95477,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], 
"llama-2_lora": [ { "annot": "annot_4", @@ -95423,33 +95534,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -95483,12 +95601,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -95552,33 +95664,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nous avons soulevé un couvercle de pierre et nous avons été éblouis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "nous avons soulevé un couvercle de pierre et nous avons été éblouis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nous avons soulevé un couvercle de pierre et nous avons été éblouis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "nous avons soulevé un couvercle de pierre et nous avons été éblouis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "nous avons soulevé un couvercle de pierre et nous avons été éblouis.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "nous avons soulevé un couvercle de pierre et nous avons été éblouis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Christiane Ziegler dit alors : nous avons soulevé un couvercle de pierre et nous avons été éblouis.", - "rougeL": 0.5555555555555556 + "rougeL": 0.5555555555555556, + "HScore": 1.0 } }, "human_annot": { @@ -95606,12 +95725,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -95692,33 +95805,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jean-François Bach", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jean-François Bach", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jean-François Bach", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jean-François Bach,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jean-François Bach", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jean-François Bach", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean-François Bach", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -95752,12 +95872,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -95817,33 +95931,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un grand bouleversement est en train de menacer tous les équilibres", - "rougeL": 0.9 + "rougeL": 0.9, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bouleversement est en train de menacer tous les équilibres", - "rougeL": 0.8421052631578948 + "rougeL": 
0.8421052631578948, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un grand bouleversement est en train de menacer tous les équilibres : le réchauffement climatique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un grand bouleversement est en train de menacer tous les équilibres : le réchauffement climatique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le réchauffement climatique", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "réchauffement climatique", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un grand bouleversement est en train de menacer tous les équilibres", - "rougeL": 0.9 + "rougeL": 0.9, + "HScore": 0.5 } }, "human_annot": { @@ -95877,12 +95998,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -95963,33 +96078,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Olivier Torrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Olivier Torrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Olivier Torrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Olivier Torrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Olivier Torrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Olivier Torrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Olivier Torrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -96023,12 +96145,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -96103,33 +96219,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "LHC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "anneau de plus de 26 km de circonférence", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le LHC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "LHC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "au sein du grand anneau de plus de 26 km de circonférence", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "dans le LHC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans le LHC, ce grand anneau de plus de 26 km de circonférence", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.0 } }, "human_annot": { @@ -96169,12 +96292,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -96237,33 +96354,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les uns contre les autres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les uns contre les autres à une vitesse proche de la lumière", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le LHC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "les uns contre les autres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "à une vitesse proche de la lumière", - "rougeL": 0.0 + "rougeL": 0.0, + 
"HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "un grand anneau de plus de 26 km de circonférence où sont envoyés des paquets de protons les uns contre les autres à une vitesse proche de la lumière", - "rougeL": 0.1904761904761905 + "rougeL": 0.1904761904761905, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les uns contre les autres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -96297,12 +96421,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -96383,33 +96501,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -96443,12 +96568,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -96511,33 +96630,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les travaux en relation avec l' exploration spatiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "travaux en relation avec l' exploration spatiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les travaux en relation avec l' exploration spatiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les travaux en relation avec l' exploration spatiale.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le comité mondial de la recherche spatiale", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les travaux en relation avec l' exploration spatiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Cospar organise les travaux en relation avec l'exploration spatiale.", - "rougeL": 0.46153846153846156 + "rougeL": 0.46153846153846156, + "HScore": 1.0 } }, "human_annot": { @@ -96577,12 +96703,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -96633,33 +96753,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en relation avec l' exploration spatiale", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "exploration spatiale", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les travaux en relation avec l' exploration spatiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les travaux en relation avec l' exploration spatiale.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'exploration spatiale", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "travaux en relation avec l' exploration spatiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les travaux 
en relation avec l'exploration spatiale.", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -96675,12 +96802,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -96761,33 +96882,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "10 ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "10 ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "10 ans avant", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -96815,12 +96943,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -96907,33 +97029,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jeune physicien de l' atmosphère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jeune physicien de l' atmosphère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jean-Louis Fellous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jeune physicien de l' atmosphère,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { 
"answer_pred": "Jeune physicien de l' atmosphère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Jeune physicien de l' atmosphère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Jean-Louis Fellous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -96973,12 +97102,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -97035,33 +97158,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le temps d' écrire sous un nom d' emprunt dans des revues de poésie", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "revues de poésie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le temps d' écrire sous un nom d' emprunt dans des revues de poésie", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le temps d' écrire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "temps d' écrire sous un nom d' emprunt dans des revues de poésie", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le temps d' écrire sous un nom d' emprunt dans des revues de poésie", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le temps", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 } }, "human_annot": { @@ -97077,12 +97207,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -97163,33 +97287,40 @@ } ], "predictions": { 
- "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en Arabie Saoudite", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -97199,12 +97330,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -97287,33 +97412,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en Arabie Saoudite", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "des malades", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -97323,12 +97455,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ 
{ "annot": "annot_5", @@ -97417,33 +97543,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plus de 20 centimètres", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "près de 20 centimètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "près de 20 centimètres.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -97453,12 +97586,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -97539,42 +97666,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "20 centimètres", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "plus de 20 centimètres", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "près de 20 centimètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "mesurent pour certains près de 20 centimètres", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -97679,33 +97807,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' homme de Néandertal,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'homme de Néandertal", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -97739,12 +97874,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -97813,33 +97942,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Néandertal", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' homme de Néandertal,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' homme de Néandertal", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' homme de Néandertal", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'homme de Néandertal", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -97861,12 +97997,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -97941,33 +98071,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le prestigieux Institut Allemand Max Planck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le prestigieux Institut Allemand Max Planck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Institut Allemand Max Planck", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -98001,12 +98138,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -98064,33 +98195,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Max Planck", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Institut Allemand Max Planck", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Institut Allemand Max Planck", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -98118,12 +98256,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -98193,33 +98325,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "salariés", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -98247,12 +98386,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -98315,33 +98448,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "stress, la qualité du sommeil, la relation entre la santé du dirigeant et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la relation entre la santé du patron et 
celle de son entreprise, la relation entre la santé du patron et celle de ses salariés.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "stress, qualité du sommeil, relation entre la santé du patron et celle de son entreprise, relation entre la santé du patron et celle de ses salariés.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "La santé du dirigeant : de la souffrance patronale à l' entrepreneuriat salutaire", - "rougeL": 0.09090909090909091 + "rougeL": 0.09090909090909091, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la santé du dirigeant", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 0.5 } }, "human_annot": { @@ -98357,12 +98497,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -98461,33 +98595,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jean-Louis Fellous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jean-Louis Fellous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jean-Louis Fellous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jean-Louis Fellous,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "devenir guitariste professionnel", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Jeune physicien de l' atmosphère", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Jean-Louis Fellous", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -98533,12 +98674,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - 
"rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -98589,33 +98724,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dioxyde de carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dioxyde de carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au dioxyde de carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dioxyde de carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dioxyde de carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dioxyde de carbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "du dioxyde de carbone.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -98649,12 +98791,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -98743,33 +98879,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "À l'aide d'ordinateurs superpuissants", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -98797,12 +98940,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -98895,33 +99032,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jean Lilensten", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jean Lilensten", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jean Lilensten", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jean Lilensten", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jean Lilensten", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jean Lilensten", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean Lilensten", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -98955,12 +99099,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -99024,33 +99162,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "biodiversité", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "essences, les sols et toute la biodiversité", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "biodiversité", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "biodiversité", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "essences, sols et toute la biodiversité", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "essences, sols et toute la biodiversité", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les essences, les sols et toute la biodiversité", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -99066,12 +99211,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -99170,33 +99309,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Denis le Bihan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Denis le Bihan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Denis le Bihan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Denis Le Bihan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Denis le Bihan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Denis Le Bihan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Denis Le Bihan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -99212,12 +99358,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -99299,33 +99439,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -99353,12 +99500,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -99421,33 +99562,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -99493,12 +99641,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -99549,33 +99691,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mieux anticiper l' avenir", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mieux anticiper l' 
avenir", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "afin de mieux anticiper l' avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mieux anticiper l' avenir.", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour mieux anticiper l' avenir", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "afin de mieux anticiper l' avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "afin de mieux anticiper l'avenir", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -99615,12 +99764,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -99672,33 +99815,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mieux anticiper l' avenir", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mieux anticiper l' avenir", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "meilleur anticiper l' avenir", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "mieux anticiper l' avenir.", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "anticiper l' avenir", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "afin de mieux anticiper l' avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": 
"afin de mieux anticiper l'avenir", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -99738,12 +99888,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -99801,33 +99945,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "astrophysique", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "faire de l' astrophysique", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de l' astrophysique", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "faire de l' astrophysique", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "astronomie.", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "astrophysique", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Daniel Kunth a choisi de faire de l'astrophysique.", - "rougeL": 0.3157894736842105 + "rougeL": 0.3157894736842105, + "HScore": 1.0 } }, "human_annot": { @@ -99855,12 +100006,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -99929,33 +100074,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -100001,12 +100153,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -100051,33 +100197,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quatre-vingt", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "quatre-vingt", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -100111,12 +100264,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -100179,33 +100326,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Serge Volkoff", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Serge Volkoff", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Serge Volkoff", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Serge Volkoff", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Serge Volkoff", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Serge Volkoff", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Serge Volkoff", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -100239,12 +100393,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -100307,33 +100455,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une évidence pour la communauté scientifique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une évidence pour la communauté scientifique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une évidence pour la communauté scientifique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une évidence pour la communauté scientifique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "évidence pour la communauté scientifique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une évidence pour la communauté scientifique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une évidence pour la communauté scientifique.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -100367,12 +100522,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", 
@@ -100435,33 +100584,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "100 % carbone, les nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "L' avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "100 % carbone, les nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -100501,12 +100657,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -100563,33 +100713,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "biologistes, des physiciens, des mathématiciens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des biologistes, des physiciens, des mathématiciens.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -100605,12 +100762,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -100685,33 +100836,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "biologistes, des physiciens, des mathématiciens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "biologistes, des physiciens, des mathématiciens.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "une équipe de travail inattendue", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -100751,12 +100909,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -100813,33 +100965,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Tout ce que nous mangeons, il le décortique", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pâtés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pâtés ou le goût de la fraise des bois", - 
"rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Tout ce que nous mangeons, il le décortique. La cuisson des pâtés ou le goût de la fraise des bois,", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tout ce que nous mangeons, il le décortique.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "tout ce que nous mangeons", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ce que nous mangeons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -100879,12 +101038,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -100942,33 +101095,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "matière noire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5625 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la matière noire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.6875 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matière noire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5625 }, "Camembert_baseline": { "answer_pred": "matière noire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5625 }, "llama-2_lora": { "answer_pred": "matière noire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5625 }, "mixtral-8x7b": { "answer_pred": "matière noire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5625 }, "GPT-3.5": { "answer_pred": "la matière noire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.6875 } }, "human_annot": { @@ -101074,40 +101234,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Erreur acceptable 
(\"humaine\")" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -101295,12 +101421,6 @@ "annot": "annot_10", "rating": "Correct" } - ], - "question": [ - { - "annot": "annot_4", - "rating": "Question douteuse" - } ] }, "lu_in_question": true, @@ -101346,33 +101466,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "100 % carbone, les nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' avenir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "100 % carbone, les nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les nanotubes et le graphène", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -101406,12 +101533,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -101474,33 +101595,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mines de crayons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans des mines de crayons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mines de crayons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "dans des mines de crayons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "dans des mines de crayons ?", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans des mines de crayons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "dans des mines de crayons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -101522,12 +101650,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -101602,33 +101724,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1 demi- millimètre", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1 demi- millimètre de diamètre", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1 demi- millimètre de diamètre à peine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1 demi- millimètre", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1 demi- millimètre de diamètre", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1 demi- millimètre de diamètre", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1 demi-millimètre de diamètre à peine", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -101638,12 +101767,6 @@ 
"rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -101725,33 +101848,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1 demi- millimètre", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1 demi- millimètre de diamètre", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1 demi- millimètre de diamètre à peine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1 demi- millimètre", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "1 demi- millimètre de diamètre", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1 demi- millimètre de diamètre", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1 demi-millimètre", - "rougeL": 0.4000000000000001 + "rougeL": 0.4000000000000001, + "HScore": 0.5 } }, "human_annot": { @@ -101779,12 +101909,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -101866,33 +101990,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Diplodocus, Variraptor, 
triceratops,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les dinosaures", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -101914,12 +102045,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -102000,33 +102125,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Variraptor, triceratops,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Diplodocus, Variraptor, triceratops, ces animaux étranges ont existé il y a plus de 65 millions d' années, et ils font partie du quotidien d' Éric Buffetaut, directeur de recherches au CNRS.", - "rougeL": 0.1142857142857143 + "rougeL": 0.1142857142857143, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Diplodocus, Variraptor, triceratops", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les dinosaures", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -102048,12 +102180,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -102140,33 +102266,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Serge Haroche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Serge Haroche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Serge Haroche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Serge Haroche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Serge Haroche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Serge Haroche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Serge Haroche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -102200,12 +102333,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -102268,33 +102395,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, 
"human_annot": { @@ -102328,12 +102462,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -102390,33 +102518,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nos songes sans les interpréter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le cerveau", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des rêveurs qui mangent n' importe quoi en dormant, des aveugles qui voient pendant leurs songes, des sourds qui entendent et des paraplégiques qui courent des", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "nos songes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les songes sans les interpréter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nos songes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les songes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -102438,12 +102573,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -102530,33 +102659,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Didier 
Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Didier Raoult.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -102590,12 +102726,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -102664,33 +102794,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Didier Raoult", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -102724,12 +102861,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -102792,33 +102923,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Abel", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "origines de l' homme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' homme", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les origines de l' homme étaient à l' est.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les origines de l' homme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les origines de l'homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -102834,12 +102972,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -102920,33 +103052,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les tombeaux sont peuplés de momies.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -102980,12 +103119,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -103043,33 +103176,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des momies", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -103103,12 +103243,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -103172,33 +103306,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "65 millions d' années", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "65 millions d' années", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "65 millions d' années", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "65 millions d' années,", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plus de 65 millions d' années", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de 65 millions d' années", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "plus de 65 millions d' années", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -103232,12 +103373,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -103294,33 +103429,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "65 millions d' années", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "65 millions d' années", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "65 millions d' années", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il y a plus de 65 millions d' années,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plus de 65 millions d' années", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de 65 millions d' années", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il y a plus de 65 millions d' années", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -103348,12 +103490,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -103422,33 +103558,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "étudie le stress, la qualité du sommeil, la relation entre la santé du patron et celle de son entreprise, la relation entre la santé du patron et celle de ses salariés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "obtenir des résultats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "obtenir des résultats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "obtenir des résultats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": 
"à obtenir des résultats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "obtenir des résultats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à obtenir des résultats.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -103470,12 +103613,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -103544,33 +103681,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des résultats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "résultats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des résultats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des résultats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des résultats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "résultats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des résultats.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -103580,12 +103724,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -103672,33 +103810,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Stéphanie Thiébault a été nommée il y a quelques mois directrice de l' un des 10 instituts du CNRS.", - "rougeL": 0.2608695652173913 + "rougeL": 0.2608695652173913, + "HScore": 1.0 } }, "human_annot": { @@ -103726,12 +103871,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -103794,33 +103933,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il y a quelques mois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Stéphanie Thiébault a été nommée il y a quelques mois directrice de l' un des 10 instituts du CNRS : l' institut écologie et environnement.", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 } }, "human_annot": { @@ -103854,12 +104000,6 @@ 
"rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -103922,33 +104062,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tout savoir sur l' eau du robinet", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un livre très complet « tout savoir sur l' eau du robinet »", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Il a dirigé avec Agathe Euzen un livre très complet « tout savoir sur l' eau du robinet »", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tout savoir sur l' eau du robinet", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Tout savoir sur l' eau du robinet.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' émission « les savanturiers » sur France Inter", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Agathe Euzen est aux commandes d'un livre très complet \"tout savoir sur l'eau du robinet\".", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -103976,12 +104123,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -104044,33 +104185,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tout savoir sur l' eau du robinet", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tout savoir sur l' eau du robinet", - "rougeL": 0.6666666666666666 + "rougeL": 
0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un livre très complet « tout savoir sur l' eau du robinet »", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "tout savoir sur l' eau du robinet", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "journaliste scientifique, animatrice de l' émission « les savanturiers » sur France Inter.", - "rougeL": 0.1 + "rougeL": 0.1, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "un livre très complet « tout savoir sur l' eau du robinet »", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Agathe Euzen dirige avec quelqu'un un livre complet tout savoir sur l'eau du robinet.", - "rougeL": 0.43478260869565216 + "rougeL": 0.43478260869565216, + "HScore": 1.0 } }, "human_annot": { @@ -104086,12 +104234,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -104172,33 +104314,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Francis Eustache", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -104232,12 +104381,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -104297,33 +104440,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Francis Eustache", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -104357,12 +104507,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -104428,33 +104572,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le Dr Laurent Schwartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Laurent Schwartz", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Dr Laurent Schwartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Laurent Schwartz", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Laurent Schwartz", - "rougeL": 0.8333333333333333 + 
"rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Laurent Schwartz", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Laurent Schwartz", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -104476,12 +104627,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -104557,42 +104702,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "département de médecine expérimentale de l' Université Claude Bernard", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "département de médecine expérimentale de l' Université Claude Bernard", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "département de médecine expérimentale de l' Université Claude Bernard", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Université Claude Bernard, à Lyon,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "departement de médecine expérimentale de l' Université Claude Bernard", - "rougeL": 0.6666666666666667 + "rougeL": 0.6666666666666667, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "département de médecine expérimentale de l' Université Claude Bernard, à Lyon", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le département de médecine expérimentale de l' Université Claude Bernard", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ 
-104706,33 +104852,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -104766,12 +104919,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -104846,33 +104993,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'article ne mentionne pas explicitement quelle image l'homme de Néandertal a perdu.", - "rougeL": 0.125 + "rougeL": 
0.125, + "HScore": 0.0 } }, "human_annot": { @@ -104906,12 +105060,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -104974,42 +105122,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "neurobiologiste, philosophe, directeur de recherche émérite au CNRS", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Inter neurobiologiste, philosophe, directeur de recherche", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "neurobiologiste, philosophe, directeur de recherche", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "neurobiologiste, philosophe, directeur de recherche émérite", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "neurobiologiste, philosophe, directeur de recherche émérite au CNRS.", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "neurobiologiste", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Le directeur de recherche émérite au CNRS.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -105102,33 +105251,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "objets extraterrestres, de pierres tombées du ciel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "objets extraterrestres, de pierres tombées du ciel,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "objets extraterrestres, de pierres tombées du ciel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -105138,12 +105294,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -105226,33 +105376,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "objets extraterrestres, de pierres tombées du ciel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "objets extraterrestres, de pierres tombées du ciel,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des pierres tombées du ciel", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + 
"HScore": 0.5 } }, "human_annot": { @@ -105286,12 +105443,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -105356,33 +105507,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les chercheurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les chercheurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les chercheurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chercheurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hypnose, méditation, neurofeedback.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les chercheurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les chercheurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -105416,12 +105574,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -105503,33 +105655,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, 
"mixtral-8x7b": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'image de brute épaisse", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.0 } }, "human_annot": { @@ -105557,12 +105716,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -105643,33 +105796,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "brute épaisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son image de brute épaisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -105685,12 +105845,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -105771,33 +105925,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quantité de matière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matière noire » et « énergie sombre »", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + 
"HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "matière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "quantité de matière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Univers", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la quantité de matière qu'il contient.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -105837,12 +105998,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -105899,33 +106054,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les plantes, les écorces ou les feuilles pour se soigner", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les plantes, les écorces ou les feuilles pour se soigner.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les plantes, les écorces ou les feuilles pour se soigner.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -105947,12 +106109,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - 
"rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -106021,33 +106177,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les plantes, les écorces ou les feuilles pour se soigner", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "feuilles, écorces, plantes", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les plantes, les écorces ou les feuilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -106093,12 +106256,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -106161,33 +106318,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "environnement et l' histoire de chacun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "environnement et l' histoire de chacun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' environnement et l' histoire de chacun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' environnement et l' histoire de chacun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' 
environnement et l' histoire de chacun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l'environnement et l'histoire de chacun", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'environnement et l'histoire de chacun", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -106221,12 +106385,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -106290,33 +106448,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "reconstitue les climats du passé pour imaginer ceux du futur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "reconstitue les climats du passé pour imaginer ceux du futur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "reconstitue les climats du passé pour imaginer ceux du futur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il reconstitue les climats du passé pour imaginer ceux du futur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour reconstituer les climats du passé", - "rougeL": 0.46153846153846156 + "rougeL": 0.46153846153846156, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "reconstitue les climats du passé pour imaginer ceux du futur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "reconstitue les climats du passé", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -106356,12 +106521,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -106412,33 +106571,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "climats du 
passé pour imaginer ceux du futur", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les climats du passé pour imaginer ceux du futur", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les climats du passé", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les climats du passé", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "climats du passé", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "climats du passé", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les climats du passé", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -106472,12 +106638,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -106540,33 +106700,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "Camembert_baseline": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "llama-2_lora": { "answer_pred": "des nécropoles qui datent des balbutiements de la culture égyptienne", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 0.9 }, "mixtral-8x7b": { "answer_pred": "nécropoles qui datent des balbutiements de la culture égyptienne", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 0.9 }, "GPT-3.5": { "answer_pred": "les nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 0.95 } }, "human_annot": { @@ -106822,48 +106989,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -106992,33 +107117,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "n' interagit pas avec la matière visible", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "personne n' a pu identifier", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "personne n' a pu identifier", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "personne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "non, personne n' a pu identifier", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "personne n' a pu identifier", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "personne n'a pu identifier", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -107052,12 +107184,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -107121,33 +107247,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "défricher", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 0.55 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "défricher", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.55 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à défricher", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.6 }, "Camembert_baseline": { "answer_pred": "grande terre inconnue", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.25 }, "llama-2_lora": { "answer_pred": "à défricher.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.6 }, "mixtral-8x7b": { "answer_pred": "à peine à défricher", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.7 }, "GPT-3.5": { "answer_pred": "défricher le cerveau", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.8 } }, "human_annot": { @@ -107445,48 +107578,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Partiellement correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Partiellement correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -107574,33 +107665,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "recréant de la peau à partir de cellules - souches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en recréant de la peau à partir de cellules - souches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "recréant de la 
peau à partir de cellules - souches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en recréant de la peau à partir de cellules - souches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "recréer de la peau à partir de cellules - souches", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "recréant de la peau à partir de cellules - souches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en recréant de la peau à partir de cellules-souches", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 } }, "human_annot": { @@ -107616,12 +107714,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -107710,33 +107802,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des nécropoles qui datent des balbutiements de la culture égyptienne.", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nécropoles qui datent des balbutiements de la culture égyptienne", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -107770,12 +107869,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - 
"rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -107838,33 +107931,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -107892,12 +107992,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -107966,33 +108060,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "balbutiements de la culture égyptienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des balbutiements de la culture égyptienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des balbutiements de la culture égyptienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des balbutiements de la culture égyptienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "depuis plus de 3000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "des balbutiements de la culture égyptienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "des balbutiements de la culture égyptienne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -108026,12 +108127,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -108094,33 +108189,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en savoir plus sur leurs mœurs, leur biologie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à en savoir plus sur leurs murs, leur biologie", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "leurs murs, leur biologie", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "en savoir plus sur leurs mœurs, leur biologie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chair humaine n' est pas a priori ce qu' il préfère.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "en savoir plus sur leurs mœurs, leur biologie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en savoir plus", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -108136,12 +108238,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -108216,33 +108312,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "leurs mœurs, leur biologie", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "leurs murs, leur biologie", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": 
{ + "FLAN-T5-large": { "answer_pred": "sur leurs murs, leur biologie", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "leurs mœurs, leur biologie.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "biologie, mœurs, classification", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "leurs mœurs, leur biologie", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en savoir plus sur leurs mœurs, leur biologie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -108258,12 +108361,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -108344,33 +108441,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Quel est le nom du chercheur ?", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -108416,12 +108520,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -108466,33 +108564,40 @@ } ], "predictions": { - "MT5-large_260_AP0": 
{ + "MT5-large": { "answer_pred": "un requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "quand un requin s' attaque à un homme c' est souvent par erreur, car chair humaine n' est pas a priori ce qu' il préfère.", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un requin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -108502,12 +108607,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -108594,33 +108693,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chair humaine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -108666,12 +108772,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": 
"Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -108722,33 +108822,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "toutes ses connaissances de botaniste", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "botaniste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "botaniste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "toutes ses connaissances de botaniste", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "toutes ses connaissances de botaniste", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "toutes ses connaissances de botaniste", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ses connaissances de botaniste", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -108788,12 +108895,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -108850,42 +108951,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Si vous voulez savoir comment se battre avec des sabres laser ou si la cape d' 
invisibilité d' Harry Potter est une chose crédible", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible", - "rougeL": 0.967741935483871 + "rougeL": 0.967741935483871, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "On demande à Roland Lehoucq comment se battre avec des sabres laser ou si la cape d'invisibilité d'Harry Potter est une chose crédible.", - "rougeL": 0.6666666666666667 + "rougeL": 0.6666666666666667, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -108979,33 +109081,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "étude de la grotte Chauvet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' étude de la grotte Chauvet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' étude de la grotte Chauvet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' étude de la grotte Chauvet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "étude de la grotte Chauvet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' étude de la grotte Chauvet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "l'étude de la grotte Chauvet", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -109033,12 +109142,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -109101,33 +109204,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grotte Chauvet", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grotte Chauvet", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grotte Chauvet", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "grotte Chauvet", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Grotte Chauvet", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' étude de la grotte Chauvet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'étude de la grotte Chauvet", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -109167,12 +109277,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -109229,33 +109333,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "association de molécules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "association de 
molécules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' association de molécules,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "molécules, utilisées habituellement dans le traitement d' autres maladies", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -109277,12 +109388,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -109351,33 +109456,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "association de molécules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "association de molécules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "association de molécules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' association de molécules,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "molécules", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -109387,12 +109499,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -109479,33 +109585,40 @@ } ], "predictions": { 
- "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "contre les idées reçues", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -109527,12 +109640,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -109601,33 +109708,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "couleurs ternes qui oscillaient entre le vert et le marron. Ils pouvaient arborer toutes les couleurs de l' arc-en-ciel.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "non, les dinosaures n' avaient pas des couleurs ternes qui oscillaient entre le vert et le marron. Ils pouvaient arborer toutes les couleurs de l' arc-en-ciel. 
Non, les dinosaures, dont on connaît à ce jour environ un millier d' espèces n' étaient pas tous des géants. Il explique aussi qu' alors qu' on les croyait disparus, les dinosaures sont toujours parmi nous. Ils sont dans votre jardin, sur votre balcon, on mange leurs œufs. Sait -on que les dinosaures n' ont pas disparu ? Que l' on peut encore observer leurs descendants, les oiseaux ? Que ces volatiles disposaient d' un plumage multicolore aux motifs variés ?", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les idées reçues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -109673,12 +109787,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -109729,33 +109837,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Si vous voulez savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "savoir comment se battre avec des sabres laser ou si la cape d' invisibilité d' Harry Potter est une chose crédible,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "réponse à des questions sur la vulgarisation scientifique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { 
"answer_pred": "demandez à Roland Lehoucq", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "comment se battre avec des sabres laser ou si la cape d'invisibilité d'Harry Potter est une chose crédible", - "rougeL": 0.787878787878788 + "rougeL": 0.787878787878788, + "HScore": 0.5 } }, "human_annot": { @@ -109765,12 +109880,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -109858,33 +109967,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bio actif marin", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bio actif marin", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ingéniosité des chercheurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "bio actif marin", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des centaines de bio actif marin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des centaines de bio actif marin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des bio actifs marins", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 } }, "human_annot": { @@ -109918,12 +110034,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -109987,33 +110097,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "méduses", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les méduses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -110023,12 +110140,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -110115,33 +110226,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -110157,12 +110275,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -110237,33 +110349,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dresseur de cellules", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cellules", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "un dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un dresseur de cellules", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -110273,12 +110392,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -110365,33 +110478,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les lions y chassent le bison", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -110425,12 +110545,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -110487,33 +110601,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les lions y chassent le bison", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -110553,12 +110674,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -110615,33 +110730,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "piste prometteuse et audace quant à la fonction des rêves", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "piste prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une piste prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une piste prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une piste prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une piste prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une piste prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -110669,12 +110791,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -110737,33 +110853,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la fonction des rêves", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "piste prometteuse et audacieuse quant à la fonction des rêves", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Nos songes serviraient à maintenir notre individualité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "individualité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Nos songes serviraient à maintenir notre individualité", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il pense avoir trouvé une piste prometteuse", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 0.0 } }, "human_annot": { @@ -110803,12 +110926,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -110865,33 +110982,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -110913,12 +111037,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -110987,33 +111105,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -111047,12 +111172,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -111115,33 +111234,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le goût de la recherche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "goût de la recherche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "le goût de la recherche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le goût de la recherche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le goût de la recherche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le goût de la recherche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le goût de la recherche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -111169,12 +111295,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -111243,33 +111363,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un dysfonctionnement d'un chromosome X pourrait être à l'origine de cancers.", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 } }, "human_annot": { @@ -111297,12 +111424,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -111365,33 +111486,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "cancers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -111431,12 +111559,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -111511,33 +111633,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Stéphane Douady", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Stéphane Douady", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Stéphane Douady", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Stéphane Douady", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Stéphane Douady", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Stéphane Douady", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Stéphane Douady", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -111571,12 +111700,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -111658,33 +111781,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rémy 
Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -111718,12 +111848,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -111786,33 +111910,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Institut photovoltaïque d'Île de France", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -111858,12 +111989,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -111908,33 +112033,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Institut photovoltaque d' Île de France", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Institut photovoltaïque d' Île de France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Institut photovoltaïque d'Île de France", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -111950,12 +112082,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -112036,33 +112162,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des manuscrits, des traités du Moyen Âge, des papyrus égyptiens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "manuscrits, des traités du Moyen ge, des papyrus égyptiens", - "rougeL": 0.8888888888888888 + "rougeL": 0.8888888888888888, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vieux teinturiers, derniers héritiers de traditions parfois plus que millénaire", - "rougeL": 0.10526315789473685 + "rougeL": 0.10526315789473685, + 
"HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "des manuscrits, des traités du Moyen Âge, des papyrus égyptiens.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des manuscrits, des traités du Moyen Âge, des papyrus égyptiens.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "recettes d' autrefois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des recettes d' autrefois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -112102,12 +112235,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -112158,33 +112285,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "manuscrits, des traités du Moyen Âge, des papyrus égyptiens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "manuscrits, des traités du Moyen ge, des papyrus égyptiens", - "rougeL": 0.8888888888888888 + "rougeL": 0.8888888888888888, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "manuscrits, des traités du Moyen ge, des papyrus égyptiens", - "rougeL": 0.8888888888888888 + "rougeL": 0.8888888888888888, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "manuscrits, des traités du Moyen Âge, des papyrus égyptiens.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des manuscrits, des traités du Moyen Âge, des papyrus égyptiens.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "manuscrits, des traités du Moyen Âge, des papyrus égyptiens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "manuscrits, traités du Moyen Âge, papyrus égyptiens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-112230,12 +112364,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -112286,33 +112414,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une équipe du laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "équipe du laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une équipe du laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "une équipe du laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une équipe du laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une équipe du laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "une équipe du laboratoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -112346,12 +112481,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -112415,33 +112544,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "savoir comment les conditions de travail évoluent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "savoir comment les conditions de travail évoluent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à savoir comment les conditions de travail évoluent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "savoir comment les conditions de travail évoluent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "comment les conditions de travail évoluent", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "savoir comment les conditions de travail évoluent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il cherche à savoir comment les conditions de travail évoluent", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -112487,12 +112623,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -112537,33 +112667,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "comment les conditions de travail évoluent", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "comment les conditions de travail évoluent", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "comment les conditions de travail évoluent", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "comment les conditions de travail évoluent", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "comment évoluent les conditions de travail", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "comment les conditions de travail évoluent", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il voudrait savoir comment les conditions de travail évoluent.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -112597,12 +112734,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -112665,33 +112796,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "influence humaine", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "influence humaine", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "influence humaine", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "influence humaine", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hervé le Treut a pris conscience de l' influence humaine dans l' effet de serre et qu' il a décidé d' y consacrer sa vie de chercheur.", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "de l'influence humaine", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -112731,12 +112869,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -112787,42 +112919,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hervé le Treut", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "humaine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "Hervé le Treut", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' influence humaine", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "l'homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -112915,33 +113048,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Annick Loiseau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Annick Loiseau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Annick Loiseau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Annick Loiseau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Annick Loiseau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Annick Loiseau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Annick Loiseau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -112975,12 +113115,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -113063,33 +113197,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Hervé 
Le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hervé le Treut", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -113123,12 +113264,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -113191,33 +113326,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sous le sable d' Égypte", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tombeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tombeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tombeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans les tombeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "tombeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les tombeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -113257,12 +113399,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -113319,33 +113455,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -113373,12 +113516,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -113447,33 +113584,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le chimiste amoureux des mots a trouvé comme titre \"les Savanturiers\".", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -113507,12 +113651,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -113575,33 +113713,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "il y a vingt-cinq siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il y a vingt-cinq siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vingt-cinq siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "il y a vingt-cinq siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans le désert Égyptien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "il y a vingt-cinq siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les ouvriers les ont quittées, il y a vingt-cinq siècles", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 } }, "human_annot": { @@ -113641,12 +113786,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -113703,33 +113842,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' ordinateurs superpuissants,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ordinateurs superpuissants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "À l'aide d'ordinateurs 
superpuissants.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -113769,12 +113915,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -113831,33 +113971,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ils sont soumis, comme certaines mers, à des phénomènes de marées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "qu' ils sont soumis, comme certaines mers, à des phénomènes de marées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ils sont soumis, comme certaines mers, à des phénomènes de marées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "qu' ils sont soumis, comme certaines mers, à des phénomènes de marées.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "qu'ils sont soumis, comme certaines mers, à des phénomènes de marées.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "qu' ils sont soumis, comme certaines mers, à des phénomènes de marées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "qu'ils sont soumis, comme certaines mers, à des phénomènes de marées.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -113879,12 +114026,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -113959,33 +114100,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une première", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une première", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une première", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une première", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une première en recréant de la peau à partir de cellules - souches", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une première", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une première en recréant de la peau à partir de cellules-souches.", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 } }, "human_annot": { @@ -114013,12 +114161,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -114088,33 +114230,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -114160,12 +114309,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": 
"Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -114210,33 +114353,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quatre-vingt", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans les années quatre-vingt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -114366,48 +114516,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -114662,33 +114770,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' astronome Pierre Léna", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -114716,12 +114831,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -114786,33 +114895,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pierre Léna", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -114852,12 +114968,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - 
"rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -114916,33 +115026,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "avec les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -114982,12 +115099,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -115038,33 +115149,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "avec les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les yeux fermés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { 
@@ -115104,12 +115222,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -115166,33 +115278,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "climat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le climat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le climat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le climat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le climat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le climat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le climat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -115220,12 +115339,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -115294,33 +115407,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "influence humaine dans l' effet de serre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hervé le Treut a pris conscience de l' influence humaine dans l' effet de serre", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "influence humaine dans l' effet de serre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' influence humaine dans l' effet de serre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour prendre conscience de l' influence humaine dans l' effet de serre", - 
"rougeL": 0.8333333333333334 + "rougeL": 0.8333333333333334, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' influence humaine dans l' effet de serre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "influence humaine dans l'effet de serre", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -115360,12 +115480,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -115416,42 +115530,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "influence humaine dans l' effet de serre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hervé le Treut a pris conscience de l' influence humaine dans l' effet de serre", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' influence humaine dans l' effet de serre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Hervé le Treut a pris conscience de l' influence humaine dans l' effet de serre", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en 1986 qu' Hervé le Treut a pris conscience de l' influence humaine dans l' effet de serre et qu' il a décidé d' y consacrer sa vie de chercheur.", - "rougeL": 0.5217391304347826 + "rougeL": 0.5217391304347826, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pris conscience de l' influence humaine dans l' effet de serre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hervé le Treut a pris conscience de l'influence humaine dans l'effet de serre et a décidé d'y consacrer sa vie de chercheur.", - "rougeL": 0.30769230769230765 + "rougeL": 0.30769230769230765, + "HScore": 1.0 } 
}, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -115556,33 +115671,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Roland Lehoucq", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Roland Lehoucq", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Roland Lehoucq", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Roland Lehoucq", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Roland Lehoucq", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Roland Lehoucq", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Roland Lehoucq", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -115616,12 +115738,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -115684,33 +115800,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "abandonner un rêve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "abandonner un rêve : celui de devenir guitariste professionnel", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "abandonner un rêve : celui de devenir guitariste professionnel", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' abandonner un rêve : celui de devenir guitariste professionnel.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "devenir guitariste professionnel", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, 
"mixtral-8x7b": { "answer_pred": "d' abandonner un rêve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "d'abandonner un rêve : celui de devenir guitariste professionnel.", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 1.0 } }, "human_annot": { @@ -115726,12 +115849,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -115806,42 +115923,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "abandonner un rêve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "abandonner un rêve : celui de devenir guitariste professionnel", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "abandonner un rêve : celui de devenir guitariste professionnel", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' abandonner un rêve : celui de devenir guitariste professionnel.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "abandonner un rêve", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "décide d' abandonner un rêve", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "abandonner un rêve : celui de devenir guitariste professionnel.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -115934,33 +116052,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des nécropoles qui datent des balbutiements de la culture égyptienne", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nécropoles qui datent des balbutiements de la culture égyptienne", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -115976,12 +116101,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -116056,33 +116175,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des nécropoles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -116128,12 +116254,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -116196,78 +116316,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "mines d' or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mines d' or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mines d' or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "mines d' or,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "llama-2_lora": { "answer_pred": "des mines d'or", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des mines d' or", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les mines d'or", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -116648,42 +116733,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "programmes d' aides envers les plus démunis", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "programmes d' aides", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "programmes d' aides", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "médicaments", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "par 
des programmes d' aides", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "certains programmes d' aides", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "par des programmes d'aides", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -116776,33 +116862,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -116836,12 +116929,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -116899,33 +116986,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ronan Allain", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -116959,12 +117053,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -117028,15 +117116,15 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "eau à l' état liquide", "rougeL": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "eau à l' état liquide", "rougeL": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "eau à l' état liquide", "rougeL": 1.0 }, @@ -117100,33 +117188,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en 1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -117166,12 +117261,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -117223,33 +117312,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1995", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -117289,12 +117385,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -117352,33 +117442,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ce titre « les Savanturiers »", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ce titre « les Savanturiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "journaliste scientifique, animatrice de l' émission « les savanturiers » sur France Inter.", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "ce titre « les Savanturiers »", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le chimiste amoureux des mots est à l'origine du titre \"les Savanturiers\".", - "rougeL": 0.5714285714285714 + "rougeL": 
0.5714285714285714, + "HScore": 1.0 } }, "human_annot": { @@ -117394,12 +117491,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -117474,33 +117565,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Savanturiers", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "« les Savanturiers »", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -117528,12 +117626,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -117602,33 +117694,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ouganda", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ouganda", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ouganda", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ouganda", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ouganda", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ouganda", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ouganda", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -117662,12 +117761,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -117731,33 +117824,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pêcheurs peu scrupuleux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "chair humaine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "un homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -117797,12 +117897,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -117859,33 +117953,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "depuis deux ans", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "depuis deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "depuis deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -117931,12 +118032,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -117981,33 +118076,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "depuis deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "depuis deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "depuis deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "deux ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "depuis deux ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -118023,12 +118125,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -118109,33 +118205,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "émotion qu' elle n' a de cesse de vouloir partager à travers de multiples conférences", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "émotion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "Une émotion qu' elle n' a de cesse de vouloir partager à travers de multiples conférences", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Une émotion qu' elle n' a de cesse de vouloir partager à travers de multiples conférences,", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des conférences", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "multiples conférences", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "des conférences", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -118175,12 +118278,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -118243,33 +118340,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Bretagne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -118309,12 +118413,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -118373,33 +118471,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les mystérieuses « matière noire » et « énergie sombre »", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matière noire » et « énergie sombre »", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les mystérieuses « matière noire » et « énergie sombre »", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les mystérieuses « matière noire » et « énergie sombre »", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la quantité de matière noire et énergie sombre", - "rougeL": 0.5714285714285714 + "rougeL": 0.5714285714285714, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "matière noire et énergie sombre", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les mystérieuses « matière noire » et « énergie sombre ».", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 } }, "human_annot": { @@ -118433,12 +118538,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -118501,33 +118600,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "10 ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "10 ans", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "10 ans avant", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il y a 10 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -118549,12 +118655,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -118647,33 +118747,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Rémy Mosseri", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -118707,12 +118814,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -118775,33 +118876,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "explication au phénomène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une explication au phénomène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une explication au phénomène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une 
explication au phénomène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une explication au phénomène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une explication au phénomène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une explication au phénomène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -118835,12 +118943,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -118921,33 +119023,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le cœlacanthe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le clacanthe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le clacanthe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "le cœlacanthe un poisson préhistorique de 400 millions d' années", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "poisson du fond des âges : le cœlacanthe", - "rougeL": 0.7692307692307693 + "rougeL": 0.7692307692307693, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le cœlacanthe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le cœlacanthe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -118963,12 +119072,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -119049,33 +119152,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nerfs et les muscles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "les nerfs et les muscles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les nerfs et les muscles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les nerfs et les muscles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "nerfs et muscles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les nerfs et les muscles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les nerfs et les muscles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -119103,12 +119213,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -119177,33 +119281,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à l' ouest du grand Rift qui coupe en deux la corne de l' Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à l' ouest du grand Rift qui coupe en deux la corne de l' Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ouest du grand Rift", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "à l' ouest du grand Rift qui coupe en deux la corne de l' Afrique.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à l' ouest du grand Rift", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "à l' ouest du grand Rift qui coupe en deux la corne de l' Afrique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à l'ouest du grand Rift qui coupe en deux la corne 
de l'Afrique", - "rougeL": 0.7 + "rougeL": 0.7, + "HScore": 1.0 } }, "human_annot": { @@ -119243,12 +119354,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -119299,33 +119404,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grand Rift", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grand Rift qui coupe en deux la corne de l' Afrique", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grand Rift", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "grand Rift qui coupe en deux la corne de l' Afrique.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "grand Rift", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "grand Rift", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "à l'ouest du grand Rift qui coupe en deux la corne de l'Afrique.", - "rougeL": 0.7 + "rougeL": 0.7, + "HScore": 1.0 } }, "human_annot": { @@ -119353,12 +119465,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -119427,33 +119533,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "GIEC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "GIEC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "travaux du GIEC", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "du GIEC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "GIEC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hervé le Treut participe activement aux travaux du GIEC.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -119463,12 +119576,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -119549,33 +119656,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "aux travaux du GIEC, qui a rendu public la synthèse de son dernier rapport dimanche dernier", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "travaux", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "aux travaux du GIEC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "GIEC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "travaux", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hervé le Treut participe activement aux travaux du GIEC.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -119621,12 +119735,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -119695,33 +119803,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grand-père", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "Michel L' Hour", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son grand-père", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "son grand-père", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "son grand-père", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "enfant, il pratiquait la pêche avec son grand-père", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son grand-père", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -119743,12 +119858,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -119824,33 +119933,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -119884,12 +120000,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { 
"annot": "annot_6", @@ -119946,33 +120056,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "conscience", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "des signaux de conscience", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -120018,12 +120135,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -120074,33 +120185,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les maladies génétiques", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -120128,12 +120246,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -120196,33 +120308,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "génétiques", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les maladies génétiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -120244,12 +120363,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -120324,33 +120437,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1 million", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1 million", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1 million", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1 million", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1 million", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1 million", - "rougeL": 0.8 + "rougeL": 0.8, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en moyenne 1 million de bulles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -120378,12 +120498,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -120446,33 +120560,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1 million de bulles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1 million de bulles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1 million de bulles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1 million de bulles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1 million de bulles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1 million de bulles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un million de bulles.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -120506,12 +120627,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -120574,33 +120689,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Caen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Caen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Caen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Caen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Caen", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Caen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Caen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -120640,12 +120762,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -120706,33 +120822,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "comité mondial de la recherche spatiale, le Cospar.", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le Cospar.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -120754,12 +120877,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -120828,33 +120945,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Cospar", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Cospar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -120894,12 +121018,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -120961,33 +121079,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Olga", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -121021,12 +121146,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -121083,33 +121202,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -121143,12 +121269,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -121211,33 +121331,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "collectionneur privé", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grand public", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "un collectionneur privé, Jacques Doucet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un collectionneur privé, Jacques Doucet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -121277,12 +121404,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -121335,33 +121456,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { 
+ "FLAN-T5-large": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jacques Doucet", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -121395,12 +121523,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -121477,33 +121599,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Celui-ci", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le Baiser", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les formes qu' il libère sur la toile incarnent un désir rageur et sauvage de déconstruire le corps", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "corps", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le corps", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Le corps", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le corps", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -121549,12 +121678,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -121602,33 
+121725,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "souvent", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Matisse et Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Matisse et Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Apollinaire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Picasso et Matisse", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso et Matisse", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à son rival", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -121668,12 +121798,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -121727,33 +121851,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un choix", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "amitiés les plus marquantes de Picasso", - "rougeL": 0.7777777777777778 + "rougeL": 0.7777777777777778, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "histoire de quelques-unes des amitiés les plus marquantes de Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Il nous fallait faire un choix", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l' histoire de quelques-unes des amitiés les plus marquantes de Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' histoire de quelques-unes des amitiés les plus marquantes de Picasso", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il nous temp aurait fallait faire un choix", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -121763,12 +121894,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -121852,33 +121977,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques-unes des amitiés les plus marquantes de Picasso", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques-unes des amitiés les plus marquantes de Picasso", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rivalité amicale avec les artistes Aussitôt arrivé à Paris, Picasso fait connaissance avec la bohème artistique de l' époque", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' histoire de quelques-unes des amitiés les plus marquantes de Picasso.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' histoire de quelques-unes des amitiés les plus marquantes de Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' histoire de quelques-unes des amitiés les plus marquantes de Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'histoire de quelques-unes des amitiés les plus marquantes de Picasso.", - "rougeL": 0.8695652173913043 + "rougeL": 0.8695652173913043, + "HScore": 1.0 } }, "human_annot": { @@ -121912,12 +122044,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -121983,33 +122109,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Gertrude Stein et son frère Léo", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Gertrude Stein et son frère Léo", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -122037,12 +122170,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -122105,33 +122232,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Gertrude Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Gertrude Stein.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -122165,12 +122299,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": 
"annot_1", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -122233,33 +122361,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -122293,12 +122428,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -122355,33 +122484,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chantre de la « pop culture » américaine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une superstar", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, 
"human_annot": { @@ -122391,12 +122527,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -122501,33 +122631,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Serge de Diaghilev", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Serge de Diaghilev", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -122555,12 +122692,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -122630,33 +122761,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "déclarer sa flamme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "courte phrase, « ma jolie »", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { 
"answer_pred": "Picasso aborde son amour pour la belle Eva.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -122696,12 +122834,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -122770,33 +122902,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -122830,12 +122969,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -122904,33 +123037,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -122964,12 +123104,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -123032,33 +123166,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tout le monde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tout le monde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un coup d' il", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "tout le monde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un coup d' œil", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "tout le monde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -123104,12 +123245,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -123163,33 +123298,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -123235,48 +123377,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -123609,33 +123709,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -123675,12 +123782,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -123737,33 +123838,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa 
virilité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tourmente", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tourmente", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "sa virilité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sa virilité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sa virilité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sa virilité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -123785,12 +123893,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -123866,33 +123968,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rideau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "rideau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "rideau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "rideau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "rideau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le rideau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le rideau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -123932,12 +124041,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -123994,33 +124097,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Autoportrait bleu", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -124054,12 +124164,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -124122,33 +124226,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'autoportrait bleu", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -124182,12 +124293,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": 
"annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -124256,33 +124361,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "moralement et financièrement les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il soutient moralement et financièrement les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le pouvoir est menacé par l' armée nationaliste de Franco", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le pouvoir est menacé par l' armée nationaliste de Franco.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "soutenir moralement et financièrement les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "soutient moralement et financièrement les républicains espagnols, dont le pouvoir est menacé par l' armée nationaliste de Franco", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pour soutenir moralement et financièrement les républicains espagnols, dont le pouvoir est menacé par l'armée nationaliste de Franco.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -124322,12 +124434,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -124402,33 +124508,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, 
"Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Cargos Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -124462,12 +124575,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -124542,33 +124649,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cargos Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Cargos Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -124596,12 +124710,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -124682,33 +124790,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cirque Medrano", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cirque Medrano", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 
1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cirque Medrano", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cirque Medrano", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "cirque Medrano", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cirque Medrano", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le cirque Medrano", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -124742,12 +124857,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -124813,33 +124922,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -124873,12 +124989,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -124941,33 +125051,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "très tôt", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "très tôt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "très tôt dans sa carrière", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "très tôt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "1905, au Salon d' automne de Paris", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "En 1905", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "très tôt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -124977,12 +125094,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -125069,33 +125180,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -125135,12 +125253,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -125193,33 +125305,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso 
ne débarque pas tout seul du train", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ne débarque pas tout seul du train", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso ne débarque pas tout seul du train", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Picasso ne débarque pas tout seul du train", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "il arrive à Paris en octobre 1900, Picasso ne débarque pas tout seul du train. Il est accompagné de. Cargos Casagemas, un camarade qu' il a connu deux ans plus tôt à Barcelone pendant ses études d' art.", - "rougeL": 0.06451612903225806 + "rougeL": 0.06451612903225806, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso ne débarque pas tout seul du train", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "en train", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -125247,12 +125366,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -125323,33 +125436,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1917", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -125383,12 +125503,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -125452,33 +125566,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "se servir du seul véritable moyen à sa mesure : l' art", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "partisan du pouvoir républicain menacé", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art.", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il accepte", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l'art.", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { @@ -125506,12 +125627,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -125571,33 +125686,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "se servir du seul véritable moyen à sa mesure : l' art", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
- "T5-large_260_AP0": { + "T5-large": { "answer_pred": "partisan du pouvoir républicain menacé", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art.", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art.", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "par l'art", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -125613,12 +125735,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -125708,33 +125824,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-125768,12 +125891,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -125839,33 +125956,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -125899,12 +126023,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -125964,33 +126082,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Claes Oldenburd", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 
0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -126030,12 +126155,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -126087,33 +126206,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' Homme au mouton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' Homme au mouton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "L' Homme au mouton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "L' Homme au mouton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "L' Homme au mouton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "\"L' Homme au mouton\"", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -126147,12 +126273,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -126216,33 +126336,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1909", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1909", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1909", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1909", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1909", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1909", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "À partir de 1909", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -126282,12 +126409,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -126356,33 +126477,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la jeune République espagnole", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -126410,12 +126538,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -126485,33 +126607,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en 1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en 1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en 1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -126557,12 +126686,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -126608,33 +126731,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1915", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -126662,12 +126792,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -126737,33 +126861,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -126797,12 +126928,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -126860,33 +126985,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -126920,12 +127052,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -126989,33 +127115,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, 
"Camembert_baseline": { "answer_pred": "pas moins de 800 études", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Picasso a réalisé des études.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.0 } }, "human_annot": { @@ -127049,12 +127182,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -127111,33 +127238,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 } }, "human_annot": { @@ -127183,12 +127317,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -127239,33 +127367,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -127293,12 +127428,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -127361,33 +127490,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le taureau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -127415,12 +127551,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -127489,33 +127619,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -127555,12 +127692,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -127618,33 +127749,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -127654,12 +127792,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -127741,33 +127873,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "des avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des avions allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -127789,12 +127928,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -127870,33 +128003,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1971.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -127924,12 +128064,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -127995,33 +128129,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "En 1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1971", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -128061,12 +128202,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -128126,33 +128261,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son ami Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -128186,12 +128328,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -128248,33 +128384,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, 
- "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -128308,12 +128451,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -128376,33 +128513,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ennuyer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' ennuyer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' ennuyer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cette vie commence à l' ennuyer.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' ennuyer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' ennuyer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cette vie commence à l'ennuyer.", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 } }, "human_annot": { @@ -128418,12 +128562,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -128504,33 +128642,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "métamorphose les objets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, 
- "T5-large_260_AP0": { + "T5-large": { "answer_pred": "métamorphose les objets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il métamorphose les objets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "il métamorphose les objets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il détecte en eux des potentialités qui les font passer d' objets du quotidien à des objets d' art.", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "il métamorphose les objets", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "il métamorphose les objets.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -128552,12 +128697,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -128644,33 +128783,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La jeune femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso, Fernande Olivier, Eva Gouel", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la jeune femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -128704,12 +128850,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": 
"annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -128778,33 +128918,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La jeune femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Fernande Olivier, Eva Gouel", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -128838,12 +128985,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -128906,33 +129047,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 } }, "human_annot": { @@ -128966,12 +129114,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -129028,33 +129170,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur la scène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -129088,12 +129237,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -129156,33 +129299,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son ami et compatriote Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il s'est volontairement représenté sous les traits d' un homme plus âgé, les joues creusées et le regard perdu", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les joues creusées et le regard perdu", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "regard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": 
"le regard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le regard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son ami et compatriote Casagemas.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -129204,12 +129354,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -129284,33 +129428,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en détail", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8125 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en détail plus loin", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.9375 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "en détail", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8125 }, "Camembert_baseline": { "answer_pred": "en détail", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8125 }, "llama-2_lora": { "answer_pred": "en détail plus loin", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.9375 }, "mixtral-8x7b": { "answer_pred": "en détail plus loin", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.9375 }, "GPT-3.5": { "answer_pred": "nous étudierons en détail", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.8125 } }, "human_annot": { @@ -129348,40 +129499,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Partiellement correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -129619,16 +129736,6 @@ "annot": "annot_10", "rating": 
"Correct" } - ], - "question": [ - { - "annot": "annot_4", - "rating": "Question douteuse" - }, - { - "annot": "annot_8", - "rating": "Question douteuse" - } ] }, "lu_in_question": true, @@ -129686,33 +129793,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La jeune femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso la tient éloignée de son atelier durant la réalisation des « Demoiselles d' Avignon »", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso la tient éloignée de son atelier durant la réalisation des « Demoiselles d' Avignon ».", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Fernande Olivier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La jeune femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -129728,12 +129842,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -129814,33 +129922,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1966 - 1975", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des années 1966 - 1975", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "années 1966 - 1975", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des années 1966 - 1975", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1966 - 1975", 
- "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "années 1966 - 1975", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des années 1966 - 1975.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -129886,12 +130001,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -129936,33 +130045,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1966 - 1975", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1966 - 1975", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1966 - 1975", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1966 - 1975", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1966 - 1975", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "années 1966 - 1975", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des années 1966 - 1975.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -129996,12 +130112,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -130064,33 +130174,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sculptures en tôles pliées", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -130112,12 +130229,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -130186,33 +130297,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en tôles pliées", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -130252,12 +130370,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -130314,33 +130426,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "femme", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sept compagnes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l'arrivée ou le départ d' une femme", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Fernande Olivier", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "une femme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -130362,12 +130481,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -130442,33 +130555,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -130496,12 +130616,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -130588,33 +130702,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa 
queue-de-cheval et son long cou représentés de manière stylisée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sa queue-de-cheval et son long cou", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sa queue-de-cheval et son long cou", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sa queue-de-cheval et son long cou", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "queue-de-cheval et son long cou", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sa queue-de-cheval et son long cou représentés de manière stylisée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sa queue-de-cheval et son long cou", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -130624,12 +130745,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -130716,33 +130831,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "acide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "acide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "acide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' acide,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de l' acide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "de l' acide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "de l' acide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -130752,12 +130874,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -130844,33 +130960,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "allusion cachée", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "allusion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un oiseau ensanglanté", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "il a recours à la métaphore, au symbole, à l' allusion cachée.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "allusion cachée", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "allusion cachée", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'allusion cachée.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -130880,12 +131003,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -130972,33 +131089,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Françoise", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Françoise se lasse cependant des humeurs de Picasso et aussi de ses infidélités.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 } }, "human_annot": { @@ -131026,12 +131150,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -131094,33 +131212,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Françoise", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -131160,12 +131285,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -131240,33 +131359,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "lui ôte tout réalisme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cette sculpture grandeur nature n' a rien de très classique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' Homme au mouton", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' Homme au mouton", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "rêalisme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "L' Homme au mouton", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sa sculpture grandeur nature", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -131282,12 +131408,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -131368,33 +131488,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "femme au corps démesurément étiré", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "la femme au corps démesurément étiré", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 } }, "human_annot": { @@ -131416,12 +131543,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -131491,33 +131612,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cette femme au corps 
démesurément étiré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la femme au corps démesurément étiré", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 } }, "human_annot": { @@ -131563,12 +131691,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -131620,33 +131742,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bombardement de la ville", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bombardement de la ville", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "bombardement de la ville", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Guernica", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la ville par des avions allemands", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -131662,12 +131791,6 @@ "rating": "Partiellement correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -131748,33 +131871,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -131802,12 +131932,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -131870,33 +131994,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "En 1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1950", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -131936,12 +132067,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", 
- "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -131998,33 +132123,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -132064,12 +132196,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -132120,33 +132246,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mort de Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -132186,12 +132319,6 @@ 
"rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -132260,42 +132387,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa signature", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sa signature", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "coup d' il sa signature", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "sa signature", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "son style", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "son style ou même d' identifier d' un coup d' œil sa signature", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son style", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -132389,33 +132517,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Chicago", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Chicago", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Chicago", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Chicago", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Chicago", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Chicago", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Chicago", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -132455,12 +132590,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -132548,33 +132677,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -132608,12 +132744,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -132700,33 +132830,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 } }, "human_annot": { @@ -132760,12 +132897,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -132828,33 +132959,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -132870,12 +133008,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -132950,33 +133082,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un pistolet", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -133016,12 +133155,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -133078,33 +133211,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "doubles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "doubles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "journal intime codé", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "doubles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "mi - homme mi - taureau, ou l' intrépide « Mousquetaire ».", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "son œuvre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Les figures viennent de l'œuvre de Picasso.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -133132,12 +133272,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -133219,33 +133353,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la communauté internationale indignée, qui veut faire la lumière sur le drame? 
Ou encore de la raison alarmée, qui veut maintenir au cœur de l' horreur les lumières de l' esprit", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "La femme à la lampe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La femme à la lampe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "La femme à la lampe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sur le toit de l' édifice et les vêtements de la femme", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "dans le personnage de la femme à la lampe", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "il cache un symbole", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -133261,12 +133402,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -133347,33 +133482,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "personnage masculin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au personnage masculin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au personnage masculin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "au personnage masculin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au personnage masculin", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -133413,12 +133555,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -133469,33 +133605,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "personnage masculin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au personnage masculin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Casagemas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "au personnage masculin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -133523,12 +133666,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -133597,33 +133734,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le 21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "21 janvier 1939", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -133651,12 +133795,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -133721,33 +133859,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 21 janvier 1939", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -133787,12 +133932,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -133863,33 +134002,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Malaga en Espagne", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Malaga en Espagne", - "rougeL": 0.5 + "rougeL": 
0.5, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il part de l'Espagne.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -133911,12 +134057,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -133997,33 +134137,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Espagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Espagne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -134063,12 +134210,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -134125,33 +134266,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "llama-2_lora": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -134191,12 +134339,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -134247,33 +134389,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 } }, "human_annot": { @@ -134313,12 +134462,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -134375,33 +134518,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "patte dressée et le triangle d' un bec", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une patte dressée 
et le triangle d' un bec", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une patte dressée et le triangle d' un bec", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une patte dressée et le triangle d' un bec.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un triangle d' un bec", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une patte dressée et le triangle d' un bec", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une patte dressée et le triangle d'un bec", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -134435,12 +134585,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -134503,33 +134647,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "avec un pistolet dans un café", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "a tiré sur Germaine avec un pistolet dans un café, avant de retourner l' arme contre lui", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Casagemas a tiré sur Germaine avec un pistolet dans un café", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans un café", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "avec un pistolet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": 
"Casagemas a tiré avec un pistolet.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -134539,12 +134690,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -134631,33 +134776,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les oppose : « Picasso est imprévisible et capricieux », affirmait Matisse, « Matisse peint de beaux et élégants tableaux », aurait un jour lancé Picasso", - "rougeL": 0.14814814814814814 + "rougeL": 0.14814814814814814, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Matisse et Picasso", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso et Matisse", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ils s' envoient", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -134703,12 +134855,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -134763,33 +134909,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "femme au corps démesurément étiré", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "La femme au corps démesurément étiré", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 
1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La femme marchant Cette femme au corps démesurément étiré", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le corps démesurément étiré", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La femme marchant", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 1.0 } }, "human_annot": { @@ -134805,12 +134958,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -134892,33 +135039,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sculptures en tôles pliées", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -134964,12 +135118,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", 
- "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -135014,33 +135162,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en tôles pliées", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -135080,12 +135235,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -135142,15 +135291,15 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "se servir du seul véritable moyen à sa mesure : l' art", "rougeL": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "partisan du pouvoir républicain menacé", "rougeL": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il va se servir du seul véritable moyen à sa mesure : l' art", "rougeL": 0.0 }, @@ -135228,33 +135377,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le spectacle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "spectacle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": 
{ + "FLAN-T5-large": { "answer_pred": "Parade a mobilisé les plus grands talents de l' avant-garde du moment en danse, en théâtre, en musique et en art", - "rougeL": 0.11111111111111112 + "rougeL": 0.11111111111111112, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Le spectacle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le spectacle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Parade", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le spectacle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -135276,12 +135432,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -135381,33 +135531,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Carlota Valdivia", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -135435,12 +135592,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -135528,33 +135679,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Carlota Valdivia", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso, c' est une constante, ne cherche pas à plaire. Il sent qu' il est arrivé au bout de ses recherches, qu' il doit trouver un nouveau souffle.", - "rougeL": 0.2666666666666667 + "rougeL": 0.2666666666666667, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -135582,12 +135740,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -135657,33 +135809,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Autoportrait bleu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Autoportrait bleu", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -135711,12 +135870,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -135791,33 +135944,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -135845,12 +136005,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -135913,33 +136067,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1973", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -135979,12 +136140,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -136041,33 +136196,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' Espagnol", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' Espagnol", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.35 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.35 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.35 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.35 }, "GPT-3.5": { "answer_pred": "Picasso.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.35 } }, "human_annot": { @@ -136323,48 +136485,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_3", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Partiellement correct" - }, - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -136493,33 +136613,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "support placé à l' arrière", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une sculpture a -t -elle besoin d' être rigide pour être une 
sculpture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vieille bouée dégonflée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "d' un support placé à l' arrière,", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un support placé à l' arrière", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un support placé à l' arrière", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "d'un support", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -136559,12 +136686,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -136621,33 +136742,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -136675,12 +136803,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -136743,33 +136865,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "25 
mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 25 mai 1937", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -136797,12 +136926,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -136871,33 +136994,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petites piques", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petits piques", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les petites piques", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "petites piques", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "petites piques", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "petites piques", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les petites piques", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ 
-136925,12 +137055,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -136999,33 +137123,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Casagemas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Casagemas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Casagemas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Casagemas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Casagemas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Casagemas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Casagemas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -137059,12 +137190,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -137139,33 +137264,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-137199,12 +137331,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -137267,42 +137393,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "surface bien lisse, bien propre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "faire une surface bien lisse, bien propre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une surface bien lisse, bien propre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "une surface bien lisse, bien propre.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "une surface bien lisse, bien propre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "n' a pas cherché à faire une surface bien lisse, bien propre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "une surface bien lisse, bien propre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -137394,33 +137521,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "de nombreuses esquisses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "de nombreuses esquisses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de nombreuses esquisses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de nombreuses esquisses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de nombreuses esquisses", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "de nombreuses esquisses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso a réalisé de nombreuses esquisses pour cette toile très réfléchie.", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 } }, "human_annot": { @@ -137448,12 +137582,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -137522,33 +137650,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour l' affiche du Congrès mondial des partisans de la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour l' affiche du Congrès mondial des partisans de la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour l' affiche du Congrès mondial des partisans de la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pour l' affiche du Congrès mondial des partisans de la paix.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour l'affiche du Congrès mondial des partisans de la paix", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour l' affiche du Congrès mondial des partisans de la paix", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il l'a utilisé pour l'affiche du Congrès mondial des partisans de la paix.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -137582,12 +137717,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -137650,33 +137779,40 @@ } ], "predictions": { 
- "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "période bleue", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "période bleue", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "période bleue", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "bleu. » Ainsi débute, à l' automne 1901, la « période bleue", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "« période bleue »", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "période bleue", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la « période bleue » de Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -137710,12 +137846,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -137778,33 +137908,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -137838,12 +137975,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - 
"rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -137900,33 +138031,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -137960,12 +138098,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -138058,33 +138190,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso n' a pas vingt ans lorsqu' il quitte l' Espagne.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -138118,12 +138257,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - 
{ - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -138186,33 +138319,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "État français", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à l'État", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -138240,12 +138380,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -138308,33 +138442,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "L' État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l' État français", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' État", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'État français", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -138356,12 +138497,6 
@@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -138436,33 +138571,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sa flamme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ma jolie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Picasso déclare sa flamme à la belle Eva.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -138472,12 +138614,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -138564,33 +138700,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sacrifice suprême, littéralement de l' holocauste", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "On n'exige rien de lui dans cet extrait de l'article.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -138618,12 +138761,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -138686,33 +138823,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sacrifier son fils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas de séquence de mots dans l'article qui répond à la question \"Qu'est-ce qui lui est demandé ?\"", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -138740,12 +138884,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -138814,33 +138952,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bombardement de la ville", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Guernica", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Guernica", - 
"rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Guernica", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la ville par des avions allemands", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la ville de Guernica", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -138886,12 +139031,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -138942,33 +139081,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en mars 1938.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -139008,12 +139154,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -139065,33 +139205,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mars 1938", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en mars 1938", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -139131,12 +139278,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -139194,33 +139335,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "son ami Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son ami Matisse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -139230,12 +139378,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -139316,33 +139458,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": 
"Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Matisse", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Matisse.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -139382,12 +139531,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -139444,33 +139587,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la jeune République espagnole", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la jeune République espagnole", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "République espagnole", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La jeune République espagnole", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -139480,12 +139630,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -139569,33 +139713,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tableau scandaleux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le Bain turc", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un tableau scandaleux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le Bain turc", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "« Le Bain turc »", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Le Bain turc", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "« Le Bain turc »", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -139617,12 +139768,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -139698,33 +139843,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les toiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les toiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les toiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les toiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des papiers collés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les toiles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un collage de surfaces colorées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -139758,12 +139910,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -139826,33 +139972,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 
+ "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -139892,12 +140045,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -139949,33 +140096,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "Jean Cocteau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -140009,12 +140163,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -140078,33 +140226,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "800", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "800 études", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -140114,12 +140269,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -140206,33 +140355,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la liaison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la liaison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la liaison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la liaison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la liaison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la liaison", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "GPT-3.5": { "answer_pred": "La liaison est découverte.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -140260,12 +140416,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -140334,33 +140484,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -140394,12 +140551,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -140456,33 +140607,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -140516,12 +140674,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -140584,33 +140736,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les musées nationaux n' avaient pu prêter que 8 uvres, les seules en leur possession", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "les musées nationaux n' avaient pu prêter que 8 œuvres,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "pour les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -140650,12 +140809,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -140706,33 +140859,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les musées nationaux n' avaient pu prêter que 8 œuvres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le décès de Picasso est donc l' occasion pour la France d' acquérir les 
pièces majeures qui manquent encore à ses collections", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "les musées nationaux n' avaient pu prêter que 8 œuvres,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les 90 ans du maître", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -140778,12 +140938,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -140834,33 +140988,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -140894,12 +141055,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -140957,33 +141112,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -141017,12 +141179,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -141086,33 +141242,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sculptures en tôles pliées", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -141152,12 +141315,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -141208,33 +141365,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en tôles pliées", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de grandes sculptures en tôles pliées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -141280,12 +141444,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -141348,33 +141506,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quatre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Quatre assistants aident Picasso.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -141396,12 +141561,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -141477,33 +141636,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -141525,12 +141691,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -141607,33 +141767,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": 
"Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -141667,12 +141834,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -141737,33 +141898,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -141797,12 +141965,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -141859,33 +142021,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jacob", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 } }, "human_annot": { @@ -141919,12 +142088,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -141987,42 +142150,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "moralement et financièrement les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "soutient moralement et financièrement les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le pouvoir est menacé par l' armée nationaliste de Franco", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "le pouvoir est menacé par l' armée nationaliste de Franco.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "soutenait moralement et financièrement les républicains espagnols", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "soutient moralement et financièrement les républicains espagnols, dont le pouvoir est menacé par l' armée nationaliste de Franco", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le pouvoir est menacé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -142117,33 +142281,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le portrait cubiste", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un portrait cubiste", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso réalise le portrait cubiste de son ami.", - "rougeL": 0.7692307692307693 + "rougeL": 0.7692307692307693, + "HScore": 1.0 } }, "human_annot": { @@ -142189,12 +142360,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -142239,33 +142404,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le portrait cubiste de son ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -142305,12 +142477,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { 
"annot": "annot_5", @@ -142367,33 +142533,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "s' installer dans un atelier de Montparnasse", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "s' installer dans un atelier de Montparnasse", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "s' installer dans un atelier de Montparnasse", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "s' installer dans un atelier de Montparnasse,", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un atelier de Montparnasse", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "de s' installer dans un atelier de Montparnasse", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "s'installer dans un atelier de Montparnasse", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -142439,12 +142612,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -142513,33 +142680,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -142573,12 +142747,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -142653,33 +142821,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -142713,12 +142888,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -142781,33 +142950,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "marque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "marque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "marque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "marque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une marque", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une marque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une marque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -142841,12 +143017,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -142921,33 +143091,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -142981,12 +143158,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -143055,33 +143226,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, 
+ "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -143115,12 +143293,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -143195,33 +143367,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -143255,12 +143434,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -143320,33 +143493,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso réalise le portrait cubiste de son ami.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -143380,12 +143560,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -143442,33 +143616,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Picasso", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Picasso réalise le portrait cubiste de son ami", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -143496,12 +143677,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -143570,33 +143745,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "se convertir au catholicisme", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "se convertir au catholicisme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -143630,12 +143812,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -143698,33 +143874,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "neuf mois de travaux préparatoires, de croquis, d' esquisses, de tâtonnements", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "recherches picturales", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "picturales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "recherches picturales", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il dévore tout ce qu' il découvre, et il l' intègre à son art.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "croquis, esquisses, tâtonnements", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "recherches picturales", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -143758,12 +143941,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -143826,33 +144003,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "toute séduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "toute séduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "toute séduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "toute séduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "séduction", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "toute séduction", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le corps de la femme nue a perdu toute séduction", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -143880,12 +144064,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -143954,33 +144132,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "10 mètres de haut sur 16 mètres de long", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "10 mètres de haut sur 16 mètres de long", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -144020,12 +144205,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -144073,33 +144252,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "16 mètres de long", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "10 mètres de haut sur 16 mètres de long", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "10 mètres de haut sur 16 mètres de long", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -144121,12 +144307,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -144192,33 +144372,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "16 mètres", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "10 mètres de haut sur 16 mètres de long", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "plus de 10 mètres de haut sur 16 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -144252,12 +144439,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -144322,42 +144503,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grâce à une intervention militaire athénienne", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "renversé et tué grâce à une intervention militaire athénienne", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grâce à une intervention militaire athénienne", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "renversé et tué grâce à une intervention militaire athénienne", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par une intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "renversé et tué", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "tué grâce à une intervention militaire athénienne", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": 
"annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -144444,33 +144626,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "militaire athénienne", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'intervention militaire athénienne", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 1.0 } }, "human_annot": { @@ -144504,12 +144693,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -144572,33 +144755,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "édifice avec un étage unique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "étage unique", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la « haute terrasse »,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": 
{ "answer_pred": "édifice avec un étage unique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la \"haute terrasse\"", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -144620,12 +144810,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -144696,33 +144880,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "haute terrasse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la \"haute terrasse\"", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -144762,12 +144953,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -144826,33 +145011,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "du VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -144892,12 +145084,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -144948,33 +145134,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "VIe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "VIe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "VIe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "VIe siècle.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -145014,12 +145207,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -145076,33 +145263,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur ce site", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": 
"Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sur ce site", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ce site", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur ce site", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -145148,12 +145342,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -145200,33 +145388,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Japon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le Japon.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -145254,12 +145449,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -145330,33 +145519,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Al 
Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -145390,12 +145586,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -145458,33 +145648,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tombe de saint Lazare dans la cathédrale d' Autun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tombe de saint Lazare dans la cathédrale d' Autun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tombe de saint Lazare dans la cathédrale d' Autun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tombe de saint Lazare", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tombe de saint Lazare dans la cathédrale d' Autun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "de la tombe de saint Lazare dans la cathédrale d' Autun, actuellement au musée Rolin", - "rougeL": 0.8181818181818181 + "rougeL": 0.8181818181818181, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de la tombe de saint Lazare dans la cathédrale d' Autun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -145494,12 +145691,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -145580,33 +145771,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"tombe de saint Lazare dans la cathédrale d' Autun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tombe de saint Lazare dans la cathédrale d' Autun", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tombe de saint Lazare", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "tombe de saint Lazare", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "tombe de saint Lazare", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "de la tombe de saint Lazare", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la tombe de saint Lazare", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 0.0 } }, "human_annot": { @@ -145616,12 +145814,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -145720,33 +145912,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "llama-2_lora": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "mixtral-8x7b": { "answer_pred": "les terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les terrassiers", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -146002,48 +146201,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -146178,33 +146335,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les terrassiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -146244,12 +146408,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -146306,33 +146464,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "raid 
de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "raid de 259/260.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "259/260", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -146360,12 +146525,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -146429,33 +146588,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "raid de 259/260.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "259/260", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le raid de 259/260", - "rougeL": 0.888888888888889 + "rougeL": 
0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -146501,12 +146667,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -146558,33 +146718,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "celui de droite, en entrant dans le palais", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "celui de droite, en entrant dans le palais", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le savant se soit décidé pour « celui de droite, en entrant dans le palais [ sic ] »", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "monolithe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le plus petit des deux monuments", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "celui de droite, en entrant dans le palais", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le monument/ l'obélisque/n le plus petit des deux et le plus abîmé.", - "rougeL": 0.08695652173913043 + "rougeL": 0.08695652173913043, + "HScore": 0.5 } }, "human_annot": { @@ -146606,12 +146773,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -146680,33 +146841,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "celui de droite", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "celui de droite, en entrant dans le palais", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lequel des deux monuments en partie recouverts de sable, 
devait en premier être envoyé en France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "lequel des deux monuments", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le plus petit des deux", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "celui de droite, en entrant dans le palais", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "« celui de droite, en entrant dans le palais »", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -146740,12 +146908,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -146820,33 +146982,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Urartéens", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Urartu", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 } }, "human_annot": { @@ -146880,12 +147049,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -146949,33 +147112,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1846", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -147015,12 +147185,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -147071,33 +147235,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1846", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 20 novembre 1846", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -147125,12 +147296,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -147199,33 
+147364,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' espace occupé fut rendu aux Romains", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l'espace occupé est rendu aux Romains et le Colisée est édifié sur l'emplacement du lac", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'espace occupé fut rendu aux Romains et le Colisée fut édifié sur l'emplacement du lac.", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -147253,12 +147425,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -147321,33 +147487,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' espace occupé fut rendu aux Romains", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'espace occupé fut rendu aux Romains.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -147369,12 +147542,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -147449,33 +147616,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le dux Ibba.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -147497,12 +147671,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - 
], "llama-2_lora": [ { "annot": "annot_2", @@ -147589,33 +147757,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa taille énorme et la qualité de ses détails", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sa taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "taille énorme et la qualité de ses détails", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "sa taille énorme et la qualité de ses détails", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sa taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "taille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la taille énorme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -147649,12 +147824,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -147716,33 +147885,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1883", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1884", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1885", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "1883, par M. 
Alexandre Bertrand, 1884.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "1883, 1884, 1885", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1883, 1884, 1885", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1883, 1884, 1885.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -147752,12 +147928,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -147845,33 +148015,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Roderick Eric Davis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Roderick Eric Davis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Roderick Eric Davis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Roderick Eric Davis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Roderick Eric Davis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Roderick Eric Davis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Roderick Eric Davis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -147905,12 +148082,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -147986,33 +148157,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois promontoires rocheux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.7 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trois promontoires rocheux", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 0.7 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trois promontoires rocheux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.7 }, "Camembert_baseline": { "answer_pred": "trois promontoires rocheux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.7 }, "llama-2_lora": { "answer_pred": "Yenicekale, Sarıkale et Nisantepe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.7 }, "mixtral-8x7b": { "answer_pred": "trois promontoires rocheux situés dans l' alignement entre la porte des Lions et de Büyükkale respectivement Yenicekale, Sarıkale et Nisantepe", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 0.9 }, "GPT-3.5": { "answer_pred": "Yenicekale, Sarıkale et Nisantepe.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.7 } }, "human_annot": { @@ -148310,48 +148488,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -148449,33 +148585,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": 
"pour être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la maison circulaire fouillée au Nord du plateau à l' emplacement du site d' activités Unexpo", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "pour être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour être présentée au public", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -148491,12 +148634,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -148571,33 +148708,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "présenté au public", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "présenter au public", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour être présenté au public", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -148613,12 +148757,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -148699,33 +148837,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour expliquer l' introduction des poteries mycéniennes que la 
deuxième invasion n' aurait pas ramenées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour expliquer l' introduction des poteries mycéniennes que la deuxième invasion n' aurait pas ramenées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour expliquer l' introduction des poteries mycéniennes que la deuxième invasion n' aurait pas ramenées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pour expliquer l' introduction des poteries mycéniennes", - "rougeL": 0.761904761904762 + "rougeL": 0.761904761904762, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "pour expliquer l' introduction des poteries mycéniennes que la deuxième invasion n' aurait pas ramenées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "expliquer l' introduction des poteries mycéniennes que la deuxième invasion n' aurait pas ramenées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "expliquer l'introduction des poteries mycéniennes que la deuxième invasion n'aurait pas ramenées.", - "rougeL": 0.8275862068965517 + "rougeL": 0.8275862068965517, + "HScore": 1.0 } }, "human_annot": { @@ -148771,12 +148916,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -148821,33 +148960,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' introduction des poteries mycéniennes que la deuxième invasion n' aurait pas ramenées", - "rougeL": 0.9600000000000001 + "rougeL": 0.9600000000000001, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "introduction des poteries mycéniennes que la deuxième invasion n' aurait pas ramenées", - "rougeL": 0.9600000000000001 + "rougeL": 0.9600000000000001, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' introduction des poteries mycéniennes que la deuxième invasion n' aurait pas ramenées", - "rougeL": 0.9600000000000001 + "rougeL": 0.9600000000000001, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' introduction des poteries mycéniennes", - "rougeL": 0.7000000000000001 + "rougeL": 0.7000000000000001, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l' introduction des poteries mycéniennes", - "rougeL": 0.7000000000000001 + "rougeL": 0.7000000000000001, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' introduction des poteries mycéniennes", - "rougeL": 0.7000000000000001 + "rougeL": 0.7000000000000001, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'introduction des poteries mycéniennes que la deuxième invasion n'aurait pas ramenées", - "rougeL": 0.7857142857142856 + "rougeL": 0.7857142857142856, + "HScore": 1.0 } }, "human_annot": { @@ -148863,12 +149009,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -148949,33 +149089,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "basiliques byzantines, désignées selon leur emplacement dans l' enceinte de la ville comme basilique orientale, basilique occidentale et basilique occidentale, et une quatrième hors les murs, associée à la nécropole", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "basiliques byzantines", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trois basiliques byzantines, désignées selon leur emplacement dans l'enceinte de la ville comme basilique orientale, basilique centrale et basilique occidentale, et une quatrième", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 
}, "Camembert_baseline": { "answer_pred": "basiliques byzantines", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "basiliques byzantines", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "quatre basiliques byzantines", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les basiliques byzantines", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 } }, "human_annot": { @@ -148997,12 +149144,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -149073,33 +149214,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "basilique orientale, basilique centrale et basilique occidentale", - "rougeL": 0.1818181818181818 + "rougeL": 0.1818181818181818, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "basilique orientale, basilique centrale et basilique occidentale, et une quatrième hors les murs", - "rougeL": 0.14285714285714285 + "rougeL": 0.14285714285714285, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trois basiliques byzantines, désignées selon leur emplacement dans l'enceinte de la ville comme basilique orientale, basilique centrale et basilique occidentale, et une quatrième", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "basilique orientale, basilique centrale et basilique occidentale, et une quatrième", - "rougeL": 0.16666666666666666 + "rougeL": 0.16666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "basilique orientale, basilique centrale, basilique occidentale et une quatrième hors les murs", - "rougeL": 0.14285714285714285 + "rougeL": 0.14285714285714285, + "HScore": 1.0 
}, "mixtral-8x7b": { "answer_pred": "basilique orientale, basilique centrale et basilique occidentale, et une quatrième hors les murs", - "rougeL": 0.14285714285714285 + "rougeL": 0.14285714285714285, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "basilique orientale, basilique centrale et basilique occidentale", - "rougeL": 0.1818181818181818 + "rougeL": 0.1818181818181818, + "HScore": 0.5 } }, "human_annot": { @@ -149139,12 +149287,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -149203,33 +149345,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour les jeux jusqu' au IIIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour les jeux", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour les jeux jusqu' au IIIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "jeux", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "jeux jusqu' au IIIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour les jeux", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "pour les jeux", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 } }, "human_annot": { @@ -149251,12 +149400,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -149331,33 +149474,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pourquoi ne pas faire un autre de ces films? 
Le public est demandeur", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pourquoi ne pas faire un autre de ces films?", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Harrison m' a appelé un jour et m' a dit : Pourquoi ne pas faire un autre de ces films", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Harrison Ford, qui avait un moment hésité à faire ce quatrième volet,", - "rougeL": 0.14285714285714288 + "rougeL": 0.14285714285714288, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "pourquoi ne pas faire un autre de ces films ?", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "faire un autre de ces films", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "À propos de faire un autre de ces films", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 0.5 } }, "human_annot": { @@ -149403,12 +149553,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -149450,33 +149594,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pourquoi ne pas faire un autre de ces films", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pourquoi ne pas faire un autre de ces films?", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pourquoi ne pas faire un autre de ces films", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pourquoi ne pas faire un autre de ces films", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Pourquoi ne pas faire un autre de ces films ?", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pourquoi ne pas faire un autre de ces films ?", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pourquoi ne pas faire un autre de ces films ?", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -149486,12 +149637,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -149575,33 +149720,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Au Musée Archéologique de Santander.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -149629,12 +149781,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -149697,33 +149843,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Musée Archéologique de Santander", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Musée Archéologique de Santander.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -149763,12 +149916,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -149825,33 +149972,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ayahuasca qui a des effets hallucinogènes", - "rougeL": 0.5555555555555556 + "rougeL": 0.5555555555555556, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'ayahuasca", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -149879,12 +150033,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": 
"annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -149950,33 +150098,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ayahuasca", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'ayahuasca", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -150010,12 +150165,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -150081,33 +150230,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un hanchement prononcé et une grâce confinant à la mollesse et à l' effémination", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grâce confinant à la mollesse et à l' effémination", - "rougeL": 0.8421052631578948 + "rougeL": 0.8421052631578948, + "HScore": 0.55 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "hanchement prononcé et une grâce confinant à la mollesse et à l' effémination", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "Camembert_baseline": { "answer_pred": "un hanchement prononcé 
et une grâce confinant à la mollesse et à l' effémination,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "llama-2_lora": { "answer_pred": "un hanchement prononcé et une grâce confinant à la mollesse et à l' effémination", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "mixtral-8x7b": { "answer_pred": "un hanchement prononcé et une grâce confinant à la mollesse et à l' effémination", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "GPT-3.5": { "answer_pred": "un hanchement prononcé et une grâce confinant à la mollesse et à l' effémination", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 } }, "human_annot": { @@ -150321,48 +150477,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Partiellement correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -150537,33 +150651,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "filtre pour spéléologues", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "filtre pour spéléologues", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "filtre pour spéléologues", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "comme un filtre pour spéléologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { 
"answer_pred": "comme un filtre pour spéléologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "agit comme un filtre pour spéléologues : le passage est si étroit que seules des personnes minces peuvent s' y faufiler.", - "rougeL": 0.47619047619047616 + "rougeL": 0.47619047619047616, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Tight Spot agit comme un filtre pour spéléologues.", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { @@ -150609,12 +150730,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -150665,33 +150780,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plus de deux siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plus de deux siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus de deux siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "depuis plus de deux siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "depuis plus de deux siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de deux siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "depuis plus de deux siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -150821,48 +150943,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": 
"annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -151111,33 +151191,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plus de deux siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "depuis plus de deux siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus de deux siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "depuis plus de deux siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "depuis plus de deux siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de deux siècles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "depuis plus de deux siècles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -151183,12 +151270,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -151239,33 +151320,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1951", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1903", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1951", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 
0.0 }, "Camembert_baseline": { "answer_pred": "1903", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -151281,12 +151369,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -151361,33 +151443,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1951", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1903", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1908", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -151415,12 +151504,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -151489,33 +151572,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "production de stèles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "stèles", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "stèles", - "rougeL": 0.8571428571428571 + "rougeL": 
0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "stèle", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "400 av. J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "production de stèles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La production de stèles.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -151561,12 +151651,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -151611,33 +151695,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "autour de 400 av. J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "production de stèles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La production de stèles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "production de stèles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "400 av. 
J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la production de stèles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question \"Que commencent les Mayas ?\" est \"La production de stèles\"", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -151647,12 +151738,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -151739,33 +151824,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des influences culturelles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dépôts funéraires dans des tombes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des influences culturelles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "influences culturelles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "des influences culturelles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "influences de la culture de Qijia", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "l'influence culturelle depuis l'intérieur de la Chine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -151805,12 +151897,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -151864,33 +151950,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "9 
juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les forces hollandaises ont combattu en avril 1826", - "rougeL": 0.36363636363636365 + "rougeL": 0.36363636363636365, + "HScore": 0.5 } }, "human_annot": { @@ -151924,12 +152017,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -151988,33 +152075,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "9 juin", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "9 juin 1826", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 9 juin 1826.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -152036,12 +152130,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -152118,33 +152206,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les parties des deux autres talus", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -152184,12 +152279,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -152240,33 +152329,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les deux autres talus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "ses parties sont devenues invisibles", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": 
"certaines de leurs parties sont devenues invisibles", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -152294,12 +152390,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -152368,33 +152458,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "artistes d' Altamira", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les artistes d' Altamira", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les artistes d' Altamira", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les artistes d' Altamira", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les artistes d' Altamira", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "artistes d' Altamira", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les artistes d'Altamira.", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -152404,12 +152501,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -152496,33 +152587,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "certaines de leurs parties", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les parties des deux autres talus", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "certaines de leurs parties", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -152562,12 +152660,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -152636,33 +152728,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Assyrie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Urartéens", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Urartu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Urartéens.", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 } }, "human_annot": { @@ -152672,12 +152771,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -152765,33 +152858,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fin du XIXe siècle", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fin du XIXe siècle", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fin du XIXe siècle", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fin du XIXe siècle", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "avant le XIXe siècle", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "avant la fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "avant la fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -152813,12 +152913,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -152889,33 +152983,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "XIXe siècle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "XIXe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "XIXe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "XIXe siècle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "XIXe siècle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "XIXe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le XIXe siècle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -152955,12 +153056,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -153019,33 +153114,40 @@ } ], "predictions": 
{ - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cambridge et Venise.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -153085,12 +153187,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -153142,33 +153238,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Cambridge et Venise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cambridge et Venise.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -153202,12 +153305,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -153271,33 +153368,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "On quitte Bavay par la route de Maubeuge.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -153331,12 +153435,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -153393,33 +153491,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Bavay", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -153453,12 +153558,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -153521,33 +153620,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "victoire macédonienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mosaque d' Alexandre", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la mosaque d' Alexandre", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mosaïque d' Alexandre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une peinture plus ancienne, de la fin du IVe siècle", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la mosaïque d' Alexandre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mosaïque d' Alexandre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -153569,12 +153675,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -153647,33 +153747,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la mosaïque d' Alexandre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mosaque d' Alexandre", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la mosaque d' Alexandre", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "la mosaïque d' Alexandre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "fin du IVe siècle", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "mosaïque d' Alexandre", 
- "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mosaïque d' Alexandre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -153707,12 +153814,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -153779,33 +153880,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ouest et est", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vomitoires ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ouest et est", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ouest et est", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -153851,12 +153959,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -153901,33 +154003,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vomitoires ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": 
{ "answer_pred": "vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les vomitoires ouest et est", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -153949,12 +154058,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -154029,33 +154132,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "stèles retrouvées à Bab El Aïn", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "stèles", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les stèles retrouvées à Bab El An", - "rougeL": 0.823529411764706 + "rougeL": 0.823529411764706, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "stèles", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "stèles retrouvées à Bab El Aïn", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les stèles retrouvées à Bab El Aïn", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les stèles retrouvées à Bab El Aïn", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -154065,12 +154175,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -154159,33 +154263,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "En décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -154225,12 +154336,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -154283,33 +154388,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1929", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En décembre 1929", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -154331,12 +154443,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -154413,33 +154519,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dépôt", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le trésor de Vaise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le trésor de Vaise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le trésor de Vaise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un dépôt monétaire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le trésor de Vaise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le trésor de Vaise", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -154467,12 +154580,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -154543,33 +154650,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -154609,12 +154723,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -154667,33 +154775,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1999", 
- "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en 1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en 1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1999", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -154727,12 +154842,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -154797,33 +154906,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -154851,12 
+154967,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -154919,33 +155029,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Irhuleni, roi de Hama", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Irhuleni, roi de Hama, et Hadadézer, roi de Damas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -154985,12 +155102,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -155047,33 +155158,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "La plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "stèle datée des Basses - terres mayas", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"La plus ancienne stèle\".", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -155083,12 +155201,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -155166,42 +155278,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Basses - terres mayas", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la plus ancienne stèle datée des Basses - terres mayas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la plus ancienne stèle", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, 
"human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -155291,33 +155404,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "communauté juive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la communauté juive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la communauté juive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "juive", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "juive", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la communauté juive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la communauté juive", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -155357,12 +155477,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -155432,33 +155546,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "figures de terres cuites", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.95 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "figures de terres cuites", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.95 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "figures de terres cuites", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.95 }, "Camembert_baseline": { "answer_pred": "figures de terres cuites", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.95 }, "llama-2_lora": { "answer_pred": "des collections du Musée du Louvre et des musées des Facultés de Lettres de bordeaux, Lille, Lyon et Montpellier", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 0.1 }, "mixtral-8x7b": { "answer_pred": "figures de terres cuites", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.95 }, "GPT-3.5": { "answer_pred": "des figures de terres cuites", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.95 } }, "human_annot": { @@ -155672,48 +155793,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Partiellement correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -155886,42 +155965,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "qu' il fut saisi d' horreur et d' inquiétude pour la conservation de la grotte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "il fut saisi d' horreur et d' inquiétude pour la conservation de la grotte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "qu' il fut saisi d' horreur et d' inquiétude pour la conservation de la grotte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "qu' il fut saisi d' horreur et d' inquiétude pour la conservation de la grotte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "que la grotte a été contaminée par le traitement", - "rougeL": 0.20000000000000004 + "rougeL": 0.20000000000000004, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "qu' un produit toxique avait été appliqué sur la souche d' 
un frêne, au-dessus de la cavité ornée (sur celles de deux frênes en réalité)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "on comprend qu'il y a eu un problème avec la conservation de la grotte et que des produits toxiques ont été appliqués sur les arbres au-dessus de la cavité.", - "rougeL": 0.21052631578947364 + "rougeL": 0.21052631578947364, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -156014,33 +156094,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sultan Abdülhamid II", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le sultan Abdülhamid II", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sultan Abdülhamid II", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le sultan Abdülhamid II", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Abdülhamid II", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sultan Abdülhamid II", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le sultan Abdülhamid II", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -156068,12 +156155,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -156136,33 +156217,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Abdülhamid II", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Abdülhamid II", - "rougeL": 0.9411764705882353 + "rougeL": 
0.9411764705882353, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Abdülhamid II", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Abdülhamid II", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Abdülhamid II", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Abdülhamid II", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le sultan Abdülhamid II", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -156190,12 +156278,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -156276,33 +156358,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "officier de l' armée française", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Aimé Laussedat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Aimé Laussedat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Aimé Laussedat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Aimé Laussedat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Aimé Laussedat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Aimé Laussedat", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -156336,12 +156425,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -156406,33 +156489,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "extraction des pierres, dites « grandes oolithes blanches » des carrières de Norroy, près de Pont - à - Mousson", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "extraction des pierres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "extraction des pierres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "extraction des pierres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "extraire des pierres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "extraction des pierres, dites « grandes oolithes blanches » des carrières de Norroy, près de Pont - à - Mousson", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "extraire des pierres", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -156478,12 +156568,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -156528,33 +156612,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "extraction des pierres, dites « grandes oolithes blanches » des carrières de Norroy, près de Pont - à - Mousson", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "centurions, soldats, vexillaires et auxiliaires", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "centurions, soldats, vexillaires et auxiliaires qui furent affectés à l' extraction des pierres", - "rougeL": 0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "centurions, soldats, vexillaires et 
auxiliaires qui furent affectés à l' extraction des pierres,", - "rougeL": 0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "extraction des pierres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "extraction des pierres, dites « grandes oolithes blanches » des carrières de Norroy, près de Pont - à - Mousson", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'extraction des pierres, dites « grandes oolithes blanches » des carrières de Norroy, près de Pont - à - Mousson.", - "rougeL": 0.16666666666666666 + "rougeL": 0.16666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -156570,12 +156661,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -156656,33 +156741,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "leurs frères", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les femmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -156722,12 +156814,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -156800,33 +156886,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "temple d' État", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "temple d' État", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "temple d' État » et capitale", - "rougeL": 0.03389830508474576 + "rougeL": 0.03389830508474576, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "le plus grand des temples du complexe monumental d' Angkor au Cambodge.", - "rougeL": 0.11594202898550722 + "rougeL": 0.11594202898550722, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "« temple d' État » et capitale", - "rougeL": 0.06666666666666667 + "rougeL": 0.06666666666666667, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Angkor Vat", - "rougeL": 0.19672131147540983 + "rougeL": 0.19672131147540983, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Suryavarman II construit Angkor Vat au début du XIIe siècle.", - "rougeL": 0.19178082191780818 + "rougeL": 0.19178082191780818, + "HScore": 1.0 } }, "human_annot": { @@ -156842,12 +156935,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -156928,33 +157015,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "marchands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "marchands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "marchands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "marchands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "marchands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "marchands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "les marchands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -156988,12 +157082,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -157056,33 +157144,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en cabotant ou en longeant les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en cabotant ou en longeant les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "en cabotant ou en longeant les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en cabotant ou en longeant les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en cabotant ou en longeant les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en cabotant ou en longeant les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en cabotant ou en longeant les rives gelées reliant la Sibérie à l' Amérique.", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -157116,12 +157211,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -157179,33 +157268,40 @@ } ], "predictions": 
{ - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sibérie à l' Amérique", - "rougeL": 0.5882352941176471 + "rougeL": 0.5882352941176471, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Sibérie à l' Amérique", - "rougeL": 0.5882352941176471 + "rougeL": 0.5882352941176471, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "rives gelées reliant la Sibérie à l' Amérique", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les rives reliant la Sibérie à l'Amérique", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -157233,12 +157329,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -157320,33 +157410,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Talleyrand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Talleyrand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Talleyrand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Talleyrand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Talleyrand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "Talleyrand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Talleyrand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -157380,12 +157477,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -157448,33 +157539,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans la ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans la ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "dans la ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "à Orange", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans la ville", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Dans la ville.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -157496,12 +157594,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -157576,33 +157668,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à chauffer les maisons, d' autres pour la cuisine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sanitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "salle de réception principale portée par quatre pilastres qui ouvrait sur une grande cour autour de laquelle s' organisait l' édifice", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, 
"Camembert_baseline": { "answer_pred": "installations sanitaires", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sanitaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "destinés à chauffer les maisons, d' autres pour la cuisine, ainsi que des installations sanitaires (latrines, baignoires)", - "rougeL": 0.14285714285714288 + "rougeL": 0.14285714285714288, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "pour les installations sanitaires", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -157648,12 +157747,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -157704,33 +157797,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nouvel édifice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "amphithéâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "amphithéâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "amphithéâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "amphithéâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' amphithéâtre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'édifice nouveau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -157746,12 +157846,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -157832,33 +157926,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' espace occupé fut rendu aux Romains", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l'espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 0.8571428571428572 + "rougeL": 0.8571428571428572, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Après la mort de Néron, l'espace occupé fut rendu aux Romains et le Colisée fut édifié sur l'emplacement du lac.", - "rougeL": 0.6153846153846154 + "rougeL": 0.6153846153846154, + "HScore": 1.0 } }, "human_annot": { @@ -157880,12 +157981,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -157954,33 +158049,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' espace occupé fut rendu aux Romains", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l'espace occupé est rendu aux Romains et le Colisée est édifié sur l'emplacement du lac", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' espace occupé fut rendu aux Romains et le Colisée fut édifié sur l' emplacement du lac", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Après la mort de Néron, l'espace occupé fut rendu aux Romains et le Colisée fut édifié sur l'emplacement du lac.", - "rougeL": 0.6153846153846154 + "rougeL": 0.6153846153846154, + "HScore": 1.0 } }, "human_annot": { @@ -158014,12 +158116,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -158094,33 +158190,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Martel a identifié les interactions entre le niveau de la rivière Echo et celui de la rivière 
Green.", - "rougeL": 0.2666666666666667 + "rougeL": 0.2666666666666667, + "HScore": 0.0 } }, "human_annot": { @@ -158148,12 +158251,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -158229,33 +158326,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Martel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Martel a vérifié l'influence de la rivière à la surface avec celle à l'intérieur de la grotte.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -158289,12 +158393,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -158358,33 +158456,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Auguste", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -158424,12 +158529,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -158481,33 +158580,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Dion Cassius", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas de mention d'un professeur dans l'article.", - "rougeL": 0.14285714285714285 + "rougeL": 0.14285714285714285, + "HScore": 0.0 } }, "human_annot": { @@ -158547,12 +158653,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -158610,33 +158710,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sur le site", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sur le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sur le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -158670,12 +158777,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -158738,33 +158839,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "couteaux, marteaux, gouges, pinces, haches, burins, et même deux enclumes parmi les plus lourdes du monde antique (11 et 23,5 kg)", - "rougeL": 0.3448275862068966 + "rougeL": 0.3448275862068966, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "couteaux, marteaux, gouges, pinces, haches, burins", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "couteaux, marteaux, gouges, pinces, haches, burins, et même deux enclumes", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "deux enclumes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "des enclumes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "deux enclumes parmi les plus lourdes du monde antique", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "deux enclumes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -158780,12 +158888,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -158867,33 +158969,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "90 %", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "90 % des personnes infectées", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "90 % des personnes infectées", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "90 % des personnes infectées", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "90 % des personnes infectées", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Les villages", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la population", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -158933,12 +159042,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -158989,33 +159092,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "90 %", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "90 %", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "90 %", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "90 %", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "90 % des personnes infectées", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "90 %", - 
"rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "90 % des personnes infectées", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 } }, "human_annot": { @@ -159049,12 +159159,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -159129,33 +159233,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Julien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Julien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Julien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Julien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Julien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Julien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Julien qui y séjourne durant l'hiver 362/363.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -159183,12 +159294,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -159270,33 +159375,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sa tombe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tombes rupestres", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tombe EK2 : Ahmès Pen - Nekhbet", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 
}, "Camembert_baseline": { "answer_pred": "Tombe EK2 : Ahmès Pen - Nekhbet", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tombe EK3 : Pahéri.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Tombe EK2 : Ahmès Pen - Nekhbet", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Sa tombe contient des textes concernant l'histoire du Nouvel Empire, des pharaons Ahmôsis Ier et Thoutmôsis III.", - "rougeL": 0.08333333333333333 + "rougeL": 0.08333333333333333, + "HScore": 0.5 } }, "human_annot": { @@ -159318,12 +159430,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -159404,33 +159510,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur le versant sud de la colline qui domine la ville", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tombes rupestres", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tombe EK2 : Ahmès Pen - Nekhbet", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tombe EK2 : Ahmès Pen - Nekhbet", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tombe EK3 : Pahéri.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Tombe EK2 : Ahmès Pen - Nekhbet", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"Sa tombe contient des textes concernant 
l'histoire du Nouvel Empire, des pharaons Ahmôsis Ier et Thoutmôsis III.\"", - "rougeL": 0.07142857142857142 + "rougeL": 0.07142857142857142, + "HScore": 0.0 } }, "human_annot": { @@ -159470,12 +159583,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -159532,33 +159639,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "racheter l' ensemble du domaine de Mammoth Cave", - "rougeL": 0.5833333333333334 + "rougeL": 0.5833333333333334, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "racheter l' ensemble du domaine de Mammoth Cave, ainsi que Bishop et les autres esclaves de Franklin Gorin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de racheter l' ensemble du domaine de Mammoth Cave, ainsi que Bishop et les autres esclaves de Franklin Gorin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "racheter l' ensemble du domaine de Mammoth Cave,", - "rougeL": 0.5833333333333334 + "rougeL": 0.5833333333333334, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "ouvrir un hôpital pour tuberculeux dans la grotte", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "racheter l' ensemble du domaine de Mammoth Cave", - "rougeL": 0.5833333333333334 + "rougeL": 0.5833333333333334, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "racheter l' ensemble du domaine de Mammoth Cave", - "rougeL": 0.5833333333333334 + "rougeL": 0.5833333333333334, + "HScore": 0.5 } }, "human_annot": { @@ -159574,12 +159688,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -159660,33 +159768,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Al 
Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Al Nasir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -159720,12 +159835,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -159788,33 +159897,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les analyses chimiques de la fin du XIXe siècle", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "analyses chimiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les analyses chimiques de la fin du XIXe siècle", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "analyses chimiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "M. 
Daubrée, en 1881, notait : « Pour ramollir une roche aussi réfractaire que le granite, il a fallu une intention formelle, servie par des efforts habiles et prolongés … Il a fallu une surabondance, une sorte de luxe de chaleur … par suite d' un procédé ingénieux et puissant ».", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Les analyses chimiques de la fin du XIXe siècle", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les analyses chimiques.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -159830,12 +159946,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -159911,42 +160021,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les analyses chimiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "analyses chimiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les analyses chimiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "analyses chimiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "M. 
Daubrée, en 1881, notait : « Pour ramollir une roche aussi réfractaire que le granite, il a fallu une intention formelle, servie par des efforts habiles et prolongés … Il a fallu une surabondance, une sorte de luxe de chaleur … par suite d' un procédé ingénieux et puissant ».", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Les analyses chimiques de la fin du XIXe siècle permirent de comprendre", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les analyses chimiques de la fin du XIXe siècle permirent de comprendre que la soude, la potasse, le sel, l' argile", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -160040,33 +160151,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "2014", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "2014", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "2014", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "2014", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "2014", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "2014", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "2014", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -160106,12 +160224,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -160168,33 +160280,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le général fatimide Jawhar arrive à al-Mansuriya en 969.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -160222,12 +160341,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -160287,33 +160400,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en 973", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "969", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le général fatimide Jawhar atteint al-Mansuriya en 969.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, "human_annot": { @@ -160341,12 +160461,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -160406,33 +160520,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "consul", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "conservateur des antiquités grecques et romaines du British Museum", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -160460,12 +160581,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -160528,33 +160643,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "conservateur des antiquités grecques et romaines du British Museum", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -160594,12 +160716,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - 
"rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -160650,33 +160766,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "conservateur des antiquités grecques et romaines du British Museum", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "consul à Rome", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -160710,12 +160833,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -160778,42 +160895,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Géants, fils de Gaïa", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Géants", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Géants, fils de Gaa", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les Géants", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Géants, fils de Gaïa", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Géants, fils de Gaïa", - 
"rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Géants", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -160906,33 +161024,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Marseille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Marseille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Marseille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Marseille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Marseille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Marseille", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Marseille.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -160966,12 +161091,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -161037,33 +161156,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Olivier Jehasse", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Olivier Jehasse", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Olivier Jehasse", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Olivier Jehasse", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Olivier Jehasse", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Olivier Jehasse", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Olivier Jehasse.", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -161097,12 +161223,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -161166,33 +161286,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "19", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "130", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "19", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "vers 130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "En 19.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -161238,12 +161365,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -161289,33 +161410,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "130 - 136", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "En 19", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "vers 130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "vers 130 - 136", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -161361,12 +161489,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -161418,33 +161540,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aux fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "aux fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à construire des routes dans l' ouest de l' Empire ottoman", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "aux fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "aux fouilles de l'Héraion en 1861, aux travaux de construction des routes dans l'ouest de l'Empire ottoman, aux fouilles à Pergame en 1864.", - "rougeL": 0.23076923076923075 + "rougeL": 0.23076923076923075, + "HScore": 1.0 } }, "human_annot": { @@ -161460,12 +161589,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -161540,33 +161663,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Héraion", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "aux fouilles de l' Héraion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Héraion", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Héraion", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Samos, Héraion", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 } }, "human_annot": { @@ -161594,12 +161724,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -161668,33 +161792,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hélios, dieu du Soleil", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 } }, "human_annot": { @@ -161734,12 +161865,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -161790,33 +161915,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Grec", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les Grecs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -161844,12 +161976,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -161918,15 +162044,15 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "On pouvait boire aussi de l'ayahuasca qui a des effets hallucinogènes en affectant le système nerveux central", "rougeL": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "prêtres", "rougeL": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Incas", "rougeL": 0.0 }, @@ -161985,33 +162111,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "système nerveux central", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "prêtres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "Incas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "puissances surnaturelles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "les prêtres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "on pouvait", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "on pouvait boire aussi de l'ayahuasca", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -162027,12 +162160,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -162114,33 +162241,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "21 mai 1919", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le matin du mercredi 21 mai 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le matin du mercredi 21 mai 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mercredi 21 mai 1919", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le mercredi 21 mai 1919", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mercredi 21 mai 1919", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le matin du mercredi 21 mai 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -162186,12 +162320,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -162236,33 +162364,40 @@ } ], "predictions": { 
- "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mercredi 21 mai 1919", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matin du mercredi 21 mai 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matin du mercredi 21 mai 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mercredi 21 mai 1919", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le mercredi 21 mai 1919", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le matin du mercredi 21 mai 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le matin du mercredi 21 mai 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -162308,12 +162443,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -162364,33 +162493,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un prieur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un prieur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un prieur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "prieur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un prieur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un prieur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un prieur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-162418,12 +162554,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -162505,33 +162635,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -162565,12 +162702,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -162640,33 +162771,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Coiedii", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Coiedii", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Coiedii", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la famille des Coiedii", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -162688,12 +162826,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -162769,33 +162901,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fouilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fouilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fouilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des fouilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une expédition scientifique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "des fouilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des fouilles", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -162811,12 +162950,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -162897,33 +163030,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "descendants directs de Manco Capac", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "merveilleuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "merveilleuse", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "origine merveilleuse", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + 
"HScore": 0.5 }, "llama-2_lora": { "answer_pred": "démi-dieux fils du soleil", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "fils du soleil", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les souverains incas ont une origine \"merveilleuse\"", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 } }, "human_annot": { @@ -162939,12 +163079,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -163025,33 +163159,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouet el - Aryan", - "rougeL": 0.8181818181818182 + "rougeL": 0.8181818181818182, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouet el - Aryan", - "rougeL": 0.8181818181818182 + "rougeL": 0.8181818181818182, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à un kilomètre et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à un kilomètre et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + 
"HScore": 1.0 } }, "human_annot": { @@ -163067,12 +163208,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -163147,33 +163282,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un kilomètres et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouet el - Aryan", - "rougeL": 0.8181818181818182 + "rougeL": 0.8181818181818182, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouet el - Aryan", - "rougeL": 0.8181818181818182 + "rougeL": 0.8181818181818182, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à un kilomètre et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à un kilomètres et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à un kilomètre et demi au nord de la pyramide à tranches de Zaouïet el - Aryan", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -163213,12 +163355,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -163275,33 +163411,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "accumulation de vestiges de différentes époques", - "rougeL": 0.888888888888889 + 
"rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "accumulation de vestiges de différentes époques", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "accumulation de vestiges de différentes époques", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "accumulation de vestiges de différentes époques", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "accumulation de vestiges de différentes époques", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cette accumulation de vestiges de différentes époques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'accumulation de vestiges de différentes époques.", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -163347,12 +163490,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -163421,33 +163558,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son esprit", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son esprit", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son esprit", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "son esprit", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le corps de la jeune fille", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "son esprit", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": 
"l'esprit du sacrifié", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -163475,12 +163619,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -163549,33 +163687,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -163615,12 +163760,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -163674,33 +163813,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en 1586", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1586", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -163740,12 +163886,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -163817,33 +163957,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "modification du milieu de croissance", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -163871,12 +164018,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -163951,33 +164092,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "modifiant les patrons de croissance", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "modification du milieu de croissance", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les processus hydrogéomorphologiques.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -163993,12 +164141,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -164079,33 +164221,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "les différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la coiffure masculine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -164151,12 +164300,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -164201,33 +164344,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "différences sociales", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "d'une seule pièce", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "différences sociales", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les différences sociales s' expriment principalement au niveau de la qualité du tissu employé pour leur confection.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -164255,12 +164405,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -164329,42 +164473,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les raisons de cette évolution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les raisons de cette évolution, à laquelle on assiste pour la grande majorité des villes de la Gaule romaine pendant l' Antiquité tardive", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les raisons de cette évolution, à laquelle on assiste pour la grande majorité des villes de la Gaule romaine pendant l' Antiquité tardive", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "raisons de cette évolution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "raisons de cette évolution", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Les raisons de cette évolution.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -164458,42 +164603,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "aile gauche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "aile gauche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "aile gauche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "aile gauche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "aile gauche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' aile gauche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'aile gauche de la flotte de ce dernier", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -164586,33 +164732,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "septembre 1997", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "septembre 1997", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "septembre 1997", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en septembre 1997,", 
- "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en septembre 1997", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "septembre 1997", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en septembre 1997", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -164658,12 +164811,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -164714,42 +164861,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un confort", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "confort", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "système de drainage des eaux usées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un confort", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "un système de drainage des eaux usées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un confort probablement inventé par cette civilisation", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "un système de drainage des eaux usées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -164849,33 +164997,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { 
"answer_pred": "la frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Santo António de Tanna", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -164903,12 +165058,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -164983,33 +165132,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "escadre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la frégate", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -165037,12 +165193,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -165129,33 +165279,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "son mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un parti clandestin", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "son mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Erevan a créé son mémorial en souvenir du génocide.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -165195,12 +165352,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -165269,33 +165420,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mémorial", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mémorial", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "mémorial en souvenir du génocide", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "son mémorial", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Erevan a construit son mémorial en souvenir du génocide.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 } }, "human_annot": { @@ -165311,12 +165469,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement 
correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -165397,33 +165549,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les caractères germaniques des statuts grecques et romaines.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -165439,12 +165598,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -165525,33 +165678,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tous les trois ans", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous les trois ans depuis 1953", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous les trois ans depuis 1953", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1953", - "rougeL": 0.33333333333333337 + "rougeL": 
0.33333333333333337, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "tous les trois ans depuis 1953", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "tous les trois ans depuis 1953", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "tous les trois ans depuis 1953", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -165573,12 +165733,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -165648,33 +165802,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tous les trois ans", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous les trois ans depuis 1953", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous les trois ans depuis 1953", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tous les trois", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "trois ans", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "tous les trois ans", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "tous les trois ans", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -165720,12 +165881,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -165777,33 +165932,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { 
+ "T5-large": { "answer_pred": "la peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' expression de la peur", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'expression de la peur", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -165819,12 +165981,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -165899,33 +166055,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' expression de la peur", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "peur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'expression de la peur", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -165965,12 +166128,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -166039,33 +166196,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "le site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -166105,12 +166269,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -166170,33 +166328,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le site archéologique de Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Briga intemporelle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Briga", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le site archéologique de Briga.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -166236,12 +166401,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - 
"annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -166295,33 +166454,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Kenya", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en Afrique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Afrique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Tanzanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Kenya (KE)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "KE - 1", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Kenya (KE) : • _ KE - 1 – Lac Turkana", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -166355,12 +166521,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -166417,42 +166577,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "australopithèques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "hominines", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "hominines", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "hominines", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "sable et roches", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Au fond du lac Turkana, on trouve les principaux sites de découverte 
d'hominines, notamment des australopithèques.", - "rougeL": 0.32 + "rougeL": 0.32, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -166575,33 +166736,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Léon de Beylié", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Léon de Beylié", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Léon de Beylié", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Léon de Beylié", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "capitaine Léon de Beylié", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Léon de Beylié", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Léon de Beylié", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -166635,12 +166803,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -166715,33 +166877,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "âge du cuivre ou Chalcolithique et précède l' âge du fer", - "rougeL": 0.1818181818181818 + "rougeL": 0.1818181818181818, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' Amérique latine où les civilisations précolombiennes connurent une métallurgie de l' or et du cuivre jusqu' à la conquête espagnole", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "âge du bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "métallurgie de l' 
or et du cuivre jusqu' à la conquête espagnole.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l' or et du cuivre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' âge du bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la période de l'âge du bronze", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -166757,12 +166926,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -166844,33 +167007,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "moellons de petit appareil et les terres cuites (tubuli, fragments de briques ou de tuiles)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 } }, "human_annot": { @@ -166880,12 
+167050,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -166966,33 +167130,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "moellons de petit appareil et les terres cuites (tubuli, fragments de briques ou de tuiles)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "moellons de petit appareil et les terres cuites (tubuli, fragments de briques ou de tuiles)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "moellons de petit appareil et les terres cuites (tubuli, fragments de briques ou de tuiles)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les moellons de petit appareil et les terres cuites", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 } }, "human_annot": { @@ -167032,12 +167203,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -167106,33 +167271,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -167166,12 +167338,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -167240,33 +167406,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Dion", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Dion Cassius", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le récit de Dion", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -167306,12 +167479,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -167368,33 +167535,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -167434,12 +167608,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -167491,33 +167659,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les anciens rois de la région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -167533,12 +167708,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], 
"llama-2_lora": [ { "annot": "annot_9", @@ -167620,33 +167789,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -167680,12 +167856,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -167742,33 +167912,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "En 1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "En 1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1876", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -167808,12 +167985,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -167888,33 
+168059,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chauves-souris", - "rougeL": 0.1739130434782609 + "rougeL": 0.1739130434782609, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chauves-souris", - "rougeL": 0.1739130434782609 + "rougeL": 0.1739130434782609, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chauves-souris", - "rougeL": 0.1739130434782609 + "rougeL": 0.1739130434782609, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chauves-souris", - "rougeL": 0.1739130434782609 + "rougeL": 0.1739130434782609, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "9 ou 12 millions d' individus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "chauves-souris de l' Indiana, chauve-souris cendrée, petite chauve-souris brune, grande Chauve-souris brune et pipistrelle de l' Est", - "rougeL": 0.975609756097561 + "rougeL": 0.975609756097561, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ces espèces", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 } }, "human_annot": { @@ -167960,12 +168138,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -168016,33 +168188,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "février 1678", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "février 1678", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "février 1678", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "février 1678", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "février 1678", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "février 1678", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La construction du bateau débute à Bassein en février 1678.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -168070,12 +168249,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -168145,33 +168318,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "au départ des oursins", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au départ des oursins", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au départ des oursins", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "oursins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "oursins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "oursins", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des oursins.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -168193,12 +168373,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -168274,33 +168448,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux narguilés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "renfort pour la garnison du fort Jesus", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux narguilés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "deux narguilés", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "narguilés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "deux narguilés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les narguilés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 } }, "human_annot": { @@ -168322,12 +168503,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -168402,33 +168577,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans la Cité ou à ses abords immédiats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans la Cité ou à ses abords immédiats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans la Cité ou à ses abords immédiats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dans la Cité ou à ses abords immédiats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Cité ou à ses abords immédiats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans la Cité ou à ses abords immédiats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans la Cité ou à ses abords immédiats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -168468,12 +168650,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -168530,33 +168706,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grâce à une intervention militaire athénienne", - "rougeL": 0.923076923076923 + "rougeL": 
0.923076923076923, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grâce à une intervention militaire athénienne", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "grâce à une intervention militaire athénienne", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par une intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "grâce à une intervention militaire athénienne", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "grâce à une intervention militaire athénienne", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 } }, "human_annot": { @@ -168602,12 +168785,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -168652,33 +168829,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "militaire athénienne", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "intervention militaire 
athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une intervention militaire athénienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -168724,12 +168908,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -168780,33 +168958,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Attale Ier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Attale Ier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Attale Ier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' Attale Ier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "llama-2_lora": { "answer_pred": "Attale Ier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Attale Ier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Attale Ier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -168978,48 +169163,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -169232,42 +169375,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "routes romaines constituant 
un système défensif", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ancien réseau de routes romaines constituant un système défensif", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des techniques de reconnaissance aérienne permettront de retrouver le tracé d' un ancien réseau de routes romaines constituant un système défensif", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "ancien réseau de routes romaines", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "de la déclaration de guerre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "techniques de reconnaissance aérienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "d'un ancien réseau de routes romaines constituant un système défensif.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -169354,33 +169498,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ancien réseau de routes romaines constituant un système défensif", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ancien réseau de routes romaines constituant un système défensif", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Irak et la Jordanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "des techniques de reconnaissance aérienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l'Irak et la Jordanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 
0.5 }, "mixtral-8x7b": { "answer_pred": "T. E. Lawrence", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "il", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -169390,12 +169541,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -169488,33 +169633,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "objets très éloignés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "objets très éloignés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "arbrisseau, pas un buisson", - "rougeL": 0.2 + "rougeL": 0.2, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "des objets très éloignés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' horizon y est vaste et rien ne cache à la vue des objets très éloignés.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des objets très éloignés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les objets très éloignés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -169524,12 +169676,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -169629,33 +169775,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, 
+ "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -169695,12 +169848,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -169757,33 +169904,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur un gué", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur un gué", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur un gué", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sur un gué qui offre en son centre un îlot central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sur un gué qui offre en son centre un îlot central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sur un gué qui offre en son centre un îlot central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur un gué", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 } }, "human_annot": { @@ -169793,12 +169947,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -169885,33 +170033,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "partage de l' 
Empire", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "lors du partage de l' Empire à la suite de la mort du fils d' Alexandre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "partage de l' Empire à la suite de la mort du fils d' Alexandre", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "partage de l' Empire", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "après le partage de l' Empire à la suite de la mort du fils d' Alexandre", - "rougeL": 0.8888888888888888 + "rougeL": 0.8888888888888888, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "On citera par exemple Cassandre ou Ptolémée Ier.", - "rougeL": 0.09523809523809525 + "rougeL": 0.09523809523809525, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "lors du partage de l' Empire", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -169927,12 +170082,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -170013,33 +170162,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à Pincevent", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -170055,12 +170211,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -170135,33 +170285,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à Pincevent", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -170201,12 +170358,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -170275,33 +170426,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Gaulois", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -170335,12 +170493,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -170409,33 +170561,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Gaulois", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les Gaulois.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -170457,12 +170616,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -170549,33 +170702,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Chancas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Viracocha", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Chancas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Chancas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Chancas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": 
"les Chancas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Chancas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -170615,12 +170775,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -170691,33 +170845,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "hydrogéomorphologiques", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par la modification de son milieu de croissance (variation du niveau du sol, inclinaison ou ploiement de la tige).", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "modifiant les patrons de croissance (répartition, diversité spécifique)", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Les processus hydrogéomorphologiques.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -170751,12 +170912,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -170825,33 +170980,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "modifiant les patrons de croissance", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "modifiant les patrons de 
croissance", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "patrons de croissance", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "par la modification de son milieu de croissance (variation du niveau du sol, inclinaison ou ploiement de la tige).", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les processus hydrogéomorphologiques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -170861,12 +171023,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -170965,33 +171121,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le nom du village a donné son nom à la période qui a progressivement, au XXe siècle, trouvé ses limites historiques, géographiques et culturelles", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "limites historiques, géographiques et culturelles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le nom du village a donné son nom à la période qui a progressivement, au XXe siècle, trouvé ses limites historiques, géographiques et culturelles", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le nom du village a donné son nom", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le nom de la culture néolithique", - 
"rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "son nom", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le nom du village a donné son nom à la période.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -171013,12 +171176,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -171093,33 +171250,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Portugal", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vestiges d' enceintes vitrifiées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "enceintes vitrifiées", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "vestiges d' enceintes vitrifiées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Portugal", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "vestiges d' enceintes vitrifiées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Des vestiges d' enceintes vitrifiées ont été découverts en Europe du Sud.", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -171165,12 +171329,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -171215,42 +171373,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Portugal", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "enceintes vitrifiées", - "rougeL": 
0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "enceintes vitrifiées", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "enceintes vitrifiées", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Portugal", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "murs vitrifiés", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des vestiges d'enceintes vitrifiées", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -171343,33 +171502,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -171415,12 +171581,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -171465,33 +171625,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Hittites", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -171501,12 +171668,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -171593,33 +171754,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "radiocarbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "radiocarbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "radiocarbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "radiocarbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "radiocarbone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "datations radiocarbone", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les datations radiocarbone", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -171659,12 +171827,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -171721,33 +171883,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Grec", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Érétrie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la colonisation grecque", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -171793,12 +171962,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -171849,33 +172012,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans une tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans une tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une tour, une tombe, une stèle funéraire et le Grand autel de Pergame.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Pergame", - 
"rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "à Pergame", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 } }, "human_annot": { @@ -171921,12 +172091,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -171971,33 +172135,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tour, une tombe, une stèle funéraire et le Grand autel de Pergame.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tour, tombe, stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une tour, une tombe, une stèle funéraire et le Grand autel de Pergame", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -172013,12 +172184,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -172117,33 +172282,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "leurs terres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "leurs terres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "leurs terres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "leurs terres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les terres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "leurs terres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "leurs terres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -172183,12 +172355,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -172245,33 +172411,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' opinion que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, 
"mixtral-8x7b": { "answer_pred": "ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ce sont des bâtiments commerciaux", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 } }, "human_annot": { @@ -172287,12 +172460,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -172367,33 +172534,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' opinion que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' opinion que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' opinion que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' opinion que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "que ce sont des bâtiments commerciaux où les poteries et leur contenu changeaient de main est exprimé par plusieurs 
chercheurs.", - "rougeL": 0.8181818181818182 + "rougeL": 0.8181818181818182, + "HScore": 1.0 } }, "human_annot": { @@ -172403,12 +172577,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -172495,33 +172663,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hans Rienerth", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hans Rienerth", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hans Rienerth", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Hans Rienerth", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hans Rienerth", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hans Rienerth", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hans Rienerth", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -172555,12 +172730,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -172624,74 +172793,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Érétrie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.2222222222222222 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Chalcidique et en Occident", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les 
Grecs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la colonisation grecque", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5555555555555556 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_9", - "rating": "Partiellement correct" - }, - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -172996,20 +173134,6 @@ "rating": "Erreur inacceptable" } ], - "question": [ - { - "annot": "annot_5", - "rating": "Question douteuse" - }, - { - "annot": "annot_8", - "rating": "Question douteuse" - }, - { - "annot": "annot_10", - "rating": "Question douteuse" - } - ], "no_answer": [ { "annot": "annot_5", @@ -173058,42 +173182,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Chalcidique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Chalcidique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "grecque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Chalcidique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Érétrie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Chalcidique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, 
"human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -173210,33 +173335,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Tefnakht de Saïs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Tefnakht et ses troupes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Piyé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tefnakht", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Piyé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Piyé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Piyé enlève Memphis suite à de violents combats.", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 } }, "human_annot": { @@ -173264,12 +173396,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -173335,33 +173461,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "détruit les fortifications comme on l' exigeait et reprirent le combat avec l' aide des Romains", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "détruire les fortifications", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "détruire les fortifications", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "reddition de la population", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de détruire les fortifications", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "détruire les fortifications", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de détruire les fortifications", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -173389,12 +173522,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -173458,33 +173585,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fortifications", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les fortifications", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les fortifications", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les fortifications", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "fortifications", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les fortifications", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les fortifications", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -173530,12 +173664,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -173587,33 +173715,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Bernardino de Sahagún", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Bernardino de Sahagn", - "rougeL": 
0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Bernardino de Sahagn", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Bernardino de Sahagún", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bernardino de Sahagún", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bernardino de Sahagún", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Bernardino de Sahagún", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -173635,12 +173770,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -173717,33 +173846,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "comté d' Edmonson", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Mammoth Cave", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Edmonson", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Mammoth Cave", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le comté d' Edmonson", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les villes avoisinantes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le comté d'Edmonson", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -173783,12 +173919,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -173846,33 +173976,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Mammoth Cave", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Mammoth Cave", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Parc National de Mammoth Cave", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Mammoth Cave", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Parc National de Mammoth Cave", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Mammoth Cave", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le parc national de Mammoth Cave", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -173918,12 +174055,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -173993,33 +174124,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "expédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' expédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' expédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "L' expédition vers Khotan", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "expédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' expédition", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "L'expédition prend le départ de Bandipur au Cachemire à la fin du mois de mai 1900.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-174065,12 +174203,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -174122,33 +174254,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "si l' on a à faire à une cité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' agencement des bâtiments", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "si l' on a à faire à une cité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "si l' on a à faire à une cité,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "si l'on a à faire à une cité", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "si l' on a à faire à une cité", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "si l'on a affaire à une cité", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -174164,12 +174303,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -174250,33 +174383,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "inspiration platonicienne", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -174310,12 +174450,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -174372,33 +174506,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "platonicienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Platon", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Platon", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Platon", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -174408,12 +174549,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -174500,42 +174635,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "riz", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "riz", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "riz", - "rougeL": 0.6666666666666666 + "rougeL": 
0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "des traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "parfois des traces de riz", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la pâte de la céramique contient parfois des traces de riz", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -174622,33 +174758,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "riz", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des traces de riz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -174670,12 +174813,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -174750,33 +174887,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rien ne cache à la vue", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "rien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "rien", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "rien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' horizon", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "pas un arbrisseau, pas un buisson", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "rien ne cache", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -174822,12 +174966,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -174878,33 +175016,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dion Cassius", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dion Cassius", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "combat de cavalerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Dion Cassius", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "César", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "César.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -174926,12 +175071,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -175003,33 +175142,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "George", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "George", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + 
"FLAN-T5-large": { "answer_pred": "George", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Harrison Ford", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "George Lucas ?", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Steven Spielberg", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Steven Spielberg", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -175069,12 +175215,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -175131,33 +175271,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "datation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "datation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la datation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "datation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "datation donnée par la résonance de spin électronique", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "compatible avec la datation", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"la datation\".", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -175173,12 +175320,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -175253,33 +175394,40 
@@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "datation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "indice d' ancienneté en contradiction avec les dates absolues obtenues par la thermoluminescence", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "compatible avec la datation donnée", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "datation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "datation donnée par la biostratigraphie", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "datation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"donner la datation\".", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -175313,12 +175461,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -175393,33 +175535,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quitter Hattusa", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hattusa", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Samuha", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Gasgas", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Samuha", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le pouvoir royal est alors considérablement 
affaibli, proche de sa perte", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le pouvoir royal", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -175465,12 +175614,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -175525,33 +175668,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les cartes de base de pays entiers.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -175591,12 +175741,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -175647,33 +175791,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cartes de base de pays entiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cartes de base de pays entiers", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la photogrammétrie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la photographie aérienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un emploi de plus en plus systématique de la photogrammétrie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la photogrammétrie pour réaliser les cartes de base de pays entiers.", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { @@ -175701,12 +175852,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -175793,33 +175938,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Artémis", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la déesse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -175841,12 +175993,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -175921,33 +176067,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "argile", - "rougeL": 
0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "argile", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "argile", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "argile", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "d' une matière plus ou moins poreuse et réalisées dans une argile d' une dureté bien faible", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "argile", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "dans une argile d'une dureté bien faible", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 } }, "human_annot": { @@ -175981,12 +176134,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -176043,33 +176190,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "argile d' une dureté bien faible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans une argile d' une dureté bien faible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans une argile d' une dureté bien faible", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dans une argile", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "d' une matière plus ou moins poreuse et réalisées dans une argile d' une dureté bien faible.", - "rougeL": 0.625 + "rougeL": 0.625, 
+ "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "réalisées dans une argile d' une dureté bien faible", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans une argile d'une dureté bien faible", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -176085,12 +176239,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -176171,33 +176319,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "durant la Seconde Guerre mondiale", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tué durant la Seconde Guerre mondiale", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tué durant la Seconde Guerre mondiale", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "rivière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sa vie durant la Seconde Guerre mondiale", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la vie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la \"rivière perdue\"", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -176243,12 +176398,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -176300,33 +176449,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "de nombreux objets, éléments d' architecture et sculptures issus des fouilles archéologiques réalisées dans et autour du site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "de nombreux objets, éléments d' architecture et sculptures issus des fouilles archéologiques réalisées dans et autour du site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de nombreux objets, éléments d'architecture et sculptures issus des fouilles archéologiques réalisées dans et autour du site", - "rougeL": 0.8695652173913043 + "rougeL": 0.8695652173913043, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de nombreux objets, éléments d' architecture et sculptures issus des fouilles archéologiques réalisées dans et autour du site.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de nombreux objets, éléments d' architecture et sculptures issus des fouilles archéologiques réalisées dans et autour du site.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "objets, éléments d' architecture et sculptures issus des fouilles archéologiques réalisées dans et autour du site", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de nombreux objets, éléments d'architecture et sculptures issus des fouilles archéologiques", - "rougeL": 0.7000000000000001 + "rougeL": 0.7000000000000001, + "HScore": 1.0 } }, "human_annot": { @@ -176366,12 +176522,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -176428,42 +176578,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à partir d' actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à partir d' actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à partir d' actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à partir d' actes et de registres des titulaires de charges publiques de son époque.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à partir d'actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 0.7777777777777777 + "rougeL": 0.7777777777777777, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"bien connue à partir d'actes et de registres des titulaires de charges publiques de son époque\".", - "rougeL": 0.5833333333333334 + "rougeL": 0.5833333333333334, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -176551,33 +176702,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' actes et de registres des titulaires de charges publiques de son 
époque.", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à partir d'actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 0.7777777777777777 + "rougeL": 0.7777777777777777, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "actes et de registres des titulaires de charges publiques de son époque", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à partir d'actes et de registres des titulaires de charges publiques de son époque.", - "rougeL": 0.7777777777777777 + "rougeL": 0.7777777777777777, + "HScore": 1.0 } }, "human_annot": { @@ -176623,12 +176781,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -176680,33 +176832,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "effets personnels", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "du mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -176722,12 +176881,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -176802,33 +176955,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "officiers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "du mess des officiers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -176868,12 +177028,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -176942,33 +177096,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Al - Mansur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "al - Mansur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Al - Mansur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "al - Mansur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "al - Mansur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "al - Mansur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Al-Mansur", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -176990,12 +177151,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -177082,33 +177237,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pompéi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "art hellénistique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pompéi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -177142,12 +177304,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -177216,33 +177372,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pompéi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Pompéi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pompéi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le propriétaire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -177264,12 +177427,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -177344,33 +177501,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Même avant l' arrivée des Espagnols eux-mêmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Même avant l' arrivée des Espagnols eux-mêmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Même avant l' arrivée des Espagnols eux-mêmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "avant l' arrivée des Espagnols", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "avant l'arrivée des Espagnols eux-mêmes", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Même avant l' arrivée des Espagnols eux-mêmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "avant l'arrivée", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 0.0 } }, "human_annot": { @@ -177398,12 +177562,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -177472,33 +177630,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "lorsque les ossements auront été complètement dégagés de leur gangue de brèche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "lorsque les ossements auront été complètement dégagés de leur gangue de brèche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ossements auront été complètement dégagés de leur gangue de brèche", - "rougeL": 0.9473684210526316 + "rougeL": 
0.9473684210526316, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "lorsque les ossements auront été complètement dégagés de leur gangue de brèche", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "dégagés de leur gangue de brèche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "dégagés de leur gangue de brèche", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "une étude publiée en 2015", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -177532,12 +177697,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -177618,33 +177777,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Félix Voulot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Une mosaque de 232 m2 a été découverte en 1883 par Félix Voulot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Félix Voulot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Félix Voulot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Félix Voulot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "mosaïque", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mosaïque de Grand", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -177660,12 +177826,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -177748,33 +177908,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -177802,12 +177969,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -177870,33 +178031,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { 
"answer_pred": "observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des observateurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -177930,12 +178098,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -177998,33 +178160,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' Abbé Baudry, puis le Dr Marcel Baudouin", - "rougeL": 0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' Abbé Baudry, puis le Dr Marcel Baudouin", - "rougeL": 0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "puits à eau comblés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' Abbé Baudry, puis le Dr Marcel Baudouin", - "rougeL": 0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' Abbé Baudry", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "puits à eau comblés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "l'Abbé Baudry et le Dr Marcel Baudouin", - "rougeL": 0.8148148148148148 + "rougeL": 0.8148148148148148, + "HScore": 1.0 } }, "human_annot": { @@ -178046,12 +178215,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -178127,33 +178290,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Andrea Palladio et Antonia da Sangallo le Jeune.", - "rougeL": 0.896551724137931 + "rougeL": 0.896551724137931, + "HScore": 1.0 } }, "human_annot": { @@ -178193,12 +178363,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -178252,33 +178416,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des architectes comme Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.967741935483871 + "rougeL": 
0.967741935483871, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Andrea Palladio et Antonio da Sangallo le Jeune.", - "rougeL": 0.9655172413793104 + "rougeL": 0.9655172413793104, + "HScore": 1.0 } }, "human_annot": { @@ -178318,12 +178489,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -178395,33 +178560,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le peintre des Enfers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le peintre des Enfers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le peintre des Enfers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le peintre des Enfers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le peintre des Enfers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le peintre des Enfers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le peintre des Enfers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -178455,12 +178627,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -178523,33 +178689,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Kirghizes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": 
{ + "T5-large": { "answer_pred": "Kirghizes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Kirghizes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sarikol", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Kirghizes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Stein", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "l'expédition ou Ferdinand von Richthofen", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -178559,12 +178732,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -178651,33 +178818,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour suivre les cours de l' école d' état-major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "suivre les cours de l' école d' état-major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour suivre les cours de l' école d' état-major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pour suivre les cours de l' école d' état-major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour suivre les cours de l' école d' état-major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour suivre les cours de l' école d' état-major", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour suivre les cours de l'école d'état-major", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -178705,12 +178879,6 @@ "rating": "Correct" } 
], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -178773,33 +178941,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "école d' état-major", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les cours de l' école d' état-major", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' école d' état-major", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les cours de l' école d' état-major", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "école d' état-major", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les cours de l' école d' état-major", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les cours de l'école d'état-major", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 } }, "human_annot": { @@ -178845,12 +179020,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -178913,42 +179082,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "peuple des Santons", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le nom d' origine de la ville", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les contemporains", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Mediolanum 
Santonum", - "rougeL": 0.39999999999999997 + "rougeL": 0.39999999999999997, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Mediolanum Santonum (Saintes)", - "rougeL": 0.3529411764705882 + "rougeL": 0.3529411764705882, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' oppidum", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Pons", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -179059,33 +179229,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Praxitèle jeune", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Artémis", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Artémis de Dresde", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la déesse (Artémis)", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -179125,12 +179302,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -179187,33 +179358,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "offrandes et sacrifice", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ayahuasca", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ayahuasca", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "feuilles de coca", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "ayahuasca qui a des effets hallucinogènes en affectant le système nerveux central.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ayahuasca", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de l'ayahuasca", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -179229,12 +179407,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -179315,33 +179487,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dion Cassius", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Dion Cassius", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Dion Cassius", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "combat de cavalerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "César", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Il", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "César", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -179357,12 +179536,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -179443,33 +179616,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur la Seine gelée", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur la Seine gelée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur la Seine gelée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la Seine gelée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "sur la Seine gelée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Seine gelée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "sur la Seine gelée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -179515,12 +179695,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -179571,33 +179745,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -179637,12 +179818,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -179693,33 +179868,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le 26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 26 octobre 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -179747,12 +179929,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -179821,33 +179997,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "au début du VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "début du VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au début du VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "première moitié du Ier siècle", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans la première moitié du Ier siècle", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "première moitié du Ier siècle", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le pont du Gard est utilisé dans la première moitié du Ier siècle.", - "rougeL": 0.18181818181818182 + "rougeL": 
0.18181818181818182, + "HScore": 1.0 } }, "human_annot": { @@ -179857,12 +180040,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -179943,33 +180120,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "VIe siècle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "VIe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "VIe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ier siècle", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Ve siècle", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Ier siècle", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "au début du VIe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -179979,12 +180163,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -180071,33 +180249,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le site", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Londres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "site", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il quitta donc le site", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -180107,12 +180292,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -180199,33 +180378,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Une autre motte féodale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Une autre motte féodale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Une autre motte féodale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "motte féodale", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une motte féodale", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une motte féodale", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "une autre motte féodale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -180265,12 +180451,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -180327,33 +180507,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "divers programmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "divers programmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "divers programmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "divers programmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "divers programmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "divers programmes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Par divers programmes.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -180381,12 +180568,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -180455,33 +180636,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bord d' un lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au bord d' un lac", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au bord d' un lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "au bord d' un lac", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sur un lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 0.962962962962963 + "rougeL": 0.962962962962963, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "au bord d' un lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au bord d'un lac", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.5 } }, "human_annot": { @@ -180515,12 +180703,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { 
"annot": "annot_8", @@ -180580,33 +180762,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 0.962962962962963 + "rougeL": 0.962962962962963, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 0.962962962962963 + "rougeL": 0.962962962962963, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 0.962962962962963 + "rougeL": 0.962962962962963, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "au bord d' un lac", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Mexico", - "rougeL": 0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "un lac qui occupait la plaine sur laquelle s' étend maintenant la banlieue de Mexico", - "rougeL": 0.962962962962963 + "rougeL": 0.962962962962963, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le lac", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 0.0 } }, "human_annot": { @@ -180634,12 +180823,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -180711,33 +180894,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, "llama-2_lora": { "answer_pred": "1961", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1961.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -180765,12 +180955,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -180833,33 +181017,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1958", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -180887,12 +181078,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -181021,33 +181206,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Caligula", - 
"rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Il", - "rougeL": 0 + "rougeL": 0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -181093,12 +181285,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -181203,42 +181389,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "il décide de céder les biens de sa propre famille, et fait lui-même le boniment de la marchandise", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "il", - "rougeL": 0 + "rougeL": 0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Caligula", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -181343,33 +181530,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "panneau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "panneau de marbre avec la figure sculptée d' un géant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "panneau de marbre avec la figure sculptée d' un géant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un panneau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "panneau de marbre avec la figure sculptée d' un géant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "panneau de marbre avec la figure sculptée d' un géant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le panneau.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -181385,12 +181579,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -181477,33 +181665,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "panneau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "panneau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "panneau de marbre avec la figure sculptée d' un géant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "panneau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "panneau de marbre avec la figure sculptée d' un géant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "panneau de marbre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le panneau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -181549,12 +181744,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -181605,33 +181794,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -181671,12 +181867,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -181727,33 +181917,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nerveux central", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "nerveux central", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nerveux central", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "divination", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "système nerveux central", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": 
{ "answer_pred": "le système nerveux central.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -181793,12 +181990,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -181855,33 +182046,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les caractères germaniques des statuts grecques et romaines.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -181897,12 +182095,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -181977,33 +182169,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "germaniques des statuts grecques et romaines", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "caractères germaniques des statuts grecques et romaines", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "germaniques des statuts grecques et romaines", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "caractères germaniques", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les caractères germaniques des statuts grecques et romaines.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -182043,12 +182242,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -182105,33 +182298,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une configuration, à savoir celle d' un bateau tombe.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -182165,12 +182365,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - 
"annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -182228,33 +182422,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Basil Brown", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -182294,12 +182495,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -182357,33 +182552,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en 1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1976 durant la Révolution culturelle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -182411,12 +182613,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -182485,33 +182681,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1976 durant la Révolution culturelle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1976", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "1976 durant la Révolution culturelle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1976 and phase finale (v. 1250 - 1046)", - "rougeL": 0.15384615384615383 + "rougeL": 0.15384615384615383, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "en 1976 durant la Révolution culturelle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -182533,12 +182736,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -182613,33 +182810,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1924", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1924", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1924", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "(1854 - 1924),", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "1924.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1924", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "Manuel Luque meurt dans cette ville.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -182661,12 +182865,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -182737,42 +182935,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1854", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1854 - 1924", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1854 - 1924", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "(1854 - 1924),", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "1924.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1924", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas d'information dans l'article sur le moment où Manuel Luque meurt dans cette ville.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -182879,33 +183078,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pendant les trois ans qu' il passe dans ce pays, il s' adonne surtout à des recherches sur les antiquités et sur les anciennes doctrines religieuses du Moyen-Orient", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ce séjour", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La mission Gardanne est alors abandonnée", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "Camembert_baseline": { 
"answer_pred": "la protection de son oncle", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "des recherches sur les antiquités et sur les anciennes doctrines religieuses du Moyen-Orient", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "recherches sur les antiquités et sur les anciennes doctrines religieuses du Moyen-Orient", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le séjour dans le pays et les recherches sur les antiquités et les anciennes doctrines religieuses du Moyen-Orient.", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.5 } }, "human_annot": { @@ -182945,12 +183151,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -183013,33 +183213,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pendant les trois ans qu' il passe dans ce pays, il s' adonne surtout à des recherches sur les antiquités et sur les anciennes doctrines religieuses du Moyen-Orient", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ce séjour", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La mission Gardanne est alors abandonnée", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "ce séjour décide de la vocation scientifique du jeune Lajard. 
Pendant les trois ans qu' il passe dans ce pays,", - "rougeL": 0.14285714285714288 + "rougeL": 0.14285714285714288, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des recherches sur les antiquités et sur les anciennes doctrines religieuses du Moyen-Orient", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "séjour dans ce pays", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le séjour dans le pays", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 } }, "human_annot": { @@ -183055,12 +183262,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -183159,33 +183360,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son architecte personnel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son architecte personnel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son architecte personnel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "architecte personnel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "son architecte personnel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "son architecte personnel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Son architecte personnel", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -183207,12 +183415,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -183287,33 +183489,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "une belle mosaïque au labyrinthe", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une belle mosaque au labyrinthe", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une belle mosaque au labyrinthe", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une belle mosaïque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mosaïque au labyrinthe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une belle mosaïque au labyrinthe", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une belle mosaïque au labyrinthe", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -183359,12 +183568,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -183409,33 +183612,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mosaïque au labyrinthe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mosaque au labyrinthe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mosaque", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "mosaïque", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "mosaïque au labyrinthe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mosaïque au labyrinthe", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une belle mosaïque au labyrinthe", - "rougeL": 0.8 + 
"rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -183469,12 +183679,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -183537,33 +183741,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -183591,12 +183802,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -183659,33 +183864,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -183713,12 +183925,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -183781,33 +183987,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 25 février 1917", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -183847,12 +184060,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -183909,33 +184116,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "2003,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "2003 : découverte des ponts antiques", - "rougeL": 0.33333333333333337 + "rougeL": 
0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "2003 : découverte des ponts antiques.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -183957,12 +184171,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -184032,33 +184240,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "2003,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "2003", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "2003 : découverte des ponts antiques", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -184080,12 +184295,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -184161,33 +184370,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Cet homme vivait au milieu de forêts et de prairies, où il pratiquait chasse et pêche", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Homme de Zhoukoudian", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Homme de 
Zhoukoudian", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' Homme de Zhoukoudian", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Homo sapiens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' Homme de Zhoukoudian", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Cet homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -184203,12 +184419,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -184292,33 +184502,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "poignards en cuivre", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Plusieurs poignards en cuivre", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Plusieurs poignards en cuivre supposés d' origine Rinaldone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Plusieurs poignards en cuivre", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "poignards en cuivre", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "poignards en cuivre supposés d' origine Rinaldone", - "rougeL": 0.9523809523809523 + "rougeL": 0.9523809523809523, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Plusieurs poignards en cuivre supposés d'origine Rinaldone ont même été découverts en Suisse.", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 0.5 } }, "human_annot": { @@ -184352,12 +184569,6 @@ "rating": 
"Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -184414,33 +184625,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "poignards en cuivre", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "poignards en cuivre", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Plusieurs poignards en cuivre supposés d' origine Rinaldone", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "poignards en cuivre", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "poignards en cuivre", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "poignards en cuivre", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Plusieurs poignards en cuivre", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -184480,12 +184698,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -184560,33 +184772,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux plongeurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Santo António de Tanna", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux plongeurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Santo António de Tanna,", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la Santo António de Tanna", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la Santo António de Tanna, sous le commandement du capitaine Domingos Pereira de Gusman", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La Santo António de Tanna", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -184602,12 +184821,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -184696,33 +184909,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -184762,12 
+184982,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -184818,33 +185032,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "forestier", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "forestier", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "forestier", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "forestier mais aussi de clairières", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un mélange de couvert forestier", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -184872,12 +185093,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -184946,33 +185161,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "professeur de sa discipline au collège de France où il est titulaire de la chaire de Paléoanthropologie et de préhistoire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "professeur de sa discipline au collège de France où il est titulaire de la chaire de Paléoanthropologie et de préhistoire", - 
"rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il est directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "professeur de sa discipline au collège de France", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "professeur de sa discipline", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -185018,12 +185240,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -185068,33 +185284,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "collège de France", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -185128,12 +185351,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -185190,33 +185407,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "musées", - 
"rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "musées", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "différents musées", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "musées", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de différents musées", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "différents musées", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de différents musées", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -185226,12 +185450,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -185318,33 +185536,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un autre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un autre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un autre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "un autre, sachant écrire,", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un autre, sachant écrire", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un autre, sachant écrire", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un autre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -185384,12 +185609,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -185448,33 +185667,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "installation fixe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fixe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fixe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "installation fixe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "fixe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "fixe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le four devient une installation fixe.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -185490,12 +185716,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -185576,33 +185796,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "deux autres cercles un peu 
irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Il existe à l'extérieur du cercle de sarsen deux autres cercles un peu irréguliers", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 } }, "human_annot": { @@ -185636,12 +185863,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -185700,33 +185921,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "deux autres cercles un peu irréguliers de chacun 30 grandes cavités", - "rougeL": 0.7058823529411764 + "rougeL": 0.7058823529411764, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "deux autres cercles", - 
"rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -185766,12 +185994,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -185830,33 +186052,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "écrivent des choses sur ce peuple", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "écrivent des choses sur ce peuple", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "choses sur ce peuple : cette période de transition « entre » la Préhistoire et l' Histoire s' appelle la Protohistoire", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "écrivent des choses sur ce peuple", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Qu'un peuple ne connaisse pas l'écriture, mais qu'un autre, sachant écrire, écrivent des choses sur ce peuple.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "écrivent des choses sur ce peuple", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 } }, "human_annot": { @@ -185872,12 +186101,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -185970,33 +186193,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois chasseurs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.2 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 0.8 }, "Camembert_baseline": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, "llama-2_lora": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, "mixtral-8x7b": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, "GPT-3.5": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 } }, "human_annot": { @@ -186168,48 +186398,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -186423,33 +186611,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "randonneurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "randonneurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "randonneurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des randonneurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des randonneurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des randonneurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des randonneurs allemands", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -186465,12 +186660,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -186551,33 +186740,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fin de l' âge de la pierre vers 6000 av. J.-C.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fin de l' âge de la pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à la fin de l' âge de la pierre vers 6000 av. J.-C.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fin de l' âge de la pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l'âge de la pierre vers 6000 av. J.-C.", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la fin de l'âge de la pierre", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la fin de l' âge de la pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -186623,12 +186819,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -186679,33 +186869,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "Camembert_baseline": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les hommes de l'Aurignacien.", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -186745,12 +186942,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -186802,33 +186993,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Aurignacien", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les hommes de l'Aurignacien", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -186868,12 +187066,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -186931,33 +187123,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pléistocène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "pléistocène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pléistocène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pléistocène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1884", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "pléistocène (quaternaire)", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "du pléistocène (quaternaire)", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -186967,12 +187166,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -187053,42 +187246,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pléistocène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "En 1884", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1884", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "pléistocène", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1884", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "dans les alluvions de la Somme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Il découvre dans les alluvions de la Somme des outils en silex qu'il date du pléistocène (quaternaire).", - "rougeL": 0.34782608695652173 + "rougeL": 0.34782608695652173, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur inacceptable" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_1", @@ -187193,33 +187387,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "site archéologiques situés en France", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sites archéologiques situés en France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "France", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "France", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "France, où on a pour la première fois identifié une « culture » particulière.", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "France", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en France", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -187259,12 +187460,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -187322,33 +187517,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à l' entrée des grottes et des cavernes.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": 
"à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à l'entrée des grottes et des cavernes", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -187382,12 +187584,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -187444,33 +187640,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grottes et des cavernes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grottes et des cavernes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des grottes et des cavernes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "grottes et des cavernes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "grottes et des cavernes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "grottes et des cavernes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "à l'entrée des grottes", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.5 } }, "human_annot": { @@ -187510,12 +187713,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -187572,33 +187769,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 
0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -187614,12 +187818,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -187700,33 +187898,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rumigny", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Édouard Piette est mort à Rumigny.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -187754,12 +187959,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -187824,33 +188023,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Rumigny", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -187884,12 +188090,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -187954,33 +188154,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + 
"rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le petit sac de cuir contient de petits outils de silex (grattoir, perçoir, lame pointue).", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -187990,12 +188197,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -188076,33 +188277,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "de petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -188142,12 +188350,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -188198,78 +188400,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.9 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "de petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.9 }, "llama-2_lora": { "answer_pred": "petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un nécessaire à faire du feu (amadou, silex) et des petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.35 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -188644,33 +188811,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de 
petits outils de silex", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -188698,12 +188872,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -188772,33 +188940,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sac de cuir", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un petit sac de cuir contenant de petits outils de silex (grattoir, perçoir, lame pointue).", - "rougeL": 0.3157894736842105 + "rougeL": 0.3157894736842105, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un petit sac de cuir contenant de petits outils de silex (grattoir, perçoir, lame pointue)", - "rougeL": 0.3157894736842105 + "rougeL": 0.3157894736842105, + "HScore": 1.0 } }, "human_annot": { @@ -188826,12 +189001,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -188900,33 +189069,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des grattoirs carénés qui servent à gratter les peaux, burins carénés et burins busqués", - "rougeL": 0.6923076923076924 + "rougeL": 0.6923076923076924, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "burins busqués", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -188948,12 +189124,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -189026,33 +189196,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": 
"burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -189086,12 +189263,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -189158,33 +189329,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "être humain ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à un être humain ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tzi tzi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "un être humain ancien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Ötzi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Ötzi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le nom a été donné à Ötzi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -189218,12 +189396,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -189304,33 +189476,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "arc", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "arc", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La réalisation de propulseur", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' arc", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' arc", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "arc", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'arc", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -189346,12 +189525,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -189433,33 +189606,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hommes de la Préhistoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hommes de la Préhistoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Hommes de la Préhistoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les Hommes de la Préhistoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Hommes de la Préhistoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hommes de la Préhistoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Hommes de la Préhistoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -189493,12 +189673,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -189562,33 +189736,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -189622,12 +189803,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -189684,33 +189859,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "30000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "30000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "30000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers 30000", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -189726,12 +189908,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -189812,33 +189988,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trois", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trois", - "rougeL": 0.5 + "rougeL": 0.5, + 
"HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "trois", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "trois grandes périodes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "trois", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "trois grandes périodes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -189866,12 +190049,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -189940,33 +190117,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la formation du calcaire et des fossiles du nord-est de la France,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -189982,12 +190166,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_8", @@ -190062,42 +190240,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "formation du calcaire et des fossiles du nord-est de la France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "calcaire et des fossiles du nord-est de la France,", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "géologie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la formation du calcaire et des fossiles", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le calcaire et les fossiles du nord-est de la France", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -190190,33 +190369,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' homme retrouvé à Combe - Capelle en Dorogne", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' homme retrouvé à Combe - Capelle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' homme retrouvé à Combe - Capelle en Dordogne", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' homme 
retrouvé à Combe - Capelle en Dordogne,", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "homme de Combe - Capelle", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' homme retrouvé à Combe - Capelle en Dordogne", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'homme retrouvé à Combe - Capelle en Dordogne", - "rougeL": 0.4285714285714285 + "rougeL": 0.4285714285714285, + "HScore": 1.0 } }, "human_annot": { @@ -190262,12 +190448,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -190312,33 +190492,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' homme retrouvé à Combe - Capelle en Dorogne", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "homme retrouvé à Combe - Capelle", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "homme retrouvé à Combe - Capelle en Dordogne", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "L' homme retrouvé à Combe - Capelle en Dordogne,", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l'homme retrouvé à Combe - Capelle en Dordogne", - "rougeL": 0.4285714285714285 + "rougeL": 0.4285714285714285, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' homme retrouvé à Combe - Capelle en Dordogne", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'homme retrouvé à Combe-Capelle en Dordogne.", - "rougeL": 0.4285714285714285 + "rougeL": 
0.4285714285714285, + "HScore": 1.0 } }, "human_annot": { @@ -190348,12 +190535,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -190440,33 +190621,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un mélange de couvert forestier mais aussi de clairières", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -190482,12 +190670,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -190568,33 +190750,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1901", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Denis Peyrony a découvert la grotte en 1901.", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 1.0 } }, "human_annot": { @@ -190628,12 +190817,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -190690,33 +190873,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1901. 
La grotte à été découverte en 1901 par Denis Peyrony, elle est classé monument historique l' année suivante.", - "rougeL": 0.125 + "rougeL": 0.125, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1901", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -190756,12 +190946,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -190818,33 +191002,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -190854,12 +191045,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -190946,33 +191131,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "culture", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.75 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une « culture » particulière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une « culture » particulière", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "Camembert_baseline": { "answer_pred": "outils de pierre ou d' os", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "une « culture » particulière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "mixtral-8x7b": { "answer_pred": "une « culture » particulière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 }, "GPT-3.5": { "answer_pred": "une « culture » particulière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 } }, "human_annot": { @@ -191102,48 +191294,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Partiellement correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -191398,33 +191548,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "décrivit les outils trouvés sur le site", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les outils trouvés", - 
"rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 } }, "human_annot": { @@ -191458,12 +191615,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -191520,33 +191671,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "décrivit les outils trouvés sur le site", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les outils trouvés", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -191568,12 +191726,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -191648,33 +191800,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la population", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la population", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la population", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "population", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la population", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la population", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la population", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -191708,12 +191867,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -191777,33 +191930,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plusieurs dizaines de kilomètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plusieurs dizaines de kilomètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plusieurs dizaines de kilomètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "plusieurs dizaines de kilomètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "de plusieurs dizaines de kilomètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "quelquefois de plusieurs dizaines de kilomètres", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de plusieurs dizaines de kilomètres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -191849,12 +192009,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -191905,33 +192059,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les hommes de l'Aurignacien.", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -191959,12 +192120,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -192028,33 +192183,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Aurignacien", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les hommes de l' Aurignacien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les hommes de l'Aurignacien", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -192076,12 +192238,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -192157,33 +192313,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "on ne sait pas", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -192229,12 +192392,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -192300,33 +192457,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "guerriers professionnels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "guerriers professionnels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "guerriers professionnels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "guerriers professionnels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "guerriers professionnels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "guerriers professionnels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les guerriers professionnels", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -192360,12 +192524,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -192429,33 
+192587,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Australopithecus afarensis", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Lucy", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Australopithecus afarensis", - "rougeL": 0.6956521739130436 + "rougeL": 0.6956521739130436, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "d' australopithèque", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Lucy (australopithèque)", - "rougeL": 0.3 + "rougeL": 0.3, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "fossile de Lucy", - "rougeL": 0.1111111111111111 + "rougeL": 0.1111111111111111, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Lucy (australopithèque)", - "rougeL": 0.3 + "rougeL": 0.3, + "HScore": 1.0 } }, "human_annot": { @@ -192471,12 +192636,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -192557,33 +192716,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "magdalénien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "magdalénien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "magdalénien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "du magdalénien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "magdalénien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "du magdalénien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "magdalénien", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -192623,12 +192789,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -192685,33 +192845,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le 5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le 5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le 5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 5 août 1868.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -192745,12 +192912,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -192809,33 +192970,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "5 août 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -192869,12 +193037,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -192939,33 +193101,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à l' entrée des grottes et des cavernes.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à l'entrée des grottes et des cavernes.", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -192999,12 +193168,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -193061,33 +193224,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' entrée des grottes et des cavernes.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à l' entrée des grottes et des cavernes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à l'entrée des grottes et des cavernes", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { @@ -193109,12 +193279,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -193189,33 +193353,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "conditions atmosphériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "conditions atmosphériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "conditions atmosphériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "conditions atmosphériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "par les conditions atmosphériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "conditions atmosphériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les conditions atmosphériques", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -193243,12 +193414,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -193317,33 +193482,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": 
"blocs de pierre de plusieurs dizaines de tonnes", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "blocs de pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "blocs de pierre de plusieurs dizaines de tonnes", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "blocs de pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "blocs de pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "blocs de pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "blocs de pierre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -193377,12 +193549,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -193446,33 +193612,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "rares", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "rares", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "rares", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "devenues rares", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "rares", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "devenues rares", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "rares", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -193482,12 +193655,6 @@ "rating": "Correct" } ], 
- "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -193574,33 +193741,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Tchad", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Tchad", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tchad", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tchad", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tchad", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans la même région, dans le but de découvrir d' autres fossiles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "au Tchad", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -193640,12 +193814,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -193703,33 +193871,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' Homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "L' Homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' Homme", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-193769,12 +193944,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -193843,33 +194012,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "La Vénus de Willendorf", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Faite de calcaire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La Vénus de Willendorf", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vénus de Willendorf La Vénus de Willendorf", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vénus de Willendorf", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "La Vénus de Willendorf", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La Vénus de Willendorf", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -193903,12 +194079,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -193971,42 +194141,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "entre -15500 et -13500 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "entre -15500 et -13500 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "entre -15500 et -13500 ans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "entre -15500 et -13500 ans,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "-15500 et -13500 ans", - "rougeL": 0.9090909090909091 + "rougeL": 
0.9090909090909091, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "-15500 et -13500 ans", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Les dates sont comprises entre -15500 et -13500 ans.", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -194100,33 +194271,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Andrewsarchus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Andrewsarchus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Andrewsarchus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Andrewsarchus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Andrewsarchus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Andrewsarchus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Andrewsarchus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -194340,48 +194518,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": 
"annot_1", @@ -194552,33 +194688,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "civilisation danubienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' agriculture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "civilisation danubienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -194612,12 +194755,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -194674,33 +194811,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "civilisation danubienne", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' agriculture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le Néolithique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -194740,12 +194884,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -194802,33 +194940,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "producteurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "producteurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "producteurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ils deviennent sédentaires et construisent des villages", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "producteurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "deviennent producteurs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ils deviennent sédentaires.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -194838,12 +194983,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -194930,33 +195069,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "avec un propulseur de taille moyenne (75 centimètres)", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "avec un propulseur de taille moyenne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "avec un propulseur de taille moyenne (75 centimètres)", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "avec un propulseur de taille moyenne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "avec un propulseur de taille moyenne (75 centimètres)", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "avec un propulseur de taille moyenne (75 centimètres)", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "envoyée avec un propulseur", - "rougeL": 0.6666666666666665 + "rougeL": 0.6666666666666665, + "HScore": 1.0 } }, "human_annot": { @@ -194990,12 +195136,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -195052,33 +195192,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "propulseur", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "propulseur", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "avec un propulseur de taille moyenne (75 centimètres)", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "projectile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "projectile de taille moyenne (75 centimètres)", - "rougeL": 0.4000000000000001 + "rougeL": 0.4000000000000001, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "propulseur", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "avec un propulseur de taille moyenne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -195100,12 +195247,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -195180,42 +195321,43 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ce jour -là", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "il y a 15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ce jour -là,", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "ce jour -là", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ce jour-là", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -195303,33 +195445,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Meruti est un garçon d' une dizaine d' années, vivant il y a 15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' histoire Meruti est un garçon d' une dizaine d' années, vivant il y a 15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "15000 ans", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Ce jour -là", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ce jour", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -195357,12 +195506,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -195432,33 +195575,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ses liens possibles avec l' astronomie ancienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour ses liens possibles avec l' astronomie ancienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ses liens possibles avec l' astronomie ancienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "liens possibles avec l' astronomie ancienne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "pour ses liens possibles avec l' astronomie ancienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "liens possibles avec l' astronomie ancienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "pour ses liens possibles avec l' astronomie ancienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -195504,12 +195654,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -195560,33 +195704,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la terre pour les murs, voire la pierre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la terre pour les murs", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "leurs maisons peuvent 
donc être plus solides et plus travaillées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "on y a trouvé de nombreux objets fabriqués par les hommes, et des restes des animaux qu' ils chassaient.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "pour les murs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Comme ils ne changent plus d' endroit, et que leurs techniques et connaissances sont plus développées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "pour les murs, voire la pierre.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -195626,12 +195777,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -195689,33 +195834,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grattoir, perçoir, lame pointue", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sac de cuir", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "cuir", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -195725,12 +195877,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -195811,33 +195957,40 @@ 
} ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sac de cuir", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "un petit sac de cuir contenant de petits outils de silex (grattoir, perçoir, lame pointue).", - "rougeL": 0.3157894736842105 + "rougeL": 0.3157894736842105, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -195847,12 +196000,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -195939,33 +196086,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Mission Paléoanthropologique Franco - Tchadienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Mission Paléoanthropologique Franco - Tchadienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Mission Paléoanthropologique Franco - Tchadienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Mission Paléoanthropologique Franco - Tchadienne (MPFT)", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Mission Paléoanthropologique Franco - Tchadienne (MPFT)", - "rougeL": 0.8750000000000001 + "rougeL": 
0.8750000000000001, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Mission Paléoanthropologique Franco - Tchadienne (MPFT)", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Mission Paléoanthropologique Franco - Tchadienne (MPFT)", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 } }, "human_annot": { @@ -195981,12 +196135,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -196067,33 +196215,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -196127,12 +196282,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -196189,33 +196338,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "-600", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "-600", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "-600", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "-600", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "-600", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "-600", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -196255,12 +196411,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -196317,33 +196467,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une hotte munie d' une armature formée d' une longue tige de noisetier, deux récipients en écorce de bouleau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "hotte munie d' une armature formée d' une longue tige de noisetier, deux récipients en écorce de bouleau, un petit sac comprenant un nécessaire à faire du feu", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "céréales, de cerf et de bouquetin.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "céréales, cerf et bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le dernier repas d' Ötzi se composait de céréales, de cerf et de bouquetin.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -196377,12 +196534,6 @@ 
"rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -196440,33 +196591,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "céréales, de cerf et de bouquetin.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "céréales, cerf et bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "céréales, cerf et bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -196512,12 +196670,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -196569,33 +196721,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' abée 
Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'abbé Breuil", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -196629,12 +196788,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -196691,33 +196844,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "l' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'abbé Breuil", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -196757,12 +196917,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -196819,33 +196973,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "autour de -2300", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "autour de -2300.", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "autour de -2300", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le squelette d'un homme de l'âge du bronze a été découvert dans le fossé extérieur et il est mort autour de -2300.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -196867,12 +197028,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -196941,33 +197096,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "-2300.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "-2300", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "autour de -2300.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -197007,12 +197169,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -197069,33 +197225,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des outils", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -197117,12 +197280,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -197197,33 +197354,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "âge de la pierre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "âge de la pierre vers 6000 av. J.-C", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "âge de la pierre vers 6000 av. J.-C.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' âge de la pierre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "6000 av. 
J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' âge de la pierre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l' âge de la pierre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 } }, "human_annot": { @@ -197251,12 +197415,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -197337,33 +197495,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "coquillages perforés, des objets sculptés en ronde bosse mais aussi des peintures", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fouillée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "coquillages perforés, des objets sculptés en ronde bosse mais aussi des peintures", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "coquillages perforés, des objets sculptés en ronde bosse mais aussi des peintures", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "site éponyme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "site éponyme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "La grotte d'Aurignac en Haute-Garonne a donné son nom à cette culture en 1906.", - "rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -197397,12 +197562,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -197465,33 +197624,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "or.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -197531,12 +197697,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -197587,33 +197747,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "l' origine principale du bronze", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' origine principale du bronze", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le bronze", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -197647,12 +197814,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -197715,33 +197876,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "La fragilité des poteries, surtout celles de terre séchée", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -197775,12 +197943,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -197839,33 +198001,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "poteries, surtout celles de terre séchée", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les poteries", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -197911,12 +198080,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ 
{ "annot": "annot_7", @@ -197969,33 +198132,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'abbé Breuil", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -198029,12 +198199,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -198091,33 +198255,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' abbé Breuil", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, 
"human_annot": { @@ -198163,12 +198334,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -198219,33 +198384,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Florès", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "île de Florès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "île de Florès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' île de Florès,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Florès", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "île de Florès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'île de Florès", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -198267,12 +198439,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -198341,33 +198507,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "île de Florès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fossiles de l' Homme de Florès", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "île de Florès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fossiles de l' Homme de Florès", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "grottes de Liang Bua", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { 
"answer_pred": "l' île de Florès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'Homme de Florès", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.0 } }, "human_annot": { @@ -198383,12 +198556,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -198469,33 +198636,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le projectile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "(une sagaie de 2 mètres de long)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "projectile (une sagaie de 2 mètres de long)", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -198517,12 +198691,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -198592,33 +198760,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projectile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le projectile", - "rougeL": 0.0 + "rougeL": 0.0, + 
"HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "(une sagaie de 2 mètres de long)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "projectile (une sagaie de 2 mètres de long)", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -198640,12 +198815,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -198721,33 +198890,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "par ces flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "par ces flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "par ces flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Plusieurs pointes de flèches", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "par des flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "par ces flèches", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "L'homme a été tué par des flèches.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -198787,12 +198963,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -198849,33 +199019,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "hommes", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -198909,12 +199086,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -198982,33 +199153,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les ossements", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ossements", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ossements", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Les ossements (de grande taille) des mammouths", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de la peau qui a disparu depuis", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "ossements (de grande taille) des mammouths", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les ossements des mammouths", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 } }, "human_annot": { @@ -199048,12 +199226,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { 
"annot": "annot_3", @@ -199105,33 +199277,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les ossements (de grande taille) des mammouths ont été utilisés pour construire l' armature des huttes", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.9 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ossements", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.7 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la peau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.15 }, "Camembert_baseline": { "answer_pred": "ossements", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.7 }, "llama-2_lora": { "answer_pred": "ossements", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.7 }, "mixtral-8x7b": { "answer_pred": "ossements (de grande taille) des mammouths", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les ossements (de grande taille) des mammouths", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -199345,48 +199524,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -199558,33 +199695,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Proche-Orient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Proche-Orient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Au Proche-Orient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Au Proche-Orient, elle commence vers 8000 av. J.-C.,", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers 8000 av. J.-C.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Au Proche-Orient", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Au Proche-Orient.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -199612,12 +199756,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -199698,33 +199836,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ces outils étaient probablement utilisés pour tuer des animaux, mais c' est très incertain", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ces outils étaient probablement utilisés pour tuer des animaux, mais c' est très incertain", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "couper, de piquer et de frapper", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "enlevant des éclats sur un bloc de pierre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "par des bifaces", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "c'est très incertain", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -199740,12 +199885,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - 
"rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -199833,33 +199972,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ces outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bifaces", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bifaces", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "coup de poing", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les bifaces", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -199893,12 +200039,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -199962,33 +200102,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -200022,12 +200169,6 @@ "rating": 
"Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -200088,33 +200229,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Yves Coppens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -200148,12 +200296,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -200220,33 +200362,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un petit sac de cuir contenant de petits outils de silex (grattoir, perçoir, lame pointue).", - "rougeL": 0.3157894736842105 + "rougeL": 0.3157894736842105, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un petit sac de cuir", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -200262,12 +200411,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -200342,33 +200485,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sac de cuir", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sac de cuir", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un petit sac de cuir contenant de petits outils de silex (grattoir, perçoir, lame pointue).", - "rougeL": 0.3157894736842105 + "rougeL": 0.3157894736842105, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans un petit sac de cuir", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -200408,12 +200558,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -200470,33 +200614,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projectile", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un projectile", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bouquetin", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "projectile", - "rougeL": 0.4 + 
"rougeL": 0.4, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "projectile plutôt long et lourd", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "projectile", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le propulseur", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -200512,12 +200663,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -200599,33 +200744,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "diriger des fouilles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "professeur de sa discipline au collège de France", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "il est nommé directeur de différents musées", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -200665,12 +200817,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -200721,33 +200867,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "diriger des fouilles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -200775,12 +200928,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -200849,42 +200996,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "professeur de sa discipline au collège de France où il est titulaire de la chaire de Paléoanthropologie et de préhistoire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "professeur de sa discipline au collège de France où il est titulaire de la chaire de Paléoanthropologie et de préhistoire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "professeur de sa discipline au collège de France", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "directeur de 
différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -200977,33 +201125,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "-600", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "vers -600", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -201031,12 +201186,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -201105,33 +201254,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "projectile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le projectile", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "(une sagaie de 2 mètres de long)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "projectile (une sagaie de 2 mètres de 
long)", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une sagaie de 2 mètres de long", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -201147,12 +201303,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -201234,33 +201384,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la première étude sur la grotte et recensa à peu près 200 peintures ou gravures", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "première étude sur la grotte", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "première étude sur la grotte", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la première étude sur la grotte et recensa à peu près 200 peintures ou gravures", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la première étude sur la grotte", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la première étude sur la grotte", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'abbé Breuil réalisa la première étude sur la grotte.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -201288,12 +201445,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -201362,33 +201513,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Moyen-Orient et de l' Asie du Sud-Est", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 0.8125 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "Moyen-Orient et de l' Asie du Sud-Est", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 0.8125 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Moyen-Orient et de l' Asie du Sud-Est", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 0.8125 }, "Camembert_baseline": { "answer_pred": "Moyen-Orient et de l' Asie du Sud-Est", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 0.8125 }, "llama-2_lora": { "answer_pred": "Moyen-Orient et de l'Asie du Sud-Est", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 0.8125 }, "mixtral-8x7b": { "answer_pred": "Moyen-Orient, Asie du Sud-Est, Europe, Asie, Afrique, Amériques", - "rougeL": 0.5714285714285714 + "rougeL": 0.5714285714285714, + "HScore": 0.8125 }, "GPT-3.5": { "answer_pred": "le Moyen-Orient et l'Asie du Sud-Est, l'Europe et le reste de l'Asie et de l'Afrique, les Amériques.", - "rougeL": 0.43478260869565216 + "rougeL": 0.43478260869565216, + "HScore": 0.875 } }, "human_annot": { @@ -201596,40 +201754,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Partiellement correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -201697,16 +201821,6 @@ "annot": "annot_10", "rating": "Correct" } - ], - "question": [ - { - "annot": "annot_6", - "rating": "Question douteuse" - }, - { - "annot": "annot_8", - "rating": "Question douteuse" - } ] }, "lu_in_question": false, @@ -201752,33 +201866,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "céréales, de cerf et de bouquetin.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "céréales, cerf et bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le dernier repas d'Ötzi se composait de céréales, de cerf et de bouquetin.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -201812,12 +201933,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -201875,33 +201990,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "céréales, de cerf et de bouquetin.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "céréales, cerf et bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "céréales, de cerf et de bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "céréales, cerf et bouquetin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -201923,12 +202045,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -202004,33 +202120,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des outils et savait utiliser le feu", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "des outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des outils", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -202052,12 +202175,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -202132,33 +202249,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "obsidienne", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "objets néolithiques en 
obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -202186,12 +202310,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -202255,33 +202373,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "néolithiques en obsidienne", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "néolithiques en obsidienne", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "néolithiques en obsidienne", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "objets néolithiques en obsidienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -202315,12 +202440,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -202384,33 +202503,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Europe centrale (civilisation danubienne)", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Europe centrale (civilisation danubienne)", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.0 } }, "human_annot": { @@ -202420,12 +202546,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -202506,33 +202626,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Europe centrale (civilisation danubienne)", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Europe centrale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -202548,12 +202675,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_1", @@ -202652,33 +202773,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "grottes ouvertes ou peu profondes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans le sol d' abris-sous-roche", - "rougeL": 0.20000000000000004 + "rougeL": 0.20000000000000004, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans le sol d' abris-sous-roche", - "rougeL": 0.20000000000000004 + "rougeL": 0.20000000000000004, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "grottes ouvertes ou peu profondes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "dans le sol d' abris-sous-roche", - "rougeL": 0.20000000000000004 + "rougeL": 0.20000000000000004, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "abris-sous-roche", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "dans des abris-sous-roche", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.5 } }, "human_annot": { @@ -202694,12 +202822,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -202780,33 +202902,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "polissage des pierres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "polissage des pierres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à la période", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "polissage des pierres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Néolithique, qui signifie « 
nouvelle pierre » en grec ancien.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "au polissage des pierres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la période", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -202852,12 +202981,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -202908,33 +203031,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des variations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des variations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des variations. Avant 15000 il était tempéré. Puis il redevient froid et sec, la végétation herbeuse se développe, favorisant les herbivores, l' antilope saga", - "rougeL": 0.08333333333333333 + "rougeL": 0.08333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des variations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "froid (il s' agit de la dernière période des glaciations).", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "des variations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des variations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -202974,12 +203104,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -203036,33 +203160,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une « culture » particulière", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "le Châtelperronien (relié à l' homme de Néanderthal), l' Aurignacien, le Gavettien, le Solutréen et le Magdalénien", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "culture", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cultures archéologiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une « culture » particulière", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Les noms des cultures archéologiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Châtelperronien, l'Aurignacien, le Gavettien, le Solutréen et le Magdalénien.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -203102,12 +203233,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -203169,33 +203294,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tranchant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un tranchant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un tranchant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un tranchant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un tranchant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un tranchant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un tranchant", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -203229,12 +203361,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - 
"annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -203297,33 +203423,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour la traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -203351,12 +203484,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -203420,33 +203547,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "traversée de 
la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la traversée de la Seine", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la traversée de la Seine.", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -203456,12 +203590,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -203549,33 +203677,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de 1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -203615,12 +203750,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -203671,33 +203800,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "de 1846 à 1868", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "de 1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "de 1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de 1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "de 1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de 1846 à 1868", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -203731,12 +203867,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -203799,42 +203929,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les anciens explorateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les anciens explorateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les anciens explorateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les anciens explorateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "anciens explorateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "anciens explorateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les anciens explorateurs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -203939,33 +204070,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "poids d' environ 50 kg", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -203999,12 +204137,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -204073,33 +204205,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "poids d' environ 50 kg", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Homo rudolfensis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Aucune séquence de mots dans l'article ne répond à la question posée.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -204127,12 +204266,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { 
"annot": "annot_5", @@ -204201,33 +204334,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -204261,12 +204401,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -204331,33 +204465,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "diriger des fouilles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "professeur de sa discipline au collège de France", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "il est nommé directeur de différents musées.", - 
"rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -204373,12 +204514,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -204453,33 +204588,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "directeur de différents musées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -204513,12 +204655,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -204581,33 +204717,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grattoirs carénés qui servent à gratter les peaux, burins carénés et burins busqués", - "rougeL": 0.6923076923076924 + "rougeL": 0.6923076923076924, + 
"HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -204641,12 +204784,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -204707,33 +204844,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grattoirs carénés qui servent à gratter les peaux, burins carénés et burins busqués", - "rougeL": 0.6923076923076924 + "rougeL": 0.6923076923076924, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "burins carénés et burins busqués", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -204761,12 +204905,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -204839,33 +204977,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "L' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Breuil", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' abée Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'abbé Breuil", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -204899,12 +205044,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -204967,33 +205106,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "outils en silex à côté d' os de mammifères", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils en silex à côté d' os de mammifères", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "outils en silex à côté d' os de mammifères", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "outils en silex à côté d' os de mammifères", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "outils en silex", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "outils en silex", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les outils en silex", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -205027,12 +205173,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -205091,33 +205231,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "outils en silex", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "outils en silex", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "outils en silex à côté d' os de mammifères", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.95 }, "Camembert_baseline": { "answer_pred": "outils en silex à côté d' os de mammifères", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.95 }, "llama-2_lora": { "answer_pred": "outils en silex", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "mixtral-8x7b": { "answer_pred": "l' homme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "des outils en silex", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.95 } }, "human_annot": { @@ -205205,48 +205352,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_3", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_4", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_6", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_8", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -205557,33 +205662,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "coup de poing", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.1 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ces outils étaient probablement utilisés pour tuer des animaux, mais c' est très incertain", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.45 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "couper, de piquer et de frapper", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.15 }, "Camembert_baseline": { "answer_pred": "On frappe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.1 }, "llama-2_lora": { "answer_pred": "par des animaux", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.05 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.2 }, "GPT-3.5": { "answer_pred": "c'est très incertain", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.45 } }, "human_annot": { @@ -205713,48 +205825,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_3", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_4", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Erreur inacceptable" - }, - { - "annot": "annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -206032,33 +206102,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "groupes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les groupes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - 
"FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les groupes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Les groupes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les groupes s' y installaient et y habitaient.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Les groupes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les groupes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -206086,12 +206163,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -206160,33 +206231,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans la même région", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au Tchad", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tchad", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Tchad", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Tchad", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans la même région, dans le but de découvrir d' autres fossiles", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "dans la même région (au Tchad)", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -206226,12 +206304,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -206285,33 +206357,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "habitation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "habitation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une habitation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "Camembert_baseline": { "answer_pred": "habitation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "llama-2_lora": { "answer_pred": "une habitation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "mixtral-8x7b": { "answer_pred": "une habitation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "GPT-3.5": { "answer_pred": "une habitation", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 } }, "human_annot": { @@ -206399,48 +206478,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Partiellement correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Partiellement correct" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -206737,33 +206774,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les archéologues", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les archéologues", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -206773,12 +206817,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -206877,33 +206915,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les hommes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -206937,12 +206982,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -207017,33 +207056,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Meruti", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -207077,12 +207123,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -207146,33 +207186,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "On trouve aussi des bisons, des aurochs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "On", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "On trouve aussi des bisons, des aurochs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "renne et le cheval. On trouve aussi des bisons, des aurochs. 
Le mammouth", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le mammouth disparait progressivement bien qu' il soit toujours représenté.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Les animaux les plus consommés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "On trouve aussi des bisons, des aurochs.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -207206,12 +207253,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -207274,33 +207315,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "village de Saint-Acheul", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "du village de Saint-Acheul", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "village de Saint-Acheul", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "village de Saint-Acheul, près d' Amiens", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Saint-Acheul, près d' Amiens (dans la Somme)", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "du village de Saint-Acheul, près d' Amiens (dans la Somme)", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le nom Acheuléen provient du village de Saint-Acheul, près d'Amiens (dans la Somme).", - "rougeL": 0.4761904761904762 + "rougeL": 0.4761904761904762, + "HScore": 1.0 } }, "human_annot": { @@ -207328,12 +207376,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -207396,33 +207438,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Saint-Acheul", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Saint-Acheul", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Saint-Acheul", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Saint-Acheul,", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Saint-Acheul", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Saint-Acheul", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Saint-Acheul", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -207456,12 +207505,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -207524,42 +207567,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "travaillant les hommes produisent leur alimentation végétale et abandonnent progressivement la cueillette", - "rougeL": 0.9473684210526316 + "rougeL": 0.9473684210526316, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la culture nait lorsque les hommes prennent le contrôle de la croissance de certaines espèces végétales", - "rougeL": 0.1904761904761905 + "rougeL": 0.1904761904761905, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "alimentation végétale et abandonnent progressivement la cueillette", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "La culture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 
}, "llama-2_lora": { "answer_pred": "la culture des plantes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "La culture des plantes commence lorsque en travaillant les hommes produisent leur alimentation végétale et abandonnent progressivement la cueillette.", - "rougeL": 0.8695652173913044 + "rougeL": 0.8695652173913044, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la production alimentaire végétale", - "rougeL": 0.15384615384615383 + "rougeL": 0.15384615384615383, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -207652,42 +207696,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' abbé Henri Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' abbé Henri Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -207771,33 +207816,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Henri Breuil", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' abbé Henri Breuil", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'abbé Henri Breuil", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -207807,12 +207859,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -207896,33 +207942,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la sève ne peut plus alimenter la partie supérieure de l' arbre qui sèche sur place et est brûlé", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la sève", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un engrais naturel", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "sève", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sève", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la sève", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la sève", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -207944,12 +207997,6 
@@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -208025,33 +208072,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Édouard Piette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -208085,12 +208139,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -208155,33 +208203,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "kangourous", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "kangourous", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "kangourous", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "kangourous", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "kangourous", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "kangourous", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "c'est le cas des kangourous.", - "rougeL": 0.0 + "rougeL": 
0.0, + "HScore": 1.0 } }, "human_annot": { @@ -208209,12 +208264,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -208284,33 +208333,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "abris-sous-roche", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 0.7 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sol d' abris-sous-roche", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sol d' abris-sous-roche", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "sol d' abris-sous-roche,", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.9 }, "llama-2_lora": { "answer_pred": "abris-sous-roche", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 0.7 }, "mixtral-8x7b": { "answer_pred": "sol d' abris-sous-roche", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.9 }, "GPT-3.5": { "answer_pred": "dans le sol d' abris-sous-roche", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.9 } }, "human_annot": { @@ -208398,48 +208454,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Partiellement correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Partiellement correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -208730,33 +208744,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "abris-sous-roche", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans le sol d' abris-sous-roche", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans le sol d' abris-sous-roche", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "abris-sous-roche,", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "abris-sous-roche", - "rougeL": 0.5333333333333333 + "rougeL": 0.5333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "le sol d' abris-sous-roche, c' est - à - dire des grottes ouvertes ou peu profondes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans le sol d'abris-sous-roche", - "rougeL": 0.4705882352941177 + "rougeL": 0.4705882352941177, + "HScore": 0.5 } }, "human_annot": { @@ -208796,12 +208817,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -208863,33 +208878,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Christian Lapie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Christian Lapie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Christian Lapie", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les architectes Maud Leforestier et Frédéric Blérot.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -208899,12 +208921,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -208982,33 +208998,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Maud Leforestier et Frédéric Blérot", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 } }, "human_annot": { @@ -209042,12 +209065,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -209111,33 +209128,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la première balle traversa le bord de la voiture et atteignit la duchesse de Hohenberg à l' abdomen", - "rougeL": 0.2222222222222222 + 
"rougeL": 0.2222222222222222, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux fois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la première balle traversa le bord de la voiture et atteignit la duchesse de Hohenberg à l' abdomen", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la première balle traversa le bord de la voiture et atteignit la duchesse de Hohenberg à l' abdomen. La seconde balle", - "rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "deux fois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "deux fois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Princip tire une deuxième balle.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 } }, "human_annot": { @@ -209147,12 +209171,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -209239,33 +209257,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Boroevi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Boroevi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Boroević", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -209299,12 +209324,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -209361,33 +209380,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Boroevi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Boroevi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Boroević", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -209421,12 +209447,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -209507,33 +209527,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Hitler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Hitler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Chef d' escadron de la flotte du Führer", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "faire croire aux soldats allemands et à la population civile qu' il arrivait immédiatement après la fin de la lutte.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Hitler", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Hitler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Hitler", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -209555,12 +209582,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -209635,33 +209656,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nuit du 24 septembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la nuit du 24 septembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -209671,12 +209699,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -209758,33 +209780,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dans la nuit du 24 septembre", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le 24 septembre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans la nuit du 24 septembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans la nuit du 24 septembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -209818,12 +209847,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -209899,33 +209922,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa volonté d' accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "accepter la protection italienne et même un prince italien comme souverain", - "rougeL": 0.7000000000000001 + "rougeL": 0.7000000000000001, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sa volonté d' accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sa volonté d' accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu'elle ne perd pas de territoire", - "rougeL": 0.8799999999999999 + "rougeL": 0.8799999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "exprima sa volonté d' accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire", - "rougeL": 
0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sa volonté d'accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu'elle ne perdait pas de territoire.", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -209935,12 +209965,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -210033,33 +210057,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire", - "rougeL": 0.9600000000000001 + "rougeL": 0.9600000000000001, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "accepter la protection italienne et même un prince italien comme souverain", - "rougeL": 0.7000000000000001 + "rougeL": 0.7000000000000001, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire", - "rougeL": 0.9600000000000001 + "rougeL": 0.9600000000000001, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sa volonté d' accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire", - "rougeL": 0.9600000000000001 + "rougeL": 0.9600000000000001, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "exprima sa volonté d' accepter la protection italienne et même un prince italien comme souverain aussi longtemps qu' elle ne perdait pas de territoire", 
- "rougeL": 0.9285714285714286 + "rougeL": 0.9285714285714286, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "accepter la protection italienne", - "rougeL": 0.375 + "rougeL": 0.375, + "HScore": 0.5 } }, "human_annot": { @@ -210105,12 +210136,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -210161,33 +210186,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le lieutenant Kompartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Kompartz", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lieutenant Kompartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "lieutenant Kompartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le lieutenant Kompartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le lieutenant Kompartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le lieutenant Kompartz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -210203,12 +210235,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -210293,33 +210319,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "eau potable", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "eau potable pendant les repas et en ne fumant pas", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "eau potable", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "l' eau potable", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "300 £ supplémentaires par an pour maintenir le niveau de vie d' un officier de cavalerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "eau potable", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de l'eau potable", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -210353,12 +210386,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -210434,33 +210461,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Allemands", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "des colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des soldats allemands", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -210500,12 +210534,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -210568,33 +210596,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "des colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { 
+ "T5-large": { "answer_pred": "des colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Allemands", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des colonnes de soldats allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les soldats allemands", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -210616,12 +210651,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -210696,33 +210725,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Première Guerre mondiale", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Première Guerre mondiale", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Première Guerre mondiale", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Première Guerre mondiale", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Première Guerre mondiale.", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Première Guerre mondiale", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Mitrofan Nejentsev participe à la Première Guerre mondiale.", - 
"rougeL": 0.13333333333333333 + "rougeL": 0.13333333333333333, + "HScore": 0.5 } }, "human_annot": { @@ -210762,12 +210798,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -210819,33 +210849,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Première Guerre mondiale et à la guerre civile russe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Première Guerre mondiale et à la guerre civile russe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Première Guerre mondiale et à la guerre civile russe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à la Première Guerre mondiale et à la guerre civile russe", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Première Guerre mondiale. Mitrofan Nejentsev. Mitrofan Osipovitch Nejentsev (russe : Митрофан Осипович Неженцев) est un colonel d' état-major russe né en 1886 et mort au combat le 12 avril 1918 près de Ekaterinodar. 
Il participa à la Première Guerre mondiale et à la guerre civile russe.", - "rougeL": 0.09375 + "rougeL": 0.09375, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Première Guerre mondiale", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Mitrofan Osipovitch Nejentsev participe à la Première Guerre mondiale et à la guerre civile russe.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -210873,12 +210910,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -210978,33 +211009,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Paul Prosper Henrys", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Paul Prosper Henrys", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Paul Prosper Henrys", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Paul Prosper Henrys", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Lunéville.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "il", - "rougeL": 0 + "rougeL": 0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Paul Prosper Henrys", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -211050,12 +211088,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -211107,33 +211139,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Karl Bodenschatz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Karl 
Bodenschatz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Karl Bodenschatz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Karl Bodenschatz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Karl Bodenschatz, officier d' ordonnance de Richthofen au Jagdgeschwaders 1", - "rougeL": 0.4827586206896552 + "rougeL": 0.4827586206896552, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Karl Bodenschatz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Karl Bodenschatz, officier d'ordonnance de Richthofen au Jagdgeschwaders 1", - "rougeL": 0.4666666666666667 + "rougeL": 0.4666666666666667, + "HScore": 1.0 } }, "human_annot": { @@ -211173,12 +211212,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -211260,33 +211293,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Fabian Ware", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Fabian Ware", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Fabian Ware", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Fabian Ware", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Fabian Ware", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Fabian Ware", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Fabian Ware", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -211320,12 +211360,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": 
"annot_4", @@ -211388,33 +211422,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Éamon de Valera du Sinn Fein", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Éamon de Valera", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Éamon de Valera du Sinn Fein", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Éamon de Valera", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Éamon de Valera", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Éamon de Valera", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Éamon de Valera du Sinn Fein.", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { @@ -211430,12 +211471,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -211517,33 +211552,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "harceler les tranchées voisines, pour couper les barbelés et détruire les bunkers adverses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "harceler les tranchées voisines, pour couper les barbelés et détruire les bunkers adverses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à la verticale", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "harceler les tranchées voisines, pour couper les barbelés et détruire les bunkers adverses.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "harceler les tranchées voisines, pour couper les barbelés et détruire les bunkers adverses.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "harceler les tranchées voisines, pour couper les barbelés et détruire les bunkers adverses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "harceler les tranchées voisines, couper les barbelés, détruire les bunkers adverses", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -211571,12 +211613,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -211660,33 +211696,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "60e division d' infanterie", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "21e division d' infanterie", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "21e division d' infanterie", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "21e division d' infanterie", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "21e division d' infanterie", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "21e division d' infanterie", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la 21e division d'infanterie", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 } }, "human_annot": { @@ -211714,12 +211757,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" 
- } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -211788,33 +211825,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -211848,12 +211892,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -211910,33 +211948,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Heurteaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -212150,48 +212195,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": 
"Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -212374,33 +212377,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une arme psychologique, destinée à terroriser les Parisiens, les désordres et les manifestations ainsi suscités poussant le gouvernement français à demander un armistice", - "rougeL": 0.08 + "rougeL": 0.08, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "solide haubanage partant d' un mât central", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "obus de 210 mm chemisés entre 210 et 240 mm pour s' adapter à l' usure du tube provoquée par l' effet d' arrachement des munitions", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "canons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "mont de Joie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les canons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les canons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -212410,12 +212420,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -212508,33 +212512,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Woditschka", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Woditschka", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Woditschka", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Woditschka", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Chvéïk et Woditschka", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Chvéïk et Woditschka", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Chvéïk et Woditschka", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -212544,12 +212555,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -212690,33 +212695,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "lieutenant de Villaret", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le lieutenant de Villaret", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "lieutenant de Villaret", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le lieutenant de Villaret", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Villaret", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "le lieutenant de Villaret", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le lieutenant de Villaret", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -212726,12 +212738,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": 
"annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -212819,33 +212825,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "de nombreuses autres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "de nombreuses autres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trente-cinq personnes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "vénizélistes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "de nombreuses autres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "nombreuses autres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de nombreuses autres", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -212879,12 +212892,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -212947,33 +212954,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Konrad est nommé à la 1re brigade d'artillerie de Munich en août 1895.", - "rougeL": 
0.35294117647058826 + "rougeL": 0.35294117647058826, + "HScore": 1.0 } }, "human_annot": { @@ -213007,12 +213021,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -213070,33 +213078,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1895", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1895", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1895", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en août 1895", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -213124,12 +213139,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -213199,42 +213208,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "port de Vladivostok en Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vladivostok", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "port de Vladivostok", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vladivostok", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vladivostok", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 
}, "mixtral-8x7b": { "answer_pred": "Vladivostok en Sibérie", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "port de Vladivostok", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -213321,33 +213331,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "port de Vladivostok en Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "port de Vladivostok en Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "port de Vladivostok en Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vladivostok en Sibérie", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vladivostok en Sibérie", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vladivostok en Sibérie", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans le port de Vladivostok en Sibérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -213375,12 +213392,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -213449,33 +213460,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Normandie", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -213515,12 +213533,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -213571,33 +213583,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Normandie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -213625,12 +213644,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -213699,33 +213712,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Deverell", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Deverell", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Deverell", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "Deverell", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "llama-2_lora": { "answer_pred": "Deverell", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "mixtral-8x7b": { "answer_pred": "Deverell", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 }, "GPT-3.5": { "answer_pred": "Deverell", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.9 } }, "human_annot": { @@ -213939,48 +213959,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -214152,33 +214130,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Schlieffen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Schlieffen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Schlieffen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Schlieffen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Schlieffen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Schlieffen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Schlieffen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -214212,12 +214197,6 @@ 
"rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -214280,33 +214259,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en tant qu' organe principal des activistes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en tant qu' organe principal des activistes,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "comme organe principal des activistes", - "rougeL": 0.75 + "rougeL": 0.75, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "agissant en tant qu' organe principal des activistes", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Conseil de Flandre agit en tant qu'organe principal des activistes", - "rougeL": 0.46153846153846156 + "rougeL": 0.46153846153846156, + "HScore": 1.0 } }, "human_annot": { @@ -214346,12 +214332,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -214402,33 +214382,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": 
"organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en tant qu' organe principal des activistes,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "organe principal des activistes", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -214462,12 +214449,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -214542,42 +214523,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sophie", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sophie", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Sophie", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "François-Joseph", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "l'empereur François-Joseph", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' empereur François-Joseph", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L' empereur François-Joseph lui demande d' y réfléchir.", - "rougeL": 0.8750000000000001 + "rougeL": 0.8750000000000001, + "HScore": 1.0 } }, "human_annot": { - 
"llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -214676,42 +214658,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "François-Joseph", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "François-Joseph", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "François-Joseph", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "François-Joseph", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "François-Joseph", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "empereur François-Joseph", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'empereur François-Joseph", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -214804,33 +214787,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tout ou partie d' un document", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tout ou partie d' un document", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un document", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "tout ou partie d' un document", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "tout ou partie d' un document", - "rougeL": 0.5 + "rougeL": 0.5, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Cet article contient tout ou partie d' un document provenant du site La Vie rémoise.", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Aucune séquence de mots ne répond à la question posée car l'article ne mentionne pas ce qui provient du site La Vie rémoise.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -214876,12 +214866,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -214926,33 +214910,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "tout ou partie d' un document", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5555555555555556 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tout ou partie d' un document provenant du site La Vie rémoise", - "rougeL": 0.19999999999999998 + "rougeL": 0.19999999999999998, + "HScore": 0.7222222222222222 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un document provenant du site La Vie rémoise", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.6111111111111112 }, "Camembert_baseline": { "answer_pred": "tout ou partie d' un document", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5555555555555556 }, "llama-2_lora": { "answer_pred": "un document provenant du site La Vie rémoise", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.6111111111111112 }, "mixtral-8x7b": { "answer_pred": "document provenant du site La Vie rémoise", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.6111111111111112 }, "GPT-3.5": { "answer_pred": "Il n'y a pas dans l'article mentionné de séquence de mots qui répond à la question donnée.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.2222222222222222 } }, "human_annot": { @@ -215146,44 +215137,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - 
"rating": "Partiellement correct" - }, - { - "annot": "annot_2", - "rating": "Partiellement correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Partiellement correct" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_8", - "rating": "Partiellement correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -215297,12 +215250,6 @@ "annot": "annot_10", "rating": "Erreur inacceptable" } - ], - "question": [ - { - "annot": "annot_5", - "rating": "Question douteuse" - } ] }, "lu_in_question": false, @@ -215348,33 +215295,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "voisins ou amis musulmans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "voisins ou amis musulmans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des voisins ou amis musulmans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des voisins ou amis musulmans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des voisins ou amis musulmans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des voisins ou amis musulmans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des voisins ou amis musulmans", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -215414,12 +215368,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -215476,33 +215424,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "ils n' ont plus rien à manger mais réussissent toutefois à survivre en pêchant du poisson, en attrapant des oiseaux et en buvant de l' eau de pluie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Rickenbacker et son équipage", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Rickenbacker et son équipage", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Rickenbacker et son équipage", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Rickenbacker et son équipage", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Rickenbacker et son équipage", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Rickenbacker et son équipage.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -215530,12 +215485,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -215608,33 +215557,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ouest", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "course à la mer", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le camp allemand", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le commandement allemand", - "rougeL": 0.0 + "rougeL": 
0.0, + "HScore": 0.0 } }, "human_annot": { @@ -215650,12 +215606,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -215730,33 +215680,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le commandement allemand", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Chaque camp", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -215796,12 +215753,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -215858,33 +215809,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "près de Vlorë en Albanie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "près de Vlor en Albanie", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "près de Vlor en Albanie", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "près de Vlorë en Albanie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "près de Vlore en Albanie", - "rougeL": 0.7692307692307692 + "rougeL": 
0.7692307692307692, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "près de Vlorë en Albanie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Vlorë en Albanie", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 0.5 } }, "human_annot": { @@ -215900,12 +215858,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -215982,33 +215934,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vlorë", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vlor", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vlor", - "rougeL": 0.4444444444444445 + "rougeL": 0.4444444444444445, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Vlorë", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vlorë", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vlorë", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "près de Vlorë en Albanie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -216048,12 +216007,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -216112,33 +216065,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Au soir du 21 avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "soir du 21 avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Au soir du 21 avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { 
"answer_pred": "soir du 21 avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "au soir du 21 avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Au soir du 21 avril", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Au soir du 21 avril.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -216178,12 +216138,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -216243,42 +216197,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le 2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le 2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le 2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le 2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 2nd bataillon du major Jackson.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -216365,33 +216320,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "2nd bataillon", - "rougeL": 0.6 + "rougeL": 
0.6, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "2nd bataillon", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "2nd bataillon", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "2nd bataillon du major Jackson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 2nd bataillon", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 } }, "human_annot": { @@ -216407,12 +216369,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -216493,33 +216449,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le curé, l' abbé Gaillard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gaillard", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le curé, l' abbé Gaillard", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'abbé Gaillard", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -216547,12 +216510,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -216615,33 +216572,40 @@ } 
], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' abbé Gaillard", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'abbé Gaillard", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -216675,12 +216639,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -216743,33 +216701,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "flotte", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "flotte", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "flotte britannique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "flotte britannique", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "flotte aux origines modestes", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la flotte", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la flotte aux origines modestes", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 } }, "human_annot": { @@ -216785,12 +216750,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -216872,33 +216831,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -216938,12 +216904,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -216994,33 +216954,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"mixtral-8x7b": { "answer_pred": "Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à Zurich", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -217048,12 +217015,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -217122,33 +217083,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Allemagne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Allemagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Allemagne", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 1.0 } }, "human_annot": { @@ -217164,12 +217132,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -217251,33 +217213,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1922", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1922", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1922", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1922", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1922", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1922", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "datant de 1922", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -217305,12 +217274,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -217379,33 +217342,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "techniques de guérilla des Boers", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "techniques de guérilla des Boers", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "techniques de guérilla des Boers", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "techniques de guérilla des Boers", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pertes devant les techniques de guérilla des Boers", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "devant les techniques de guérilla des Boers", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les techniques de guérilla des Boers", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 1.0 } }, "human_annot": { @@ -217433,12 +217403,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -217501,33 +217465,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Boers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "Boers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Boers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Boers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Boers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des Boers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des Boers", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -217543,12 +217514,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -217629,33 +217594,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "turques et austro - hongroises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "turques et austro - hongroises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "turques et austro - hongroises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "turques et austro - hongroises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "turques et austro - hongroises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "turques et austro - hongroises", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "turques et austro-hongroises", - "rougeL": 0.7999999999999999 + "rougeL": 0.7999999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -217683,12 +217655,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -217757,33 +217723,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -217817,12 +217790,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -217885,33 +217852,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Sancerre", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -217951,12 +217925,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -218007,33 +217975,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Sancerre sur la Loire", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -218067,12 +218042,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -218135,33 +218104,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une certaine sympathie pour le parti nazi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une certaine sympathie pour le parti nazi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sympathie pour le parti nazi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une certaine sympathie pour le parti nazi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sympathie pour le parti nazi", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + 
"HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une certaine sympathie pour le parti nazi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le comte Czernin exprima une certaine sympathie pour le parti nazi.", - "rougeL": 0.625 + "rougeL": 0.625, + "HScore": 1.0 } }, "human_annot": { @@ -218195,12 +218171,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -218257,33 +218227,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "parti nazi, bien que son fils, Manfred, qui était resté avec sa mère en Angleterre fut un pilote de la RAF pendant la Seconde Guerre mondiale fut un pilote de la RAF pendant la Seconde Guerre mondial", - "rougeL": 0.20689655172413793 + "rougeL": 0.20689655172413793, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "nazi", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nazi", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "le parti nazi", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le parti nazi", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le parti nazi", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le parti nazi", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -218299,12 +218276,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -218385,33 +218356,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { 
"answer_pred": "tous ses buts de guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "tous ses buts de guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "tous ses buts de guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "tous ses buts de guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ses buts de guerre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "tous ses buts de guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La Bulgarie a maintenant réalisé tous ses buts de guerre.", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 1.0 } }, "human_annot": { @@ -218445,12 +218423,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -218525,33 +218497,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "États-Unis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "États-Unis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "États-Unis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "États-Unis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "États-Unis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "États-Unis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les États-Unis.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -218591,12 +218570,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": 
"annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -218655,33 +218628,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "il tente par deux fois d' entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "entrer à l' académie royale militaire", - "rougeL": 0.5882352941176471 + "rougeL": 0.5882352941176471, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Il n'arrive pas à entrer à l'académie royale militaire.", - "rougeL": 0.4210526315789474 + "rougeL": 0.4210526315789474, + "HScore": 0.5 } }, "human_annot": { @@ -218727,12 +218707,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -218777,33 +218751,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois fois à Sandhurst", - "rougeL": 0.7368421052631579 + "rougeL": 0.7368421052631579, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "deux fois d' entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, 
- "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "deux fois d' entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "deux fois d' entrer à l' académie royale militaire et trois fois à Sandhurst", - "rougeL": 0.923076923076923 + "rougeL": 0.923076923076923, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "trois fois à Sandhurst", - "rougeL": 0.7368421052631579 + "rougeL": 0.7368421052631579, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "entre à l' académie royale militaire et trois fois à Sandhurst sans succès", - "rougeL": 0.8461538461538461 + "rougeL": 0.8461538461538461, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "tente par deux fois d' entrer à l' académie royale militaire et trois fois à Sandhurst sans succès", - "rougeL": 0.8275862068965517 + "rougeL": 0.8275862068965517, + "HScore": 1.0 } }, "human_annot": { @@ -218849,12 +218830,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -218917,33 +218892,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "général Dubail", - "rougeL": 0.22222222222222224 + "rougeL": 0.22222222222222224, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "le IIe Armée française", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "GPT-3.5": { "answer_pred": "le général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -218977,12 +218959,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -219052,33 +219028,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "de Castelnau", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le général de Castelnau", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -219094,12 +219077,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -219181,33 +219158,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "400 kg", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plus de 400 kg", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -219241,12 +219225,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -219309,33 +219287,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une explosion anormale au centre de la galerie", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une bombe ait explosé au bas d' un escalier d' accès sinon dans la galerie elle-même", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "explosion anormale au centre de la galerie", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "explosion anormale", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "explosion anormale au centre de la galerie", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une explosion anormale", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la destruction immédiate de la structure de soutènement et l'effondrement de la couverture de terres.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -219357,12 +219342,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -219432,33 +219411,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "explosion anormale au centre de la galerie", - "rougeL": 0.4 + 
"rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "entraînant comme au Mont Cornillet des tués et des blessés en nombre, et de plus à Carspach la destruction immédiate de la structure de soutènement et l' effondre", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "explosion anormale au centre de la galerie", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "explosion anormale", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "entraînant comme au Mont Cornillet des tués et des blessés en nombre, et de plus à Carspach la destruction immédiate de la structure de soutènement et l' effondrement de la couverture de terres.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "entraînant comme au Mont Cornillet des tués et des blessés en nombre, et de plus à Carspach la destruction immédiate de la structure de soutènement et l' effondrement de la couverture de terres", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la destruction immédiate de la structure de soutènement et l' effondrement de la couverture de terres.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -219480,12 +219466,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur inacceptable" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -219573,33 +219553,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Union de la gauche socialiste, qui rassemble des chrétiens de gauche et d' anciens communistes, mais quitte ce mouvement lorsque celui-ci participe à la fondation du Parti socialiste unifié", - "rougeL": 0.2727272727272727 + "rougeL": 0.2727272727272727, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"Union de la gauche socialiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Union de la gauche socialiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Union de la gauche socialiste,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Union de la gauche socialiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Union de la gauche socialiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Union de la gauche socialiste", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -219633,12 +219620,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -219701,33 +219682,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pour célébrer l' occupation australienne de la Nouvelle - Bretagne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "celebrer l'occupation australienne de la Nouvelle - Bretagne", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour célébrer l'occupation australienne de la Nouvelle-Bretagne", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -219767,12 +219755,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -219823,33 +219805,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "célébrer l' occupation australienne de la Nouvelle - Bretagne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "célébrer l'occupation australienne de la Nouvelle - Bretagne", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "célébrer l' occupation australienne de la Nouvelle - Bretagne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour célébrer l'occupation australienne de la Nouvelle-Bretagne.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -219883,12 +219872,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -219975,33 +219958,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "château d' Itter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "château d' 
Itter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "La Rocque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "colonel de La Rocque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Château d' Itter", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "La Rocque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "La Rocque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 } }, "human_annot": { @@ -220047,12 +220037,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -220105,33 +220089,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "von Kluck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "von Kluck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "von Kluck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "von Kluck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "von Kluck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "von Kluck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "von Kluck", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -220165,12 +220156,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -220252,33 +220237,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "leurs dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "leurs dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "les Alliés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les dirigeants alliés", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les dirigeants, décidés", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 } }, "human_annot": { @@ -220312,12 +220304,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -220394,33 +220380,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "leurs dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "leurs dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "les Alliés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les dirigeants alliés", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "leurs dirigeants", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -220436,12 +220429,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": 
[ { "annot": "annot_8", @@ -220524,33 +220511,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "256", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "256", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "256", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "256", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "256 personnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "256 personnes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mort de 256 personnes.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -220560,12 +220554,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -220652,33 +220640,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "675000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "675000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "675000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "675000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "675000 soldats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "environ 675000", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "environ 675000 soldats", - "rougeL": 0.8571428571428571 + "rougeL": 
0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -220724,12 +220719,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -220774,33 +220763,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les Allemands comptent environ 675000 soldats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les Allemands comptent environ 675000 soldats", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les Allemands comptent environ 675000 soldats tués, blessés ou disparus au combat", - "rougeL": 0.42857142857142855 + "rougeL": 0.42857142857142855, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "675000", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "675000 soldats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Français, Belges, Britanniques, Allemands", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Français, les Belges et les Britanniques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, "human_annot": { @@ -220828,12 +220824,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -220932,33 +220922,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Walther von Lüttwitz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Walther von Lüttwitz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Walther von Lüttwitz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 
}, "Camembert_baseline": { "answer_pred": "Walther von Lüttwitz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "IIIe corps d' armée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Walther von Lüttwitz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Walther von Lüttwitz", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -220992,12 +220989,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -221060,33 +221051,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à une autre guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une autre guerre, fut -elle, pour les navires, ponctuelle", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à une autre guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à une autre guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "loin de la France, à une autre guerre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à une autre guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à une autre guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -221120,12 +221118,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -221182,33 +221174,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une autre guerre", - "rougeL": 0.6666666666666666 + "rougeL": 
0.6666666666666666, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "guerre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "victoire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une autre guerre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "une autre guerre", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la victoire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la victoire", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -221224,12 +221223,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -221310,33 +221303,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "après la bataille de Mons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "après la bataille de Mons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "après la bataille de Mons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "après la bataille de Mons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "après la bataille de Mons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "après la bataille de Mons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "après la bataille de Mons", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -221370,12 +221370,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -221432,33 +221426,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bataille de Mons", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bataille de Mons", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bataille de Mons", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bataille de Mons", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "bataille de Mons", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bataille de Mons", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Après la bataille de Mons.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -221492,12 +221493,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -221578,33 +221573,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "GPT-3.5": { "answer_pred": "Wilson est la personne qui apprend qu'avec un mandat parlementaire, il pourra plus facilement devenir administrateur des sociétés.", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.0 } }, "human_annot": { @@ -221632,12 +221634,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -221706,33 +221702,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "une fosse commune contenant les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une fosse commune contenant les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une fosse commune contenant les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "une fosse commune contenant les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Fosse commune contenant les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une fosse commune contenant les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une fosse commune contenant les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -221772,12 +221775,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -221834,33 +221831,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Rochefort, Brest, Cherbourg et Toulon", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -221894,12 +221898,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -221956,33 +221954,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Paris", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le départ de la brigade de 6000 hommes organisée en deux régiments de fusiliers marins pour aller en renfort de l'armée belge s'est fait à partir 
de Paris.", - "rougeL": 0.0909090909090909 + "rougeL": 0.0909090909090909, + "HScore": 1.0 } }, "human_annot": { @@ -222010,12 +222015,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -222084,33 +222083,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "À la recherche des traces du passé", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -222150,12 +222156,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -222206,33 +222206,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "passé du passé", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 
0.5 }, "mixtral-8x7b": { "answer_pred": "des traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "traces du passé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -222266,12 +222273,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -222334,33 +222335,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "existence d' une lettre du premier ministre français Clemenceau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "l' intransigeance allemande anéantit tout espoir en s' exprimant dans un discours public, l' existence d' une lettre du premier ministre français Clemenceau", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' intransigeance allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' existence d' une lettre du premier ministre français Clemenceau.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "intransigeance allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' existence d' une lettre du premier ministre français Clemenceau", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "l'intransigeance allemande", - "rougeL": 0.8000000000000002 + "rougeL": 0.8000000000000002, + "HScore": 1.0 } }, "human_annot": { @@ -222370,12 +222378,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -222464,33 +222466,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "premières sorties d' usine", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "premières sorties d' usine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les premières sorties d' usine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les premières sorties d' usine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les premières sorties d' usine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les premières sorties d' usine", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les premières sorties d'usine.", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -222530,12 +222539,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -222592,33 +222595,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les corps de trois Kaiserschützen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les corps de trois Kaiserschützen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "corps de trois Kaiserschützen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les corps de trois Kaiserschützen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "trois Kaiserschützen", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les corps de trois Kaiserschützen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les corps de trois Kaiserschützen", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -222646,12 +222656,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -222714,33 +222718,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "trois Kaiserschützen", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "trois Kaiserschützen", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "trois Kaiserschützen", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "trois Kaiserschützen", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "trois Kaiserschützen", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "trois Kaiserschützen", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les corps de trois Kaiserschützen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -222780,12 +222791,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -222842,33 +222847,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un ami", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ami de von Rauffenstein", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un ami", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -222908,12 +222920,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -222970,33 +222976,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "défendre les droits des soldats d' Algérie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour défendre les droits des soldats d'Algérie", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 1.0 } }, "human_annot": { @@ -223036,12 +223049,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -223092,33 +223099,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "défendre les droits des soldats d' Algérie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pour défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pour défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour défendre les droits des soldats d' Algérie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -223152,12 +223166,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -223220,33 +223228,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chef socialiste", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Jaurès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Jaurès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Jaurès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Jaurès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Jaurès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jaurès", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -223280,12 +223295,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -223349,33 +223358,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les barbelés et les mitrailleuses comme les éléments à détruire pour relancer la guerre de mouvement", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les barbelés et les mitrailleuses comme les éléments à détruire pour relancer la guerre de mouvement", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les barbelés et les mitrailleuses comme les éléments à détruire pour relancer la guerre de mouvement", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -223409,12 +223425,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -223471,42 +223481,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + 
"HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les barbelés et les mitrailleuses", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -223599,33 +223610,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -223659,12 +223677,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -223721,33 +223733,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -223781,12 +223800,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -223843,33 +223856,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "chef de la mission militaire française en Pologne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -223903,12 +223923,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -223971,33 +223985,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la première balle traversa le bord de la voiture et atteignit la duchesse de Hohenberg à l' abdomen", - "rougeL": 0.2222222222222222 + "rougeL": 0.2222222222222222, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la première balle traversa le bord de la voiture et atteignit la duchesse de Hohenberg à l' abdomen. La seconde balle atteignit l' archiduc dans le cou", - "rougeL": 0.14814814814814814 + "rougeL": 0.14814814814814814, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la duchesse de Hohenberg à l' abdomen. 
La seconde balle atteignit l' archiduc dans le cou", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "voiture", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "sandwich", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "deux fois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Princip a tiré avec une balle.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 0.5 } }, "human_annot": { @@ -224013,12 +224034,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -224099,33 +224114,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Gough", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Gough", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gough", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Gough", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Gough", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Gough", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Gough", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -224165,12 +224187,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -224228,33 +224244,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemagne de l' 
ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Allemagne de l'ouest", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -224270,12 +224293,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -224352,33 +224369,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemagne", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Allemagne de l' ouest", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En Allemagne", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.5 } }, "human_annot": { @@ -224418,12 +224442,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -224482,33 +224500,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Français", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "partie de Français qui ont rejoint les révolutionnaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "partie de Français qui ont rejoint les révolutionnaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Français qui ont rejoint les révolutionnaires", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de Français qui ont rejoint les révolutionnaires", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Français", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "de Français", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 } }, "human_annot": { @@ -224524,12 +224549,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -224604,33 +224623,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Français", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "partie de Français qui ont rejoint les révolutionnaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "partie de Français qui ont rejoint les révolutionnaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Français qui ont rejoint les révolutionnaires", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 
}, "llama-2_lora": { "answer_pred": "Français qui ont rejoint les révolutionnaires", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "la propagande bolchevique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -224670,12 +224696,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -224732,33 +224752,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le 26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le 26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 26 août 1914.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -224768,12 +224795,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -224854,33 +224875,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "26 août 
1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le 26 août 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 26 août 1914.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -224920,12 +224948,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -224982,33 +225004,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Au début de 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "début de 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "début de 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "début de 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "10 sont des bataillons de la Garde", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Au début de 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Au début de 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -225024,12 +225053,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -225105,33 +225128,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1920", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1920", - "rougeL": 
0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1920", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "1920", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "1920", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "1920", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Au début de 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -225159,12 +225189,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -225234,33 +225258,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -225300,12 +225331,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ 
-225356,33 +225381,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les mineurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les mineurs allemands.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -225422,12 +225454,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -225484,33 +225510,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un emploi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "emploi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un emploi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un emploi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un emploi dans une usine de mécanique automobile", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "emploi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un emploi", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -225520,12 +225553,6 @@ "rating": "Correct" } ], - 
"llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -225612,42 +225639,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fosse commune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fosse commune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à l' entrée sud du tunnel", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' entrée sud du tunnel,", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sud du tunnel", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une fosse commune à l' entrée sud du tunnel", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une fosse commune", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -225740,33 +225768,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "révolution russe de 1905", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "révolution russe de 1905", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "révolution russe de 1905", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "révolution russe de 1905", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "révolution russe de 1905", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "révolution russe de 1905", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "révolution russe de 1905", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -225806,12 +225841,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -225864,15 +225893,15 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1905", "rougeL": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "révolution russe de 1905", "rougeL": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "révolution russe de 1905", "rougeL": 1.0 }, @@ -225938,33 +225967,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le service des armes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le service des armes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le service des armes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le service des armes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le service des armes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le service des armes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le service des armes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -225998,12 +226034,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -226066,33 +226096,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "4 novembre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le 4 novembre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -226138,12 +226175,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -226190,33 +226221,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "4 novembre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le 4 novembre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Le 1er novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Le 1er novembre.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -226250,12 +226288,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -226320,33 +226352,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les survivants de la Première Guerre mondiale ayant subi une ou plusieurs blessures au 
combat", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "survivants de la Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "survivants de la Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "survivants de la Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "survivants de la Première Guerre mondiale ayant subi une ou plusieurs blessures au combat et affectés par des séquelles physiques graves, notamment au niveau du visage.", - "rougeL": 0.4210526315789474 + "rougeL": 0.4210526315789474, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "survivants de la Première Guerre mondiale ayant subi une ou plusieurs blessures au combat et affectés par des séquelles physiques graves", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les survivants de la Première Guerre mondiale ayant subi une ou plusieurs blessures au combat", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -226356,12 +226395,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -226444,42 +226477,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les survivants de la Première Guerre mondiale ayant subi une ou plusieurs blessures au combat", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "gueules cassées", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "survivants de la Première Guerre mondiale ayant subi une ou plusieurs 
blessures au combat", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "survivants de la Première Guerre mondiale", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "au niveau du visage", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "survivants de la Première Guerre mondiale ayant subi une ou plusieurs blessures au combat et affectés par des séquelles physiques graves, notamment au niveau du visage", - "rougeL": 0.4210526315789474 + "rougeL": 0.4210526315789474, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Les survivants de la Première Guerre mondiale.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -226574,33 +226608,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un modèle « French gendarme » sans qu' aucun élément ne vienne valider la réalité de cette variante", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "un modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -226622,12 +226663,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -226696,33 +226731,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le modèle \"French gendarme\"", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -226750,12 +226792,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -226824,33 +226860,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": 
"pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -226878,12 +226921,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -226946,33 +226983,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pilotage sur Morane - Saulnier type L", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jean Navarre a appris le pilotage.", - "rougeL": 0.14285714285714285 + "rougeL": 0.14285714285714285, + "HScore": 0.0 } }, "human_annot": { @@ -227012,12 +227056,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { 
"annot": "annot_5", @@ -227074,33 +227112,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "coup de canon", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le premier coup de canon", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "résidence du poète", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "premier coup de canon", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un premier coup de canon", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "le premier coup de canon", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le premier coup de canon", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -227116,12 +227161,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -227202,42 +227241,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dépôt de grenades ou de munitions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au centre de la galerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "galerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "dépôt de grenades ou de munitions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "au centre de la galerie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "au bas d' un escalier d' accès sinon dans la galerie elle-même", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "d'un dépôt de 
grenades ou de munitions", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -227324,33 +227364,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dépôt de grenades ou de munitions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "grenades ou de munitions", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "grenades ou de munitions", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "dépôt de grenades ou de munitions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "de munitions", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "d' un dépôt de grenades ou de munitions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un dépôt de grenades ou de munitions", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -227366,12 +227413,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -227452,33 +227493,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "soldats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "soldats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "soldats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "officiers", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "les soldats 
flamands sortis du rang", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "les soldats flamands", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les soldats flamands", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.0 } }, "human_annot": { @@ -227512,12 +227560,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -227580,33 +227622,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les zones d' attaque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "zones d' attaque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les zones d' attaque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les zones d' attaque.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les zones d'attaque", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les zones d' attaque", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les zones d'attaque", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 } }, "human_annot": { @@ -227634,12 +227683,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -227702,33 +227745,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "élargir les zones d' attaque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "élargir les zones d' attaque", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ils longent le canal à l' ouest du saillant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "d' élargir les zones d' attaque.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "élargir les zones d' attaque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "élargir les zones d' attaque", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "élargir les zones d'attaque", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -227768,12 +227818,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -227830,33 +227874,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Bundestag", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Bundestag", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Tribunal permanent des peuples, d' une sous-commission de l' ONU pour la prévention des droits de l' homme et la protection des minorités, du parlement européen, du Conseil de l' Europe et du Mercosur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Bundestag", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Bundestag", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "le Bundestag", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -227896,12 +227947,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { 
- "annot": "annot_4", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -227949,33 +227994,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 14 février 1916.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -228003,12 +228055,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -228072,33 +228118,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 14 février 1916", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 } }, "human_annot": { @@ -228126,12 +228179,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -228201,33 +228248,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la mise en route de petits travaux d' intérêt local", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la mise en route de petits travaux d' intérêt local", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mise en route de petits travaux d' intérêt local", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la mise en route de petits travaux d' intérêt local", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "petits travaux d' intérêt local", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la mise en route de petits travaux d' intérêt local", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la mise en route de petits travaux d'intérêt local", - "rougeL": 0.7692307692307692 + "rougeL": 0.7692307692307692, + "HScore": 1.0 } }, "human_annot": { @@ -228261,12 +228315,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -228329,33 +228377,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": 
"miraculés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des enfants", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -228395,12 +228450,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -228451,33 +228500,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "miraculés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "miraculés", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des femmes et des jeunes filles enlevées, disparues dans le secret des maisons turques ou rééduquées dans les écoles islamiques", - "rougeL": 0.08333333333333333 + "rougeL": 0.08333333333333333, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "quelques miraculés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -228499,12 +228555,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -228579,33 +228629,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cadres de l' armée et de l' administration", - "rougeL": 
0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cadres de l' armée et de l' administration", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cadres de l' armée et de l' administration", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "cadres de l' armée et de l' administration,", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une partie des cadres de l' armée et de l' administration", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une partie des cadres de l' armée et de l' administration", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une partie des cadres de l'armée et de l'administration", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -228621,12 +228678,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -228703,33 +228754,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cadres de l' armée et de l' administration", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "cadres de l' armée et de l' administration", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "cadres de l' armée et de l' administration", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "une partie des cadres de l' armée et de l' administration,", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "une partie des cadres de l' armée et de l' administration", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cadres de l' armée et de l' administration", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "une partie des cadres de l' armée et de l' administration", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -228751,12 +228809,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -228833,33 +228885,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -228893,12 +228952,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -228957,33 +229010,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemands de Russie", 
- "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Allemands de Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -229023,12 +229083,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -229087,33 +229141,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Balkans, la Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 0.8235294117647058 + "rougeL": 0.8235294117647058, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "la Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Grèce, Italie et les puissances de l'Entente", - "rougeL": 0.7999999999999999 + "rougeL": 0.7999999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Grèce, l'Italie et les puissances de l'Entente", - "rougeL": 0.39999999999999997 + "rougeL": 0.39999999999999997, + 
"HScore": 1.0 } }, "human_annot": { @@ -229129,12 +229190,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -229209,33 +229264,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Constantin Ier", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "la Grèce, l' Italie et les puissances de l' Entente", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Grèce, Italie et puissances de l'Entente", - "rougeL": 0.7999999999999999 + "rougeL": 0.7999999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la Grèce, l'Italie et les puissances de l'Entente", - "rougeL": 0.39999999999999997 + "rougeL": 0.39999999999999997, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l' Italie et les puissances de l' Entente", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 } }, "human_annot": { @@ -229263,12 +229325,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -229337,33 +229393,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "une ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une ligne de chemin de fer", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chemin de fer", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "une ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -229397,12 +229460,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -229461,42 +229518,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "chemin de fer", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "chemin de fer", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Mourmansk", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "une ligne de chemin de fer vers ce qui allait devenir la ville de Mourmansk", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une ligne de chemin de fer", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -229591,33 +229649,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le plus gros obus qu' un fort pouvait tirer", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 
}, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le plus gros obus qu' un fort pouvait tirer", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le plus gros obus qu' un fort pouvait tirer", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le plus gros obus", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.8 }, "llama-2_lora": { "answer_pred": "un obus de Grosse Bertha", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.45 }, "mixtral-8x7b": { "answer_pred": "le plus gros obus qu' un fort pouvait tirer", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un obus d'un fort", - "rougeL": 0.28571428571428575 + "rougeL": 0.28571428571428575, + "HScore": 0.6 } }, "human_annot": { @@ -229873,48 +229938,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -230080,33 +230103,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + 
"HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bruno von Mudra", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bruno von Mudra", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Bruno von Mudra", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -230116,12 +230146,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -230239,33 +230263,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Mudra", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Bruno von Mudra", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bruno von Mudra", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Bruno von Mudra", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -230275,12 +230306,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -230380,33 +230405,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les observateurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "les observateurs allemands dans la vallée", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les observateurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "observateurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "observateurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "observateurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les observateurs allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -230416,12 +230448,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -230508,33 +230534,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "au lendemain de la Révolution d' Octobre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au lendemain de la Révolution d' Octobre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au lendemain de la Révolution d' Octobre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "lendemain de la Révolution d' Octobre.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "au lendemain de la Révolution d' Octobre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "au lendemain de la Révolution d' Octobre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Botchkareva est arrêtée au lendemain de la Révolution d' Octobre.", - "rougeL": 0.5454545454545454 + 
"rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -230568,12 +230601,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -230630,33 +230657,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Révolution d' Octobre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Révolution d' Octobre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Révolution d' Octobre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Révolution d' Octobre.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "révolution d'Octobre", - "rougeL": 0.3333333333333333 + "rougeL": 0.3333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Révolution d' Octobre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Révolution d' Octobre", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -230684,12 +230718,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -230758,33 +230786,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "FEA 11", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "FEA 11", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "FEA 11", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "FEA 11", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "FEA 11", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": 
"Josef Carl Peter Jacobs", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "il est envoyé au FEA 11 à Laon", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -230830,12 +230865,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -230886,33 +230915,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "France. Jean Baptiste Eugène Estienne. 
Jean Baptiste Eugène Estienne (7 novembre 1860 à Condé -en - Barrois, France - 2 avril 1936 à Paris) est un artilleur et ingénieur militaire français.", - "rougeL": 0.05882352941176471 + "rougeL": 0.05882352941176471, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "France", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -230946,12 +230982,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -231015,33 +231045,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "4 h 10", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "4 h 10", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "4 h 10", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "pendant les nuits précédentes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "durant les nuits précédentes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "pendant les nuits précédentes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les nuits précédentes.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 0.5 } }, "human_annot": { @@ -231057,12 +231094,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -231137,33 +231168,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pendant les nuits précédentes", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les nuits précédentes", - "rougeL": 0.8 + 
"rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les nuits précédentes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les nuits précédentes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "durant les nuits précédentes", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "les nuits précédentes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les nuits précédentes", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -231191,12 +231229,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -231265,33 +231297,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en bas du versant méridional de la butte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en bas du versant méridional de la butte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bas du versant méridional de la butte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en bas du versant méridional de la butte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en bas du versant méridional de la butte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en bas du versant méridional de la butte", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en bas du versant méridional de la butte.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -231319,12 +231358,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": 
"Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -231388,33 +231421,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "méridional de la butte", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "versant méridional de la butte", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "versant méridional de la butte", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "méridional", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "versant méridional de la butte", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "versant méridional de la butte", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en bas du versant méridional", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -231436,12 +231476,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -231517,33 +231551,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans le bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans le bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dans le bureau public de télégraphe à Mexico", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Mexico", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans le bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -231553,12 +231594,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -231639,33 +231674,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le bureau public de télégraphe", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans le bureau public de télégraphe à Mexico", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -231705,12 +231747,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -231767,33 +231803,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sans consultations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "sans consultations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sans consultations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sans consultations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "il agit sans consultations", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "agit sans consultations", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Wilson agit sans consultations.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -231839,12 +231882,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -231895,42 +231932,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "questions éthiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "questions éthiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "questions éthiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "questions éthiques", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "une paix mondiale moins militarisée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une paix mondiale moins militarisée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour une paix mondiale moins militarisée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -232028,33 
+232066,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "son avis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "son avis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "son avis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "son avis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ses avis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "son avis", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'avis de Wilson", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -232094,12 +232139,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -232156,33 +232195,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sur la ZAC Actiparc près d' Arras", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur la ZAC Actiparc près d' Arras", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur la ZAC Actiparc près d' Arras", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "ZAC Actiparc près d' Arras.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "ZAC Actiparc près d' Arras", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ZAC Actiparc près d' Arras", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur la ZAC Actiparc près d'Arras", - "rougeL": 0.8888888888888888 + "rougeL": 0.8888888888888888, + "HScore": 1.0 } }, "human_annot": { 
@@ -232192,12 +232238,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -232280,33 +232320,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Actiparc", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ZAC Actiparc", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ZAC Actiparc", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Actiparc", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Actiparc", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ZAC Actiparc près d' Arras", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "ZAC Actiparc près d'Arras.", - "rougeL": 0.8888888888888888 + "rougeL": 0.8888888888888888, + "HScore": 1.0 } }, "human_annot": { @@ -232352,12 +232399,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -232410,33 +232451,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "très élevées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "très élevées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "très élevées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "très élevées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "élevées", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "très 
élevées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "très élevées", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -232470,12 +232518,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -232539,33 +232581,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le haut commandement de l'armée", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -232605,12 +232654,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -232661,33 +232704,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "haut commandement de l' armée", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "au haut commandement de l'armée", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -232715,12 +232765,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -232789,33 +232833,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "écarter de Gaulle après son échec de Dakar", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "écarter de Gaulle après son échec de Dakar", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "écarter de Gaulle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "écarter de Gaulle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "écarter de Gaulle après son échec de Dakar", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "écarter de Gaulle après son échec de Dakar, et incertains de l' accueil que leur réserveraient les généraux vichystes, cherchent un général français prestigieux pour prendre le commandement de l' armée d' Afrique", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Alliés envisagent un débarquement en Afrique du Nord", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 } }, 
"human_annot": { @@ -232837,12 +232888,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -232911,42 +232956,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "écarter de Gaulle après son échec de Dakar", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "écarter de Gaulle après son échec de Dakar", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "écarter de Gaulle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "écarter de Gaulle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "écarter de Gaulle après son échec de Dakar", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "cherchent un général français prestigieux pour prendre le commandement de l' armée d' Afrique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la décision prise par les Alliés dirigeants est de chercher un général français prestigieux pour prendre le commandement de l'armée d'Afrique.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -233051,42 +233097,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "impératrice russe", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "impératrice russe", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": 
"impératrice russe", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "l' impératrice russe", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "caisse de l' impératrice russe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "caisse de l' impératrice russe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la caisse de l'impératrice russe", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -233197,33 +233244,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pour savoir comment leurs différends ont pu être divulgués dans la presse et fait valoir son droit d' exprimer ses opinions sur la stratégie", - "rougeL": 0.23529411764705882 + "rougeL": 0.23529411764705882, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pour savoir comment leurs différends ont pu être divulgués dans la presse et fait valoir son droit d' exprimer ses opinions sur la stratégie", - "rougeL": 0.23529411764705882 + "rougeL": 0.23529411764705882, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "leurs différends ont pu être divulgués dans la presse", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "stratégie", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "de la stratégie", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "stratégie", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Lloyd George écrit 
à Robertson pour savoir comment leurs différends ont pu être divulgués dans la presse et fait valoir son droit d'exprimer ses opinions sur la stratégie.", - "rougeL": 0.14814814814814814 + "rougeL": 0.14814814814814814, + "HScore": 1.0 } }, "human_annot": { @@ -233239,12 +233293,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -233325,33 +233373,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sous domination allemande depuis 1871", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "occupée", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sous domination allemande depuis 1871", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Thann, sous domination allemande depuis 1871, est occupée et devient, jusqu' à la fin de la guerre,", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "sous domination allemande depuis 1871", - "rougeL": 0.9090909090909091 + "rougeL": 0.9090909090909091, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Thann, sous domination allemande depuis 1871, est occupée et devient, jusqu' à la fin de la guerre, la capitale d' une portion de territoire alsacien redevenue française.", - "rougeL": 0.4799999999999999 + "rougeL": 0.4799999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Thann, sous domination allemande depuis 1871, est occupée et devient, jusqu' à la fin de la guerre, la capitale d' une portion de territoire alsacien redevenue française.", - "rougeL": 0.4799999999999999 + "rougeL": 0.4799999999999999, + "HScore": 1.0 } }, 
"human_annot": { @@ -233367,12 +233422,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -233453,33 +233502,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "corps de 16 hommes exécutés par la Gestapo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "les corps de 16 hommes exécutés par la Gestapo", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "16 hommes exécutés par la Gestapo", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 0.85 }, "mixtral-8x7b": { "answer_pred": "les corps de 16 hommes", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.85 }, "GPT-3.5": { "answer_pred": "les corps de 16 hommes exécutés par la Gestapo.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -233777,48 +233833,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Partiellement correct" - }, - { - "annot": "annot_9", - "rating": "Partiellement correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -233899,33 +233913,40 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "16 hommes", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "16 hommes exécutés par la Gestapo", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "16 hommes exécutés par la Gestapo", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "16 hommes exécutés par la Gestapo", - "rougeL": 0.9411764705882353 + "rougeL": 0.9411764705882353, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "16 hommes", - "rougeL": 0.3636363636363636 + "rougeL": 0.3636363636363636, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "les corps de 16 hommes", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les corps de 16 hommes exécutés par la Gestapo.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -233941,12 +233962,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -234027,33 +234042,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "400 kg", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"GPT-3.5": { "answer_pred": "plus de 400 kg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -234081,12 +234103,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -234155,33 +234171,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "utiliser ces conseils comme éléments de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "utiliser ces conseils comme éléments de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "d' utiliser ces conseils comme éléments de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' utiliser ces conseils comme éléments de base de discussions sur la stratégie future.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "envoi d' aide à l' Italie et la préparation d' offensives en Palestine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "utiliser ces conseils comme éléments de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Lloyd George a décidé d'utiliser ces conseils comme éléments de base de discussions sur la stratégie future.", - "rougeL": 0.5833333333333334 + "rougeL": 0.5833333333333334, + "HScore": 1.0 } }, "human_annot": { @@ -234227,12 +234250,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -234277,33 +234294,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "utiliser ces conseils comme éléments 
de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "utiliser ces conseils comme éléments de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "utiliser ces conseils comme éléments de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "d' utiliser ces conseils comme éléments de base de discussions sur la stratégie future.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "envoi d' aide à l' Italie et la préparation d' offensives en Palestine", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "utiliser ces conseils comme éléments de base de discussions sur la stratégie future", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Lloyd George décide d'utiliser ces conseils comme éléments de base de discussions sur la stratégie future.", - "rougeL": 0.608695652173913 + "rougeL": 0.608695652173913, + "HScore": 1.0 } }, "human_annot": { @@ -234325,12 +234349,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -234405,33 +234423,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bordeaux", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -234471,12 +234496,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -234527,33 +234546,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Bordeaux", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -234587,12 +234613,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -234655,33 +234675,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le matin", - "rougeL": 1.0 + "rougeL": 
1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le matin.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -234715,12 +234742,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -234778,33 +234799,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "18 mars 1918", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le matin, l' artillerie allemande bombarde les lignes françaises à l' aide d' obus à gaz", - "rougeL": 0.18181818181818182 + "rougeL": 0.18181818181818182, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "le matin", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -234820,12 +234848,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -234907,42 +234929,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "bande dessinée", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "production assez importante", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "une production assez importante", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "Camembert_baseline": { "answer_pred": "une production assez importante", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "une production assez importante de bande dessinée", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "une production assez importante existe depuis au moins les années 2000", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "une production assez importante", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -235047,42 +235070,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "duchesse de Hohenberg", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "duchesse de Hohenberg", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "duchesse de Hohenberg", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' archiduc François - Ferdinand, héritier de l' Empire austro-hongrois, et son épouse la duchesse de Hohenberg,", - "rougeL": 0.4166666666666667 + "rougeL": 0.4166666666666667, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Duchesse de Hohenberg", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la duchesse de Hohenberg", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'épouse la duchesse de Hohenberg", - "rougeL": 0.7272727272727272 + "rougeL": 0.7272727272727272, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": 
"annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -235183,33 +235207,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "archiduc François - Ferdinand", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "François - Ferdinand", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "archiduc François - Ferdinand", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "François - Ferdinand", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "François-Ferdinand, héritier de l'Empire austro-hongrois", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la duchesse de Hohenberg", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "son épouse la duchesse de Hohenberg", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -235219,12 +235250,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -235313,33 +235338,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "début de juin 1920", - "rougeL": 0.4 + "rougeL": 
0.4, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "En février 1920.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -235385,12 +235417,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -235437,33 +235463,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "février 1920", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -235503,12 +235536,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -235567,33 +235594,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "llama-2_lora": { "answer_pred": "des masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -235639,12 +235673,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -235689,33 +235717,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "inventer des masques à gaz et systèmes de protection", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "inventer des masques à gaz et systèmes de protection", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "inventer des masques à gaz et systèmes de protection", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "masques à gaz et systèmes de protection", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "inventer des masques à gaz et systèmes de protection", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -235731,12 +235766,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -235817,42 +235846,43 @@ } ], 
"predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "sa carrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "carrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sa carrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "sa carrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "gendarme à Gaillon", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "débute sa carrière", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Jules Arsène Crosnier a commencé sa carrière en tant que gendarme à Gaillon puis à Louviers.", - "rougeL": 0.09523809523809523 + "rougeL": 0.09523809523809523, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -235945,33 +235975,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "90 % des pertes de la 25e division d' infanterie britannique", - "rougeL": 0.875 + "rougeL": 0.875, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pertes importantes à l' infanterie britannique", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pertes importantes à l' infanterie britannique", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "pertes importantes à l' infanterie britannique.", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "90 % des pertes de la 25e division d' infanterie britanniques proviennent de l' artillerie 
allemande.", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "des pertes importantes à l' infanterie britannique", - "rougeL": 0.4615384615384615 + "rougeL": 0.4615384615384615, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "des pertes importantes à l'infanterie britannique", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 0.5 } }, "human_annot": { @@ -236017,12 +236054,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -236073,42 +236104,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Le combat contre la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "combat contre la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Le combat contre la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Le combat contre la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Bethmann Hollweg", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "le combat contre la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le combat contre la Russie.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -236197,33 +236229,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "les sympathisants des Habsbourg et au sein de la branche des financiers", - "rougeL": 0.15384615384615385 + "rougeL": 0.15384615384615385, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le 
combat contre la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "combat contre la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Russie", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Allemagne du sud", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Russie", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Les financiers des Habsbourg apportent leur aide à l'Allemagne du sud.", - "rougeL": 0.11764705882352941 + "rougeL": 0.11764705882352941, + "HScore": 0.0 } }, "human_annot": { @@ -236233,12 +236272,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -236327,33 +236360,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "à déposer au procès Pucheu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "explique ses engagements envers l' accusé, ainsi que leur violation lors de la mise en résidence surveillée de ce dernier", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Giraud explique ses engagements envers l' accusé, ainsi que leur violation lors de la mise en résidence surveillée de ce dernier", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "appelé par la défense à déposer au procès Pucheu", - "rougeL": 0.8333333333333333 + "rougeL": 0.8333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "pour déposer au procès Pucheu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": 
"explique ses engagements envers l' accusé, ainsi que leur violation lors de la mise en résidence surveillée de ce dernier", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour déposer au procès Pucheu", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -236399,12 +236439,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -236455,33 +236489,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "déplacements de populations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "déplacements de populations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des déplacements de populations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "déplacements de populations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des déplacements de populations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "déplacements de populations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "des déplacements de populations", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -236503,12 +236544,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -236583,33 +236618,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "En juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "En juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "juin 1914", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En juin 1914.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -236631,12 +236673,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -236712,33 +236748,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1914", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1914", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1914", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En juin 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -236766,12 +236809,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -236841,33 +236878,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
- "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -236895,12 +236939,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -236964,33 +237002,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en 1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en 1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1881", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -237024,12 +237069,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -237093,33 +237132,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "cimetières des communes voisines ou créés à proximité des champs de bataille", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "sur le front", - "rougeL": 1.0 + 
"rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "sur le front", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "cimetières des communes voisines ou créés à proximité des champs de bataille", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "sur les champs de bataille", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "sur le front", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "sur le front", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -237159,12 +237205,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -237217,33 +237257,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "inhumés dans des cimetières des communes voisines ou créés à proximité des champs de bataille ou encore demeurèrent sans sépulture dans les lieux les plus inaccessibles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "inhumés dans des cimetières des communes voisines ou créés à proximité des champs de bataille ou encore demeurèrent sans sépulture dans les lieux les plus inaccessibles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "inhumés dans des cimetières des communes voisines ou créés à proximité des champs de bataille ou encore demeurèrent sans sépulture dans les lieux les plus inaccessibles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "cimetières des communes voisines ou créés à proximité des champs de bataille ou encore demeurèrent sans sépulture dans les lieux les plus inaccessibles", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { 
"answer_pred": "sur le front", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "sur le front", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Les militaires sont morts sur le front.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -237271,12 +237318,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -237347,33 +237388,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "alerte l' armée russe qui bloque puis repousse les deux autres corps d' armée entraînant la retraite allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "alerte l' armée russe qui bloque puis repousse les deux autres corps d' armée entraînant la retraite allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "alerte l' armée russe qui bloque puis repousse les deux autres corps d' armée entraînant la retraite allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "bloque puis repousse les deux autres corps d' armée entraînant la retraite allemande.", - "rougeL": 0.8695652173913044 + "rougeL": 0.8695652173913044, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "alarme l' armée russe qui bloque puis repousse les deux autres corps d' armée entraînant la retraite allemande.", - "rougeL": 0.9230769230769231 + "rougeL": 0.9230769230769231, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "alarte l' armée russe qui bloque puis repousse les deux autres corps d' armée entraînant la retraite allemande", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "La séquence de mots qui répond à la question est \"bloque puis 
repousse les deux autres corps d'armée\".", - "rougeL": 0.48000000000000004 + "rougeL": 0.48000000000000004, + "HScore": 0.5 } }, "human_annot": { @@ -237413,12 +237461,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -237476,33 +237518,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un résumé des nouvelles militaires qui pour la plupart sont déjà publiées dans la presse", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "résumé des nouvelles militaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un résumé des nouvelles militaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "Camembert_baseline": { "answer_pred": "un résumé des nouvelles militaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "llama-2_lora": { "answer_pred": "un résumé des nouvelles militaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "mixtral-8x7b": { "answer_pred": "résumé des nouvelles militaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "GPT-3.5": { "answer_pred": "Robertson donne aux membres un résumé des nouvelles militaires.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 0.85 } }, "human_annot": { @@ -237548,48 +237597,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Partiellement correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Partiellement correct" - }, - { - "annot": "annot_6", - "rating": "Partiellement correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - 
"annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -237922,33 +237929,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "nouvelles militaires qui pour la plupart sont déjà publiées dans la presse", - "rougeL": 0.4444444444444444 + "rougeL": 0.4444444444444444, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "des nouvelles militaires qui pour la plupart sont déjà publiées dans la presse", - "rougeL": 0.4444444444444444 + "rougeL": 0.4444444444444444, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "nouvelles militaires", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "des nouvelles militaires", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des nouvelles militaires qui pour la plupart sont déjà publiées dans la presse", - "rougeL": 0.4444444444444444 + "rougeL": 0.4444444444444444, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "résumé des nouvelles militaires", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Robertson donne aux membres un résumé des nouvelles militaires.", - "rougeL": 0.5454545454545454 + "rougeL": 0.5454545454545454, + "HScore": 1.0 } }, "human_annot": { @@ -237958,12 +237972,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -238050,33 +238058,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "commandement de l' ANZAC", - 
"rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le commandement de l'ANZAC", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -238104,12 +238119,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -238172,33 +238181,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "le commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' ANZAC", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "IIe ANZAC", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "commandement de l' ANZAC", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le commandement de l'ANZAC", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 1.0 } }, "human_annot": { @@ -238208,12 +238224,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -238300,33 +238310,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Izvestias de 
Kronstadt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Izvestias de Kronstadt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "les Izvestias de Kronstadt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Izvestias de Kronstadt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Izvestias de Kronstadt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "les Izvestias de Kronstadt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Izvestias de Kronstadt", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -238372,12 +238389,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -238428,33 +238439,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "soldats de toutes les puissances combattantes", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "ennemi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "ennemi", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "l' ennemi.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "soldats de toutes les puissances combattantes", - "rougeL": 0.33333333333333337 + "rougeL": 0.33333333333333337, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "soldats", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "tous les soldats ou soldats de toutes les puissances combattantes", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 } }, 
"human_annot": { @@ -238464,12 +238482,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -238573,33 +238585,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "pirates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "pirates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "François Deuve", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "pirates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "pirates et seigneurs de la guerre", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "pirates", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "L'Inconstant traque les pirates.", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 0.5 } }, "human_annot": { @@ -238639,12 +238658,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -238701,33 +238714,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le Conseil général de l' Aisne", - "rougeL": 
1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Conseil général de l'Aisne", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -238743,12 +238763,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -238823,33 +238837,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Aisne", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Aisne", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Conseil général de l' Aisne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Conseil général de l'Aisne", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -238871,12 +238892,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -238951,42 +238966,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "découper un lambeau de peau du bras à apposer sur le visage dans la plaie afin que celle-ci se ferme grâce à la peau fournie et à maintenir à l' aide d' une structure métallique", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "procédés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "des procédés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "cette méthode consistait à découper un lambeau de peau du bras à apposer sur le visage dans la plaie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "procédés", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "une méthode consistait à découper un lambeau de peau du bras à apposer sur le visage dans la plaie afin que celle-ci se ferme grâce à la peau fournie et à maintenir à l' aide d' une structure métallique le bras sanglant au visage afin de vasculariser la plaie pour qu' elle se referme", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "cette méthode", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -239085,33 +239101,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "la production des R.V I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la production des R.V I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la production des R.V I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "la production des R.V I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "transformé en hydravion et équipé de flotteurs pour l' aéronavale allemande, et désigné en tant que type L avec le numéro de série 1432.", - "rougeL": 0.09999999999999999 + "rougeL": 0.09999999999999999, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "la production des 
R.V I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la production des R.V I", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -239157,12 +239180,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -239213,33 +239230,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -239273,12 +239297,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -239335,33 +239353,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1919", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -239401,12 +239426,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -239463,33 +239482,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 } }, "human_annot": { @@ -239529,12 +239555,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -239585,33 +239605,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un modèle « French gendarme » sans qu' aucun élément ne vienne valider la réalité de cette variante", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 
0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "un modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "French gendarme", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un modèle « French gendarme »", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un modèle \"French gendarme\"", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 } }, "human_annot": { @@ -239645,12 +239672,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -239713,42 +239734,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "commémorer le 50e anniversaire du génocide arménien en 1965", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "commémorer le 50e anniversaire du génocide arménien en 1965", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour commémorer le 50e anniversaire du génocide arménien en 1965", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "commémorer le 50e anniversaire du génocide arménien", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "50e anniversaire du génocide arménien", - "rougeL": 0.7692307692307693 + "rougeL": 0.7692307692307693, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "50e anniversaire du génocide arménien", - "rougeL": 0.7692307692307693 + 
"rougeL": 0.7692307692307693, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "pour commémorer le 50e anniversaire du génocide arménien en 1965.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -239841,33 +239863,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "2 bataillons, 2 escadrons et 3 pièces d' artillerie", - "rougeL": 0.37499999999999994 + "rougeL": 0.37499999999999994, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée.", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "2 bataillons, 2 escadrons et 3 pièces d' artillerie", - "rougeL": 0.37499999999999994 + "rougeL": 0.37499999999999994, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie", - "rougeL": 0.5555555555555556 + "rougeL": 0.5555555555555556, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la séquence de mots qui répond à la question est \"les troupes avancées\"", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -239889,12 +239918,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_8", @@ -239963,33 +239986,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie", - "rougeL": 0.5555555555555556 + "rougeL": 0.5555555555555556, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée.", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d' artillerie en mouvement en direction de Dommartin - Lettrée.", - "rougeL": 0.3846153846153846 + "rougeL": 0.3846153846153846, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un groupe formé de 2 bataillons, 2 escadrons et 3 pièces d'artillerie en mouvement en direction de Dommartin-Lettrée.", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -240017,12 +240047,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -240103,33 +240127,40 @@ } ], "predictions": { - 
"MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ioudenitch", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ioudenitch", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -240151,12 +240182,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -240237,33 +240262,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ioudenitch", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Ioudenitch", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le général Ioudenitch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-240285,12 +240317,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_3", @@ -240365,33 +240391,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -240419,12 +240452,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -240487,33 +240514,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le 9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 9 mars 1943", - "rougeL": 1.0 + "rougeL": 1.0, 
+ "HScore": 1.0 } }, "human_annot": { @@ -240553,12 +240587,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -240615,33 +240643,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "L'article ne mentionne pas où la famille de Louis Murat a érigé un monument à sa mémoire.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -240675,12 +240710,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -240737,33 +240766,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { 
"answer_pred": "à Lihons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -240803,12 +240839,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -240865,33 +240895,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Gero", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Gero", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Gero", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Gero", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Gero, son fils aîné, leutnant au Grenadier - Regiment (mot.) 51", - "rougeL": 0.25 + "rougeL": 0.25, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Gero", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Gero, son fils aîné.", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 } }, "human_annot": { @@ -240931,12 +240968,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -240990,33 +241021,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "santé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la santé, y compris la santé reproductive de nombreux animaux, et celle des humains qui les consomment", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "la santé, y compris la santé reproductive de nombreux animaux, et celle des humains qui les consomment", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + 
"HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "la santé", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "la santé, y compris la santé reproductive de nombreux animaux, et celle des humains qui les consomment.", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la santé, y compris la santé reproductive de nombreux animaux, et celle des humains qui les consomment", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la santé, y compris la santé reproductive de nombreux animaux, et celle des humains qui les consomment", - "rougeL": 0.16666666666666669 + "rougeL": 0.16666666666666669, + "HScore": 0.5 } }, "human_annot": { @@ -241056,12 +241094,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_6", @@ -241118,33 +241150,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mener avec ses deux croiseurs 
cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Maximilian von Spee décide de mener avec ses deux croiseurs cuirassés un combat retardateur désespéré.", - "rougeL": 0.7333333333333334 + "rougeL": 0.7333333333333334, + "HScore": 1.0 } }, "human_annot": { @@ -241178,12 +241217,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -241240,33 +241273,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "décide de mener avec ses deux croiseurs cuirassés un combat retardateur désespéré", - "rougeL": 0.9565217391304348 + "rougeL": 0.9565217391304348, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Maximilian von Spee décide de mener avec ses deux croiseurs cuirassés un combat retardateur désespéré.", - "rougeL": 0.7333333333333334 + "rougeL": 0.7333333333333334, + "HScore": 1.0 } }, "human_annot": { @@ -241312,12 +241352,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], 
"mixtral-8x7b": [ { "annot": "annot_10", @@ -241368,42 +241402,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dans ce port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Dans ce port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans ce port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ce port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Dans ce port sous contrôle britannique, on trouve des navires de guerre français et russes.", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -241490,33 +241525,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Dans ce port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Les Alliés", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "TsingTao", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "TsingTao", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Port sous contrôle britannique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "ce port sous contrôle britannique", 
- "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans ce port sous contrôle britannique, en plus de nombreux navires marchands, on trouve des navires de guerre français et russes.", - "rougeL": 0.47058823529411764 + "rougeL": 0.47058823529411764, + "HScore": 1.0 } }, "human_annot": { @@ -241544,12 +241586,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -241618,33 +241654,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas de séquence de mots dans l'article qui répond à la question \"À quelle date l'ennemi est-il arrivé à Épernay ?\"", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -241660,12 +241703,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -241742,33 +241779,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "15 
juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "d'une part Châlons et d'autre part Épernay", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { @@ -241808,12 +241852,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -241866,42 +241904,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "15 juillet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "NULL", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "Il n'y a pas d'information dans l'article concernant la date d'arrivée de l'ennemi à Épernay.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -242008,33 +242047,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "assiste avec Pétain à la démonstration du châssis Schneider", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Pétain", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Estienne.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -242056,12 +242102,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_2", @@ -242142,33 +242182,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Pétain", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Pétain", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Pétain", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Estienne", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -242202,12 +242249,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -242270,33 +242311,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1907.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -242324,12 +242372,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -242393,33 +242435,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "En 1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "En 1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "En 1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "En 1907", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -242459,12 +242508,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -242522,33 +242565,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "turques et grecques", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, - "T5-large_260_AP0": { + 
"T5-large": { "answer_pred": "turques et grecques", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "turques et grecques", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "(turques et grecques).", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "turques et grecques", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "populations mélangées (turques et grecques)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les populations mélangées (turques et grecques)", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -242558,12 +242608,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -242650,33 +242694,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Lower Hutt en Nouvelle-Zélande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Lower Hutt en Nouvelle-Zélande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Lower Hutt en Nouvelle-Zélande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Lower Hutt en Nouvelle-Zélande.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Lower Hutt en Nouvelle-Zélande.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Lower Hutt en Nouvelle-Zélande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Lower Hutt en Nouvelle-Zélande.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 } }, "human_annot": { @@ -242716,12 +242767,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -242797,33 +242842,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Maximilian von Weichs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "von Weichs", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "von Weichs", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "von Weichs", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Maximilian von Weichs", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "von Weichs", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "von Weichs", - "rougeL": 0.7272727272727273 + "rougeL": 0.7272727272727273, + "HScore": 1.0 } }, "human_annot": { @@ -242851,12 +242903,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -242925,33 +242971,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dépotoir d' atelier", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dépotoir d' atelier", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dépotoir d' atelier", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dépotoir d' atelier,", - "rougeL": 
0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "des dizaines de rebuts de tôle de laiton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "dépotoir d' atelier, découvert sur la ZAC Actiparc près d' Arras", - "rougeL": 0.4210526315789473 + "rougeL": 0.4210526315789473, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "un dépotoir d'atelier", - "rougeL": 0.6 + "rougeL": 0.6, + "HScore": 1.0 } }, "human_annot": { @@ -242997,12 +243050,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -243049,33 +243096,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dépotoir d' atelier", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.9 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dépotoir d' atelier", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.9 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dépotoir d' atelier", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.9 }, "Camembert_baseline": { "answer_pred": "dépotoir d' atelier,", - "rougeL": 0.888888888888889 + "rougeL": 0.888888888888889, + "HScore": 0.85 }, "llama-2_lora": { "answer_pred": "des dizaines de rebuts de tôle de laiton", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.25 }, "mixtral-8x7b": { "answer_pred": "dépotoir d' atelier, découvert sur la ZAC Actiparc près d' Arras", - "rougeL": 0.4210526315789473 + "rougeL": 0.4210526315789473, + "HScore": 0.85 }, "GPT-3.5": { "answer_pred": "des fouilles archéologiques récentes ont permis de retrouver les traces d'un dépotoir d'atelier", - "rougeL": 0.4444444444444444 + "rougeL": 0.4444444444444444, + "HScore": 0.9 } }, "human_annot": { @@ -243205,48 +243259,6 @@ "rating": "Erreur inacceptable" } ], - "llama-2-70b": [ - { - "annot": 
"annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Partiellement correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Erreur acceptable (\"humaine\")" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -243515,33 +243527,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Le gouvernement a du mal à maîtriser ce sentiment auprès d' un nombre toujours croissant de patriotes et d' interventionnistes", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "l' Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "l' Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "l' Italie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'Italie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -243557,12 +243576,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -243644,33 +243657,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + 
"HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le 26 octobre 1914.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -243710,12 +243730,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -243766,33 +243780,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "26 octobre 1914", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 26 octobre 1914.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -243826,12 +243847,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -243912,33 +243927,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Les Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - 
"T5-large_260_AP0": { + "T5-large": { "answer_pred": "Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Les Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "les Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "les Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -243984,12 +244006,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_2", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_2", @@ -244054,33 +244070,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Belges", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Algériens, le 2e régiment de tirailleurs Algériens, le 1er régiment de tirailleurs Algériens, le 2e régiment de Zouaves, des Belges et des Canadiens", - "rougeL": 0.06896551724137932 + "rougeL": 0.06896551724137932, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Les Allemands", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "5200 soldats", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Belges et Canadiens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "GPT-3.5": { "answer_pred": "les troupes alliées", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.0 } }, "human_annot": { @@ -244114,12 +244137,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": 
"annot_10", - "rating": "Erreur inacceptable" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -244184,33 +244201,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Courlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Lettons, en favorisant l' installation de Germano - Russes venant du domaine iméprial russe, de domaines du clergé et de grands propriétaires terriens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "l' installation de Germano - Russes venant du domaine iméprial russe, de domaines du clergé et de grands propriétaires terriens", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "Camembert_baseline": { "answer_pred": "Lettons", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "Courlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Courlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "les nouveaux États", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -244238,12 +244262,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Partiellement correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -244313,33 +244331,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "novembre 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "novembre 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "novembre 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "novembre 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "novembre 1919", - "rougeL": 1.0 
+ "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "novembre 1919", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Suite aux élections de novembre 1919, le Bloc National s'avère être le grand gagnant.", - "rougeL": 0.3076923076923077 + "rougeL": 0.3076923076923077, + "HScore": 1.0 } }, "human_annot": { @@ -244367,12 +244392,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -244441,33 +244460,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -244501,12 +244527,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -244563,33 +244583,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Robertson", 
- "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Robertson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Robertson.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -244623,12 +244650,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_4", @@ -244691,33 +244712,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vitry - le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "Camembert_baseline": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "llama-2_lora": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "mixtral-8x7b": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.85 }, "GPT-3.5": { "answer_pred": "Adolphe Guillaumat combat à Vitry-le-François lors de la première bataille de la Marne.", - "rougeL": 0.27272727272727276 + "rougeL": 0.27272727272727276, + "HScore": 0.9 } }, "human_annot": { @@ -244889,48 +244917,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": 
"Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Partiellement correct" - }, - { - "annot": "annot_10", - "rating": "Erreur acceptable (\"humaine\")" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -245099,12 +245085,6 @@ "rating": "Erreur acceptable (\"humaine\")" } ], - "question": [ - { - "annot": "annot_10", - "rating": "Question douteuse" - } - ], "no_answer": [ { "annot": "annot_10", @@ -245149,33 +245129,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Vitry -le - François", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Adolphe Guillaumat combat à Vitry-le-François lors de la première bataille de la Marne.", - "rougeL": 0.27272727272727276 + "rougeL": 0.27272727272727276, + "HScore": 1.0 } }, "human_annot": { @@ -245203,12 +245190,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -245277,33 +245258,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "armée belge emmenée par le roi Albert Ier inflige donc un grave échec à l' armée impériale allemande au tout début de la campagne", - "rougeL": 0.9189189189189189 + "rougeL": 0.9189189189189189, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { 
"answer_pred": "armée impériale allemande", - "rougeL": 0.3 + "rougeL": 0.3, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "L' armée belge emmenée par le roi Albert Ier inflige donc un grave échec à l' armée impériale allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "L' armée belge emmenée par le roi Albert Ier inflige donc un grave échec à l' armée impériale allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "le 4 août 1914", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "l' armée belge emmenée par le roi Albert Ier inflige donc un grave échec à l' armée impériale allemande", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "que l'armée belge emmenée par le roi Albert Ier inflige un grave échec à l'armée impériale allemande.", - "rougeL": 0.7999999999999999 + "rougeL": 0.7999999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -245337,12 +245325,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -245406,33 +245388,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 
1.0 }, "GPT-3.5": { "answer_pred": "Le 12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -245688,48 +245677,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - }, - { - "annot": "annot_2", - "rating": "Correct" - }, - { - "annot": "annot_3", - "rating": "Correct" - }, - { - "annot": "annot_4", - "rating": "Correct" - }, - { - "annot": "annot_5", - "rating": "Correct" - }, - { - "annot": "annot_6", - "rating": "Correct" - }, - { - "annot": "annot_7", - "rating": "Correct" - }, - { - "annot": "annot_8", - "rating": "Correct" - }, - { - "annot": "annot_9", - "rating": "Correct" - }, - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -245852,33 +245799,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "12 décembre 1941", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 12 décembre 1941.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -245918,12 +245872,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -245980,33 +245928,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans une charrette de ses parents", - "rougeL": 0.8 + 
"rougeL": 0.8, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "dans une charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans une charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "dans une charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "dans une charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "dans une charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans une charrette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -246046,12 +246001,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -246103,33 +246052,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "charrette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "charrette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "charrette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "charrette", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans une charrette de ses parents", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 } }, "human_annot": { @@ -246151,12 +246107,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" 
- } - ], "mixtral-8x7b": [ { "annot": "annot_8", @@ -246232,33 +246182,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "13 novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "13 novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "13 novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "13 novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "13 novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "13 novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Le 13 novembre", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -246298,12 +246255,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -246398,33 +246349,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "adjudant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Konrad Krafft von Dellmensingen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Konrad Krafft von Dellmensingen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "adjudant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "adjudant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Konrad Krafft von Dellmensingen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Konrad Krafft von Dellmensingen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ 
-246446,12 +246404,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -246558,33 +246510,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "adjudant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "adjudant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Konrad Krafft von Dellmensingen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "adjudant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "llama-2_lora": { "answer_pred": "adjudant", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Konrad Krafft von Dellmensingen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Konrad Krafft von Dellmensingen", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -246600,12 +246559,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -246688,33 +246641,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "catalyseur", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "catalyseur", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "comme catalyseur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "comme catalyseur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "comme catalyseur", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Verdun agit comme 
catalyseur", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Verdun agit comme catalyseur.", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -246724,12 +246684,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -246828,33 +246782,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Au Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -246882,12 +246843,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -246962,33 +246917,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"mixtral-8x7b": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "le Kaiser", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -247022,12 +246984,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -247090,33 +247046,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Abraham Bloch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Abraham Bloch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Abraham Bloch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Abraham Bloch Abraham Bloch", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Abraham Bloch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Abraham Bloch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Abraham Bloch", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -247144,12 +247107,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -247215,33 +247172,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": 
"Wilson.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Wilson", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -247275,12 +247239,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_5", @@ -247385,33 +247343,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Lord Kitchener", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Smith - Dorrien participe à la bataille de Gennis.", - "rougeL": 0.7142857142857143 + "rougeL": 0.7142857142857143, + "HScore": 1.0 } }, "human_annot": { @@ -247439,12 +247404,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_10", @@ -247549,33 +247508,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Lord Kitchener", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Lord Kitchener", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, 
"Camembert_baseline": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Smith - Dorrien", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -247597,12 +247563,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -247677,33 +247637,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "veille de ses congrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "veille de ses congrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "à la veille de ses congrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "à la veille de ses congrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à la veille de ses congrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à la veille de ses congrès", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "\"On la fait dire des offices religieux à la veille de ses congrès à la fois à l'Église, au Temple et à la Synagogue.\"", - "rougeL": 0.2857142857142857 + "rougeL": 0.2857142857142857, + "HScore": 1.0 } }, "human_annot": { @@ -247719,12 +247686,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -247806,33 +247767,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Baltique", - "rougeL": 0.8 + "rougeL": 0.8, + 
"HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "mer Baltique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "mer Baltique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "mer Baltique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Baltique", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "mer Baltique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "mer Baltique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -247878,12 +247846,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_5", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_5", @@ -247925,42 +247887,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "au travers de l' entrée du golfe de Finlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "au travers de l' entrée du golfe de Finlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "au travers de l' entrée du golfe de Finlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "au travers de l' entrée du golfe de Finlande.", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "au travers de l' entrée du golfe de Finlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "derrière les barrages de mines qui s' étendaient au travers de l' entrée du golfe de Finlande", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "dans la mer Baltique", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": 
"annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -248050,33 +248013,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Zweig", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Zweig", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Zweig", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Zweig", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -248086,12 +248056,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_7", @@ -248172,33 +248136,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Zweig", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Stefan Zweig", - "rougeL": 0.7499999999999999 
+ "rougeL": 0.7499999999999999, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Stefan Zweig.", - "rougeL": 0.7499999999999999 + "rougeL": 0.7499999999999999, + "HScore": 1.0 } }, "human_annot": { @@ -248238,12 +248209,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -248300,33 +248265,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "en arrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "en arrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "en arrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "en arrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "en arrière.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "en arrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en arrière", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -248360,12 +248332,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_8", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_8", @@ -248470,42 +248436,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, 
+ "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Victor", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Victor Goybet.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -248636,33 +248603,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Victor Goybet", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Victor", - "rougeL": 0.5714285714285715 + "rougeL": 0.5714285714285715, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "Victor Goybet.", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -248708,12 +248682,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_4", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_4", @@ -248766,33 +248734,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "armistice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "armistice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "armistice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "armistice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"llama-2_lora": { "answer_pred": "armistice", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "demande d' armistice", - "rougeL": 0.8 + "rougeL": 0.8, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "l'armistice", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 } }, "human_annot": { @@ -248838,12 +248813,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -248894,42 +248863,43 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "À la fin", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fin de l' ouvrage", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fin", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fin", - "rougeL": 0.6666666666666666 + "rougeL": 0.6666666666666666, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à la fin du livre", - "rougeL": 0.5 + "rougeL": 0.5, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "À la fin de l'ouvrage", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à la fin de l'ouvrage", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { - "llama-2-70b": [ - { - "annot": "annot_6", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_6", @@ -249022,33 +248992,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Marc Pourpe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Marc Pourpe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Marc Pourpe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, 
"Camembert_baseline": { "answer_pred": "Marc Pourpe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Marc Pourpe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "Marc Pourpe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Marc Pourpe", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -249082,12 +249059,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_3", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_3", @@ -249150,33 +249121,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "à la fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "à la fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "à la fin du XIXe siècle", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -249186,12 +249164,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_1", @@ -249278,33 +249250,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "dans un pays qui manquait presque tout, après des siècles de domination ottomane", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": 
"Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "dans un pays qui manquait presque tout", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "dans un pays qui manquait presque tout", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { "answer_pred": "dans un pays qui manquait presque tout, après des siècles de domination ottomane", - "rougeL": 0.6153846153846153 + "rougeL": 0.6153846153846153, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -249338,12 +249317,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_1", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_1", @@ -249400,33 +249373,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "Italie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.0 }, "mixtral-8x7b": { "answer_pred": "Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "en Albanie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 1.0 } }, "human_annot": { @@ -249454,12 +249434,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_9", @@ -249528,33 +249502,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + 
"MT5-large": { "answer_pred": "commémorer le 50e anniversaire du génocide arménien en 1965", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "commémorer le 50e anniversaire du génocide arménien en 1965", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "pour commémorer le 50e anniversaire du génocide arménien en 1965", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "commémorer le 50e anniversaire du génocide arménien", - "rougeL": 0.9333333333333333 + "rougeL": 0.9333333333333333, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "50e anniversaire du génocide arménien en 1965", - "rougeL": 0.8571428571428571 + "rougeL": 0.8571428571428571, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "50e anniversaire du génocide arménien", - "rougeL": 0.7692307692307693 + "rougeL": 0.7692307692307693, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "pour commémorer le 50e anniversaire du génocide arménien en 1965", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -249564,12 +249545,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_7", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_7", @@ -249686,33 +249661,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Ernest Jules Joseph Maunoury", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Ernest Jules Joseph Maunoury", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Ernest Jules Joseph Maunoury", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Ernest Jules Joseph Maunoury", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "sergent", - "rougeL": 0.0 + 
"rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "Ernest Jules Joseph Maunoury", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "Ernest Jules Joseph Maunoury", - "rougeL": 0.4 + "rougeL": 0.4, + "HScore": 1.0 } }, "human_annot": { @@ -249752,12 +249734,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10", @@ -249814,33 +249790,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "Camembert_baseline": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "llama-2_lora": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "mixtral-8x7b": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 }, "GPT-3.5": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 1.0 } }, "human_annot": { @@ -249874,12 +249857,6 @@ "rating": "Correct" } ], - "llama-2-70b": [ - { - "annot": "annot_9", - "rating": "Correct" - } - ], "llama-2_lora": [ { "annot": "annot_9", @@ -249937,33 +249914,40 @@ } ], "predictions": { - "MT5-large_260_AP0": { + "MT5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "T5-large_260_AP0": { + "T5-large": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, - "FLAN-T5-large_260_AP0": { + "FLAN-T5-large": { "answer_pred": "italien et russe", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "Camembert_baseline": { "answer_pred": "Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "llama-2_lora": { 
"answer_pred": "Italie", - "rougeL": 0.0 + "rougeL": 0.0, + "HScore": 0.5 }, "mixtral-8x7b": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 }, "GPT-3.5": { "answer_pred": "la Russie", - "rougeL": 1.0 + "rougeL": 1.0, + "HScore": 0.5 } }, "human_annot": { @@ -250003,12 +249987,6 @@ "rating": "Partiellement correct" } ], - "llama-2-70b": [ - { - "annot": "annot_10", - "rating": "Partiellement correct" - } - ], "mixtral-8x7b": [ { "annot": "annot_10",