diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index d3dc5d1dd051446caa427dc3b8bb8115527202ae..903f57d8a437739634f4457143fc225d7297c43e 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -94,9 +94,11 @@ from __future__ import print_function
 
 import argparse
 import io
+import os
 import sys
 import unicodedata
 import unittest
+import math
 
 # CoNLL-U column names
 ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
@@ -157,6 +159,8 @@ def load_conllu(file) :
         self.sentences = []
         # List of UDSpan instances with start&end indices into `words`.
         self.sentences_words = []
+        # Name of the file this representation has been extracted from.
+        self.filename = ""
     class UDSpan :
         def __init__(self, start, end) :
             self.start = start
@@ -189,6 +193,7 @@ def load_conllu(file) :
             self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS
 
     ud = UDRepresentation()
+    ud.filename = file.name
 
     # Load the CoNLL-U file
    index, sentence_start = 0, None
@@ -511,34 +516,60 @@ def load_conllu_file(path) :
 def evaluate_wrapper(args) :
     # Load CoNLL-U files
     gold_ud = load_conllu_file(args.gold_file)
-    system_ud = load_conllu_file(args.system_file)
+    system_files = [load_conllu_file(args.system_file)]
 
     if args.system_file2 is not None :
-        print("TODO")
-        #TODO
+        system_files.append(load_conllu_file(args.system_file2))
 
-    return evaluate(gold_ud, system_ud), [gold_ud, system_ud]
+    return gold_ud, [(system, evaluate(gold_ud, system)) for system in system_files]
 
 ################################################################################
 ################################################################################
 
 def compute_errors(gold_file, system_file, evaluation, metric) :
-    errors = {}
+    # One aligned gold/system word pair that disagrees on the evaluated column.
+    class Error :
+        def __init__(self, gold_file, system_file, gold_word, system_word, metric) :
+            self.gold = gold_word
+            self.pred = system_word
+            self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
+            self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
+            # TODO : do it for other than UPOS
+            self.type = self.gold.columns[UPOS]+"->"+self.pred.columns[UPOS]
+
+    # All errors collected for one metric, grouped by error type.
+    class Errors :
+        def __init__(self, metric) :
+            self.types = []
+            self.nb_errors = 0
+            self.metric = metric
+        def __len__(self) :
+            return self.nb_errors
+        def add(self, error) :
+            self.nb_errors += 1
+            for t in self.types :
+                if t.type == error.type :
+                    t.add(error)
+                    return
+            self.types.append(ErrorType(error.type))
+            self.types[-1].add(error)
+        def sort(self) :
+            self.types.sort(key=len, reverse=True)
+
+    # Bucket of errors sharing the same type (e.g. the same UPOS confusion).
+    class ErrorType :
+        def __init__(self, error_type) :
+            self.type = error_type
+            self.errors = []
+        def __len__(self) :
+            return len(self.errors)
+        def add(self, error) :
+            self.errors.append(error)
+
+    errors = Errors(metric)
     for alignment_word in evaluation[metric][1] :
         gold = alignment_word.gold_word
         pred = alignment_word.system_word
-        error_type = gold.columns[UPOS]+"->"+pred.columns[UPOS]
+        error = Error(gold_file, system_file, gold, pred, metric)
 
-        gold_sentence_start = gold_file.sentences_words[gold.sentence].start
-        gold_sentence_end = gold_file.sentences_words[gold.sentence].end
-        pred_sentence_start = system_file.sentences_words[pred.sentence].start
-        pred_sentence_end = system_file.sentences_words[pred.sentence].end
-
-        error = [gold, pred, gold_file.words[gold_sentence_start:gold_sentence_end],
-                 system_file.words[pred_sentence_start:pred_sentence_end]]
-
-        if error_type not in errors :
-            errors[error_type] = []
-        errors[error_type].append(error)
+        errors.add(error)
 
     return errors
 
 ################################################################################
@@ -556,37 +587,54 @@ def main() :
                         help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
     parser.add_argument("--system_file2",
                         help="Name of another CoNLL-U file with predicted data, for error comparison.")
+    parser.add_argument("--enumerate_errors", "-e", default=None,
+                        help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").")
     args = parser.parse_args()
 
+    errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')
+
     # Evaluate
-    evaluation, files = evaluate_wrapper(args)
-
-    # Compute errors
-    errors = compute_errors(files[0], files[1], evaluation, "UPOS")
-
-    # Print the evaluation
-    if args.counts :
-        print("Metric     | Correct   |      Gold | Predicted | Aligned")
-    else :
-        print("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
-    print("-----------+-----------+-----------+-----------+-----------")
-    for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"] :
+    gold_ud, evaluations = evaluate_wrapper(args)
+
+    for (system_ud, evaluation) in evaluations :
+        fnamelen = len(system_ud.filename)
+        print("*"*math.ceil((80-2-fnamelen)/2),system_ud.filename,"*"*math.floor((80-2-fnamelen)/2))
+        # Compute errors
+        errors_list = [compute_errors(gold_ud, system_ud, evaluation, metric) for metric in errors_metrics]
+
+        # Print the evaluation
         if args.counts :
-            print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
-                metric,
-                evaluation[metric][0].correct,
-                evaluation[metric][0].gold_total,
-                evaluation[metric][0].system_total,
-                evaluation[metric][0].aligned_total or (evaluation[metric][0].correct if metric == "Words" else "")
-            ))
+            print("Metric     | Correct   |      Gold | Predicted | Aligned")
         else :
-            print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
-                metric,
-                100 * evaluation[metric][0].precision,
-                100 * evaluation[metric][0].recall,
-                100 * evaluation[metric][0].f1,
-                "{:10.2f}".format(100 * evaluation[metric][0].aligned_accuracy) if evaluation[metric][0].aligned_accuracy is not None else ""
-            ))
+            print("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
+        print("-----------+-----------+-----------+-----------+-----------")
+        for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"] :
+            if args.counts :
+                print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
+                    metric,
+                    evaluation[metric][0].correct,
+                    evaluation[metric][0].gold_total,
+                    evaluation[metric][0].system_total,
+                    evaluation[metric][0].aligned_total or (evaluation[metric][0].correct if metric == "Words" else "")
+                ))
+            else :
+                print("{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
+                    metric,
+                    100 * evaluation[metric][0].precision,
+                    100 * evaluation[metric][0].recall,
+                    100 * evaluation[metric][0].f1,
+                    "{:10.2f}".format(100 * evaluation[metric][0].aligned_accuracy) if evaluation[metric][0].aligned_accuracy is not None else ""
+                ))
+
+        for errors in errors_list :
+            errors.sort()
+            print("")
+            print("Most frequent errors for metric '{}' :".format(errors.metric))
+            for error_type in errors.types[:10] :
+                t = error_type.type
+                nb = len(error_type)
+                percent = 100.0*nb/len(errors)
+                print("{:5} {:5.2f}% {}".format(nb,
+                                                percent, t))
 
 ################################################################################
 
@@ -595,56 +643,3 @@ if __name__ == "__main__" :
     main()
 
 ################################################################################
-
-################################################################################
-# Tests, which can be executed with `python -m unittest conll18_ud_eval`.
-class TestAlignment(unittest.TestCase) :
-    @staticmethod
-    def _load_words(words) :
-        """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors."""
-        lines, num_words = [], 0
-        for w in words :
-            parts = w.split(" ")
-            if len(parts) == 1 :
-                num_words += 1
-                lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1)))
-            else :
-                lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0]))
-                for part in parts[1:] :
-                    num_words += 1
-                    lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1)))
-        return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"])))
-
-    def _test_exception(self, gold, system) :
-        self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system))
-
-    def _test_ok(self, gold, system, correct) :
-        metrics = evaluate(self._load_words(gold), self._load_words(system))
-        gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold))
-        system_words = sum((max(1, len(word.split(" ")) - 1) for word in system))
-        self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1),
-                         (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words)))
-
-    def test_exception(self) :
-        self._test_exception(["a"], ["b"])
-
-    def test_equal(self) :
-        self._test_ok(["a"], ["a"], 1)
-        self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3)
-
-    def test_equal_with_multiword(self) :
-        self._test_ok(["abc a b c"], ["a", "b", "c"], 3)
-        self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4)
-        self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4)
-        self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5)
-
-    def test_alignment(self) :
-        self._test_ok(["abcd"], ["a", "b", "c", "d"], 0)
-        self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1)
-        self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2)
-        self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2)
-        self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4)
-        self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2)
-        self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)
-################################################################################
-
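Example usage of the new options (file names below are placeholders) :

    python scripts/conll18_ud_eval.py gold.conllu system1.conllu --system_file2 system2.conllu -e UPOS

For each system file this prints a banner with the file name, the usual metric
table, and then, for each metric listed with --enumerate_errors, the ten most
frequent error types (e.g. "NOUN->VERB") with their counts and percentages.
Note that error types are currently derived from UPOS regardless of the
requested metric (see the TODO in compute_errors).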