#!/usr/bin/env python3

# Compatible with Python 2.7 and 3.2+, can be used either as a module
# or a standalone executable.
#
# Copyright 2017, 2018 Institute of Formal and Applied Linguistics (UFAL),
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Authors: Milan Straka, Martin Popel <surname@ufal.mff.cuni.cz>
#
# Changelog:
# - [12 Apr 2018] Version 0.9: Initial release.
# - [19 Apr 2018] Version 1.0: Fix bug in MLAS (duplicate entries in functional_children).
#                              Add --counts option.
# - [02 May 2018] Version 1.1: When removing spaces to match gold and system characters,
#                              consider all Unicode characters of category Zs instead of
#                              just ASCII space.
# - [25 Jun 2018] Version 1.2: Use python3 in the she-bang (instead of python).
#                              In Python2, make the whole computation use `unicode` strings.
#
# Updated by Franck Dary for Macaon
#
# Command line usage
# ------------------
# conll18_ud_eval.py gold_conllu_file system_conllu_file
#
# Metrics printed (as precision, recall, F1 score,
# and in case the metric is computed on aligned words also accuracy on these):
# - Tokens: how well do the gold tokens match system tokens
# - Sentences: how well do the gold sentences match system sentences
# - Words: how well can the gold words be aligned to system words
# - UPOS: using aligned words, how well does UPOS match
# - XPOS: using aligned words, how well does XPOS match
# - UFeats: using aligned words, how well does universal FEATS match
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
# - Lemmas: using aligned words, how well does LEMMA match
# - UAS: using aligned words, how well does HEAD match
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
# - CLAS: using aligned words with content DEPREL, how well does
#       HEAD+DEPREL(ignoring subtypes) match
# - MLAS: using aligned words with content DEPREL, how well does
#       HEAD+DEPREL(ignoring subtypes)+UPOS+UFEATS+FunctionalChildren(DEPREL+UPOS+UFEATS) match
# - BLEX: using aligned words with content DEPREL, how well does
#       HEAD+DEPREL(ignoring subtypes)+LEMMAS match
# - if -c is given, raw counts of correct/gold_total/system_total/aligned words are printed
#   instead of precision/recall/F1/AlignedAccuracy for all metrics.
#
# Note: the evaluate() function in this file does not compute AllTags, CLAS or BLEX;
# a metric is only reported when the columns it needs are present.
#
# API usage
# ---------
# - load_conllu(file)
#   - loads CoNLL-U file from given file object to an internal representation
#   - the file object should return str in both Python 2 and Python 3
#   - raises UDError exception if the given file cannot be loaded
# - evaluate(gold_ud, system_ud, extraColumns)
#   - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
#   - extraColumns is a collection of additional column names to score
#   - raises UDError if the concatenated tokens of gold and system file do not match
#   - returns a dictionary with the metrics described above, each metric being mapped
#     to a list whose first item is a score object with the fields precision,
#     recall, f1 and aligned_accuracy
#
# Description of token matching
# -----------------------------
# In order to match tokens of gold file and system file, we consider the text
# resulting from concatenation of gold tokens and text resulting from
# concatenation of system tokens. These texts should match -- if they do not,
# the evaluation fails.
#
# If the texts do match, every token is represented as a range in this original
# text, and tokens are equal only if their range is the same.
#
# Description of word matching
# ----------------------------
# When matching words of gold file and system file, we first match the tokens.
# The words which are also tokens are matched as tokens, but words in multi-word
# tokens have to be handled differently.
#
# To handle multi-word tokens, we start by finding "multi-word spans".
# Multi-word span is a span in the original text such that
# - it contains at least one multi-word token
# - all multi-word tokens in the span (considering both gold and system ones)
#   are completely inside the span (i.e., they do not "stick out")
# - the multi-word span is as small as possible
#
# For every multi-word span, we align the gold and system words completely
# inside this span using LCS on their FORMs. The words not intersecting
# (even partially) any multi-word span are then aligned as tokens.
from __future__ import division from __future__ import print_function from readMCD import readMCD import argparse import io import os import sys import unicodedata import unittest import math # CoNLL-U column names col2index = {} index2col = {} metric2colname = { "UPOS" : "UPOS", "Lemmas" : "LEMMA", } defaultColumns = { "ID", "FORM", "UPOS", "XPOS", "LEMMA", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC", } # Content and functional relations CONTENT_DEPRELS = { "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative", "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos", "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list", "parataxis", "orphan", "goeswith", "reparandum", "root", "dep" } FUNCTIONAL_DEPRELS = { "aux", "cop", "mark", "det", "clf", "case", "cc" } UNIVERSAL_FEATURES = { "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender", "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood", "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite" } ################################################################################ def is_float(value) : if not isinstance(value, str) : return False try : float(value) return True except ValueError : return False ################################################################################ ################################################################################ def filter_columns(columns) : res = [] cols = [("ID",4), ("FORM",8), ("UPOS",8), ("HEAD",4), ("DEPREL", 8)] contents = [(columns[col2index[col]], max_size) for (col, max_size) in cols if col in col2index] for (content, max_len) in contents : res.append(("{:"+str(max_len)+"}").format(content if len(content) <= max_len else "{}…{}".format(content[0:math.ceil((max_len-1)/2)],content[-((max_len-1)//2):]))) return res ################################################################################ ################################################################################ 
# UD Error is used when raising exceptions in this module class UDError(Exception) : pass ################################################################################ ################################################################################ # Conversion methods handling `str` <-> `unicode` conversions in Python2 def _decode(text) : return text if sys.version_info[0] >= 3 or not isinstance(text, str) else text.decode("utf-8") ################################################################################ ################################################################################ def _encode(text) : return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8") ################################################################################ ################################################################################ # Load given CoNLL-U file into internal representation def load_conllu(file) : global col2index global index2col # Internal representation classes class UDRepresentation : def __init__(self) : # Characters of all the tokens in the whole file. # Whitespace between tokens is not included. self.characters = [] # List of UDSpan instances with start&end indices into `characters`. self.tokens = [] # List of UDWord instances. self.words = [] # List of UDSpan instances with start&end indices into `characters`. self.sentences = [] # List of UDSpan instances with start&end indices into `words`. self.sentences_words = [] # Name of the file this representation has been extracted from. self.filename = "" class UDSpan : def __init__(self, start, end) : self.start = start # Note that self.end marks the first position **after the end** of span, # so we can use characters[start:end] or range(start, end). self.end = end class UDWord : def __init__(self, span, columns, is_multiword) : # Index of the sentence this word is part of, within ud_representation.sentences. 
self.sentence = None # Span of this word (or MWT, see below) within ud_representation.characters. self.span = span # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,... self.columns = columns # is_multiword==True means that this word is part of a multi-word token. # In that case, self.span marks the span of the whole multi-word token. self.is_multiword = is_multiword # Reference to the UDWord instance representing the HEAD (or None if root). self.parent = None # List of references to UDWord instances representing functional-deprel children. self.functional_children = [] # Only consider universal FEATS. # TODO consider all feats if "FEATS" in col2index : self.columns[col2index["FEATS"]] = "|".join(sorted(feat for feat in columns[col2index["FEATS"]].split("|") if feat.split("=", 1)[0] in UNIVERSAL_FEATURES)) if "DEPREL" in col2index : # Let's ignore language-specific deprel subtypes. self.columns[col2index["DEPREL"]] = columns[col2index["DEPREL"]].split(":")[0] # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS self.is_content_deprel = self.columns[col2index["DEPREL"]] in CONTENT_DEPRELS self.is_functional_deprel = self.columns[col2index["DEPREL"]] in FUNCTIONAL_DEPRELS ud = UDRepresentation() ud.filename = file.name # Load the CoNLL-U file index, sentence_start = 0, None id_starts_at_zero = False while True : line = file.readline() if not line : break line = _decode(line.rstrip("\r\n")) # Handle sentence start boundaries if sentence_start is None : # Skip comments if line.startswith("#") : splited = line.split("global.columns =") if len(splited) > 1 : col2index, index2col = readMCD(splited[-1].strip()) continue # Start a new sentence sentence_start = len(ud.words) ud.sentences.append(UDSpan(index, 0)) ud.sentences_words.append(UDSpan(sentence_start, 0)) if not line : # Add parent and children UDWord links and check there are no cycles def process_word(word) : if "HEAD" in col2index : if word.parent == "remapping" : raise UDError("There is a 
cycle in a sentence") if word.parent is None : head = int(word.columns[col2index["HEAD"]]) if head < 0 or head > len(ud.words) - sentence_start : raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[col2index["HEAD"]]))) if head : parent = ud.words[sentence_start + head - 1] word.parent = "remapping" process_word(parent) word.parent = parent for word in ud.words[sentence_start:] : process_word(word) # func_children cannot be assigned within process_word # because it is called recursively and may result in adding one child twice. for word in ud.words[sentence_start:] : if "HEAD" in col2index and word.parent and word.is_functional_deprel : word.parent.functional_children.append(word) # Check there is a single root node if "HEAD" in col2index and len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1 : raise UDError("There are multiple roots in a sentence") # End the sentence ud.sentences[-1].end = index ud.sentences_words[-1].end = len(ud.words) sentence_start = None continue # Read next token/word columns = line.split("\t") # Skip empty nodes if "ID" in col2index and "." in columns[col2index["ID"]] : continue # Delete spaces from FORM, so gold.characters == system.characters # even if one of them tokenizes the space. Use any Unicode character # with category Zs. 
if "FORM" in col2index : columns[col2index["FORM"]] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[col2index["FORM"]])) if not columns[col2index["FORM"]] : raise UDError("There is an empty FORM in the CoNLL-U file") # Save token form_value = columns[col2index["FORM"]] if "FORM" in col2index else "_" ud.characters.extend(form_value) ud.tokens.append(UDSpan(index, index + len(form_value))) index += len(form_value) # Handle multi-word tokens to save word(s) if "ID" in col2index and "-" in columns[col2index["ID"]] : try : start, end = map(int, columns[col2index["ID"]].split("-")) except : raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[col2index["ID"]]))) for _ in range(start, end + 1) : word_line = _decode(file.readline().rstrip("\r\n")) word_columns = word_line.split("\t") ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) ud.words[-1].sentence = len(ud.sentences)-1 # Basic tokens/words else : try : word_id = int(columns[col2index["ID"]]) if "ID" in col2index else "_" if word_id == 0 : id_starts_at_zero = True except : raise UDError("Cannot parse word ID '{}'".format(_encode(columns[col2index["ID"]]))) if word_id != len(ud.words) - sentence_start + (0 if id_starts_at_zero else 1) : raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format( _encode(columns[col2index["ID"]]), _encode(columns[col2index["FORM"]]), len(ud.words) - sentence_start + 1)) try : head_id = int(columns[col2index["HEAD"]]) if "HEAD" in col2index else 0 except : raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[col2index["HEAD"]]))) if head_id < 0 : raise UDError("HEAD cannot be negative") ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False)) ud.words[-1].sentence = len(ud.sentences)-1 if sentence_start is not None : raise UDError("The CoNLL-U file does not end with empty line") return ud ################################################################################ 
################################################################################
# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, extraColumns):
    """Score a system treebank against a gold treebank.

    Args:
        gold_ud: gold representation returned by `load_conllu`.
        system_ud: system representation returned by `load_conllu`.
        extraColumns: collection of column names; every column of `col2index`
            found in it (except "_") gets its own metric, named after the column.

    Returns:
        Dictionary mapping a metric name to a list. The first item is always a
        Score; metrics computed from a key function have a second item, the
        list of aligned word pairs whose keys differ. A metric is only present
        when the columns it needs are in `col2index`.

    Raises:
        UDError: if the concatenated token characters of the two files differ.
    """
    class Score:
        def __init__(self, gold_total, system_total, correct, aligned_total=None, isNumeric=False, R2=None):
            self.correct = correct[0]
            self.gold_total = gold_total
            self.system_total = system_total
            self.aligned_total = aligned_total
            if isNumeric:
                # Numeric metric: the fields are reused. correct[0] is minus the mean
                # absolute error, correct[1] minus the mean squared error and R2 the
                # squared Pearson correlation (see alignment_score).
                self.precision = 0
                self.recall = R2
                self.f1 = correct[1]
                self.aligned_accuracy = correct[0]
            else:
                self.precision = 100*correct[0] / system_total if system_total else 0.0
                self.recall = 100*correct[0] / gold_total if gold_total else 0.0
                self.f1 = 2 * 100*correct[0] / (system_total + gold_total) if system_total + gold_total else 0.0
                self.aligned_accuracy = 100*correct[0] / aligned_total if aligned_total else aligned_total

    class AlignmentWord:
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word

    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            self.matched_words = []
            self.matched_words_map = {}

        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word

    def spans_score(gold_spans, system_spans):
        # Count the spans having the same start and the same end in both lists.
        correct, gi, si = 0, 0, 0
        while gi < len(gold_spans) and si < len(system_spans):
            if system_spans[si].start < gold_spans[gi].start:
                si += 1
            elif gold_spans[gi].start < system_spans[si].start:
                gi += 1
            else:
                correct += gold_spans[gi].end == system_spans[si].end
                si += 1
                gi += 1

        return [Score(len(gold_spans), len(system_spans), [correct])]

    def alignment_score(alignment, key_fn=None, filter_fn=None):
        if filter_fn is not None:
            gold = sum(1 for gold in alignment.gold_words if filter_fn(gold))
            system = sum(1 for system in alignment.system_words if filter_fn(system))
            aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word))
        else:
            gold = len(alignment.gold_words)
            system = len(alignment.system_words)
            aligned = len(alignment.matched_words)

        if key_fn is None:
            # Return score for whole aligned words
            return [Score(gold, system, [aligned])]

        def gold_aligned_gold(word):
            return word

        def gold_aligned_system(word):
            return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None

        # The metric is numeric when every compared pair of keys parses as float.
        isNumericOnly = True
        for words in alignment.matched_words:
            if filter_fn is None or filter_fn(words.gold_word):
                goldItem = key_fn(words.gold_word, gold_aligned_gold)
                systemItem = key_fn(words.system_word, gold_aligned_system)
                if (not is_float(systemItem)) or (not is_float(goldItem)):
                    isNumericOnly = False
                    break

        correct = [0, 0]
        errors = []
        goldValues = []
        predictedValues = []
        for words in alignment.matched_words:
            if filter_fn is None or filter_fn(words.gold_word):
                goldItem = key_fn(words.gold_word, gold_aligned_gold)
                systemItem = key_fn(words.system_word, gold_aligned_system)
                if not isNumericOnly:
                    if goldItem == systemItem:
                        correct[0] += 1
                    else:
                        errors.append(words)
                else:
                    correct[0] -= abs(float(goldItem) - float(systemItem))**1
                    correct[1] -= abs(float(goldItem) - float(systemItem))**2
                    goldValues.append(float(goldItem))
                    predictedValues.append(float(systemItem))

        R2 = 0.0
        if isNumericOnly and len(goldValues) > 0:
            correct[0] /= len(goldValues)
            correct[1] /= len(goldValues)
            goldMean = sum(goldValues) / len(goldValues)
            predMean = sum(predictedValues) / len(predictedValues)
            numerator = 0.0
            denom1 = 0.0
            denom2 = 0.0
            for i in range(len(predictedValues)):
                numerator += (predictedValues[i]-predMean)*(goldValues[i]-goldMean)
                denom1 += (predictedValues[i]-predMean)**2
                denom2 += (goldValues[i]-goldMean)**2

            pearson = 0.0
            if denom1 > 0.0 and denom2 > 0.0:
                pearson = numerator/((denom1**0.5)*(denom2**0.5))
            R2 = pearson**2
        return [Score(gold, system, correct, aligned, isNumeric=isNumericOnly, R2=R2), errors]

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        if words[i].is_multiword:
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        if word.is_multiword and word.span.end > multiword_span_end:
            return word.span.end
        return multiword_span_end

    def find_multiword_span(gold_words, system_words, gi, si):
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword:
            multiword_span_end = gold_words[gi].span.end
            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start:
                si += 1
        else:  # if system_words[si].is_multiword
            multiword_span_end = system_words[si].span.end
            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start:
                gi += 1
        gs, ss = gi, si

        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or \
                not beyond_end(system_words, si, multiword_span_end):
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start):
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else:
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        # lcs[g][s] is the length of the longest common subsequence of lower-cased
        # FORMs of gold words from gs+g and system words from ss+s, within the span.
        lcs = [[0] * (si - ss) for i in range(gi - gs)]
        for g in reversed(range(gi - gs)):
            for s in reversed(range(si - ss)):
                if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower():
                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if gold_words[gi].is_multiword or system_words[si].is_multiword:
                # A: Multi-word tokens => align via LCS within the whole "multiword span".
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower():
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0):
                            g += 1
                        else:
                            s += 1
            else:
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    gi += 1
                else:
                    si += 1

        return alignment

    # Check that the underlying character sequences do match.
    if gold_ud.characters != system_ud.characters:
        index = 0
        while index < len(gold_ud.characters) and index < len(system_ud.characters) and \
                gold_ud.characters[index] == system_ud.characters[index]:
            index += 1

        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(map(_encode, gold_ud.characters[index:index + 20])),
                "".join(map(_encode, system_ud.characters[index:index + 20]))
            )
        )

    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute the F1-scores
    result = {}
    if "FORM" in col2index:
        result["Tokens"] = spans_score(gold_ud.tokens, system_ud.tokens)
        result["Words"] = alignment_score(alignment)
    if "UPOS" in col2index:
        result["UPOS"] = alignment_score(alignment, lambda w, _: w.columns[col2index["UPOS"]])
    if "XPOS" in col2index:
        result["XPOS"] = alignment_score(alignment, lambda w, _: w.columns[col2index["XPOS"]])
    if "FEATS" in col2index:
        result["UFeats"] = alignment_score(alignment, lambda w, _: w.columns[col2index["FEATS"]])
    if "LEMMA" in col2index:
        result["Lemmas"] = alignment_score(alignment, lambda w, ga: w.columns[col2index["LEMMA"]] if ga(w).columns[col2index["LEMMA"]] != "_" else "_")
    if "HEAD" in col2index:
        result["UAS"] = alignment_score(alignment, lambda w, ga: ga(w.parent))
    if "DEPREL" in col2index:
        result["LAS"] = alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[col2index["DEPREL"]]))
    if "DEPREL" in col2index and "UPOS" in col2index and "FEATS" in col2index:
        result["MLAS"] = alignment_score(
            alignment,
            lambda w, ga: (ga(w.parent), w.columns[col2index["DEPREL"]], w.columns[col2index["UPOS"]], w.columns[col2index["FEATS"]],
                           [(ga(c), c.columns[col2index["DEPREL"]], c.columns[col2index["UPOS"]], c.columns[col2index["FEATS"]])
                            for c in w.functional_children]),
            filter_fn=lambda w: w.is_content_deprel)
    if "ID" in col2index:
        result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences)

    for colName in col2index:
        if colName in extraColumns and colName != "_":
            # The lambda is called before the next iteration rebinds colName.
            result[colName] = alignment_score(alignment, lambda w, _: w.columns[col2index[colName]])

    return result
################################################################################

################################################################################
def load_conllu_file(path):
    """Open the UTF-8 CoNLL-U file at `path`, load it with `load_conllu` and close it."""
    open_options = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    # The file is fully read by load_conllu, so it can be closed right after.
    with open(path, mode="r", **open_options) as _file:
        return load_conllu(_file)
################################################################################

################################################################################
def evaluate_wrapper(args):
    """Load the files named in `args` and evaluate each system file.

    Args:
        args: parsed arguments providing `gold_file`, `system_file`,
            `system_file2` (may be None) and `extra` (comma separated names).

    Returns:
        Tuple `(gold_ud, evaluations)`, `evaluations` being a list of
        `(system_ud, evaluation)` pairs, one per system file.
    """
    # Load CoNLL-U files
    gold_ud = load_conllu_file(args.gold_file)
    system_files = [load_conllu_file(args.system_file)]

    if args.system_file2 is not None:
        system_files.append(load_conllu_file(args.system_file2))

    extra_columns = set(args.extra.split(','))
    return gold_ud, [(system, evaluate(gold_ud, system, extra_columns)) for system in system_files]
################################################################################

################################################################################
class Error:
    """One aligned word pair whose gold and predicted values differ for a metric."""

    def __init__(self, gold_file, system_file, gold_word, system_word, metric):
        self.gold = gold_word
        self.pred = system_word
        gold_span = gold_file.sentences_words[self.gold.sentence]
        pred_span = system_file.sentences_words[self.pred.sentence]
        # Words of the sentences containing the gold word and the predicted word.
        self.gold_sentence = gold_file.words[gold_span.start:gold_span.end]
        self.pred_sentence = system_file.words[pred_span.start:pred_span.end]
        # Metrics without an entry in metric2colname (such as the extra column
        # metrics, named after their column) use the metric name as column name.
        column = col2index[metric2colname.get(metric, metric)]
        self.type = self.gold.columns[column]+"->"+self.pred.columns[column]

    def __str__(self):
        # Gold and predicted sentences side by side, ">" marking the erroneous word.
        result = []
        gold_lines = []
        pred_lines = []
        for word in self.gold_sentence:
            gold_lines.append((">" if word == self.gold else " ") + " ".join(filter_columns(word.columns)))
        for word in self.pred_sentence:
            pred_lines.append((">" if word == self.pred else " ") + " ".join(filter_columns(word.columns)))

        for index in range(max(len(gold_lines), len(pred_lines))):
            # The shorter sentence is padded with blanks of the other side's width.
            result.append("{} | {}".format(
                gold_lines[index] if index < len(gold_lines) else " "*len(pred_lines[index]),
                pred_lines[index] if index < len(pred_lines) else " "*len(gold_lines[index])))
        return "\n".join(result)


class Errors:
    """Errors of one metric, grouped by error type ("gold->predicted").

    When both `errors1` and `errors2` are given, the new object holds the
    errors of `errors1` that `errors2` does not have.
    """

    def __init__(self, metric, errors1=None, errors2=None):
        self.types = []
        self.nb_errors = 0
        self.metric = metric
        if errors1 is not None and errors2 is not None:
            for error_type in errors1.types:
                for error in error_type.errors:
                    if not errors2.has(error):
                        self.add(error)

    def __len__(self):
        return self.nb_errors

    def add(self, error):
        self.nb_errors += 1
        for t in self.types:
            if t.type == error.type:
                t.add(error)
                return
        self.types.append(ErrorType(error.type))
        self.types[-1].add(error)

    def has(self, error):
        for t in self.types:
            if t.type == error.type:
                return t.has(error)
        # No error of this type at all.
        return False

    def sort(self):
        # Most frequent error types first.
        self.types.sort(key=len, reverse=True)


class ErrorType:
    """All the errors sharing the same "gold->predicted" type."""

    def __init__(self, error_type):
        self.type = error_type
        self.errors = []

    def __len__(self):
        return len(self.errors)

    def add(self, error):
        self.errors.append(error)

    def has(self, error):
        # Two errors are the same when they concern the same gold word.
        for other_error in self.errors:
            if other_error.gold == error.gold:
                return True
        return False
################################################################################

################################################################################
def compute_errors(gold_file, system_file, evaluation, metric):
    """Build the Errors of `metric` from the mismatches stored in `evaluation`."""
    errors = Errors(metric)
    for alignment_word in evaluation[metric][1]:
        gold = alignment_word.gold_word
        pred = alignment_word.system_word
        error = Error(gold_file, system_file, gold, pred, metric)

        errors.add(error)

    return errors
################################################################################

################################################################################
def main():
    """Command line entry point: evaluate, then print scores and error listings."""
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--counts", "-c", default=False, action="store_true",
                        help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
    parser.add_argument("--system_file2",
                        help="Name of another CoNLL-U file with predicted data, for error comparison.")
    parser.add_argument("--enumerate_errors", "-e", default=None,
                        help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").")
    parser.add_argument("--extra", "-x", default="",
                        help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").")
    args = parser.parse_args()

    errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')

    global col2index
    global index2col
    # Default column layout, used unless a file declares its own.
    col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")

    # Evaluate
    gold_ud, evaluations = evaluate_wrapper(args)
    errors_by_file = []
    examples_list = []

    for id1, (system_ud, evaluation) in enumerate(evaluations):
        fnamelen = len(system_ud.filename)
        print("*"*math.ceil((80-2-fnamelen)/2), system_ud.filename, "*"*math.floor((80-2-fnamelen)/2))
        # Compute errors
        errors_list = [compute_errors(gold_ud, system_ud, evaluation, metric) for metric in errors_metrics]
        errors_by_file.append(errors_list)

        maxColNameSize = 1 + max([len(colName) for colName in evaluation])

        # Print the evaluation
        if args.counts:
            print("{:^{}}| Correct | Gold | Predicted | Aligned".format("Metric", maxColNameSize))
        else:
            print("{:^{}}| Precision | Recall | F1 Score | AligndAcc".format("Metric", maxColNameSize))
        print("{}+-----------+-----------+-----------+-----------".format("-"*maxColNameSize))
        for metric in evaluation:
            score = evaluation[metric][0]
            if args.counts:
                print("{:{}}|{:10} |{:10} |{:10} |{:10}".format(
                    metric,
                    maxColNameSize,
                    score.correct,
                    score.gold_total,
                    score.system_total,
                    score.aligned_total or (score.correct if metric == "Words" else "")
                ))
            else:
                # Values within [-1, 1] are printed with more decimals.
                precision = ("{:10.2f}" if abs(score.precision) > 1.0 else "{:10.4f}").format(score.precision)
                recall = ("{:10.2f}" if abs(score.recall) > 1.0 else "{:10.4f}").format(score.recall)
                f1 = ("{:10.2f}" if abs(score.f1) > 1.0 else "{:10.4f}").format(score.f1)
                print("{:{}}|{} |{} |{} |{}".format(
                    metric,
                    maxColNameSize,
                    precision,
                    recall,
                    f1,
                    "{:10.2f}".format(score.aligned_accuracy) if score.aligned_accuracy is not None else ""
                ))

        for id2, errors in enumerate(errors_list):
            errors.sort()
            print("Most frequent errors for metric '{}' :".format(errors.metric))
            print("{:>12} {:>5} {:>6} {}\n {:->37}".format("ID", "NB", "%AGE", "GOLD->SYSTEM", ""))

            print("{:>12} {:5} {:6.2f}%".format("Total", len(errors), 100))
            for id3, error_type in enumerate(errors.types[:10]):
                nb = len(error_type)
                percent = 100.0*nb/len(errors)
                error_id = ":".join(map(str, [id1, id2, id3, "*"]))
                print("{:>12} {:5} {:6.2f}% {}".format(error_id, nb, percent, error_type.type))
                for id4, error in enumerate(error_type.errors):
                    examples_list.append((":".join(map(str, [id1, id2, id3, id4])), error))
            print("")

    # Pairwise comparison: errors made on one system file and not on the other.
    for id1, (system1_ud, _) in enumerate(evaluations):
        for id2, (system2_ud, _) in enumerate(evaluations):
            if id1 == id2:
                continue
            errors1 = errors_by_file[id1]
            errors2 = errors_by_file[id2]

            if len(errors1) > 0:
                print("{} Error comparison {}".format("*"*31, "*"*31))
                print("{:>30} : {}".format("These errors are present in", system1_ud.filename))
                print("{:>30} : {}".format("and not in", system2_ud.filename))
            for id3 in range(len(errors1)):
                metric = errors1[id3].metric
                errors_diff = Errors(metric, errors1[id3], errors2[id3])
                errors_diff.sort()
                print("{:>12} {:5} {:6.2f}%".format("Total", len(errors_diff), 100))
                for id4, error_type in enumerate(errors_diff.types[:10]):
                    nb = len(error_type)
                    # Share of this type among the errors of the difference itself.
                    percent = 100.0*nb/len(errors_diff)
                    error_id = ":".join(map(str, ["d"+str(id1), id3, id4, "*"]))
                    print("{:>12} {:5} {:6.2f}% {}".format(error_id, nb, percent, error_type.type))
                    for id5, error in enumerate(error_type.errors):
                        examples_list.append((":".join(map(str, ["d"+str(id1), id3, id4, id5])), error))
                print("")

    if len(examples_list) > 0:
        print("{}List of all errors by their ID{}".format("*"*25, "*"*25))
        print("{}{:^30}{}\n".format("*"*25, "Format is GOLD | PREDICTED", "*"*25))

    for (error_id, error) in examples_list:
        print("ID="+error_id)
        print(error)
        print("")
################################################################################

################################################################################
# Script entry point: the command line evaluation only runs when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
################################################################################