# NOTE(review): the three lines that originally stood here ("Select Git revision",
# "__init__.py", "conll18_ud_eval.py 34.59 KiB") were repository-browser UI
# residue accidentally captured with the source; commented out so the file parses.
#!/usr/bin/env python3
# Compatible with Python 2.7 and 3.2+, can be used either as a module
# or a standalone executable.
#
# Copyright 2017, 2018 Institute of Formal and Applied Linguistics (UFAL),
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Authors: Milan Straka, Martin Popel <surname@ufal.mff.cuni.cz>
#
# Changelog:
# - [12 Apr 2018] Version 0.9: Initial release.
# - [19 Apr 2018] Version 1.0: Fix bug in MLAS (duplicate entries in functional_children).
# Add --counts option.
# - [02 May 2018] Version 1.1: When removing spaces to match gold and system characters,
# consider all Unicode characters of category Zs instead of
# just ASCII space.
# - [25 Jun 2018] Version 1.2: Use python3 in the she-bang (instead of python).
# In Python2, make the whole computation use `unicode` strings.
#
# Updated by Franck Dary for Macaon
# Command line usage
# ------------------
# conll18_ud_eval.py gold_conllu_file system_conllu_file
#
# Metrics printed (as precision, recall, F1 score,
# and in case the metric is computed on aligned words also accuracy on these):
# - Tokens: how well do the gold tokens match system tokens
# - Sentences: how well do the gold sentences match system sentences
# - Words: how well can the gold words be aligned to system words
# - UPOS: using aligned words, how well does UPOS match
# - XPOS: using aligned words, how well does XPOS match
# - UFeats: using aligned words, how well does universal FEATS match
# - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
# - Lemmas: using aligned words, how well does LEMMA match
# - UAS: using aligned words, how well does HEAD match
# - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
# - CLAS: using aligned words with content DEPREL, how well does
# HEAD+DEPREL(ignoring subtypes) match
# - MLAS: using aligned words with content DEPREL, how well does
# HEAD+DEPREL(ignoring subtypes)+UPOS+UFEATS+FunctionalChildren(DEPREL+UPOS+UFEATS) match
# - BLEX: using aligned words with content DEPREL, how well does
# HEAD+DEPREL(ignoring subtypes)+LEMMAS match
# - if -c is given, raw counts of correct/gold_total/system_total/aligned words are printed
# instead of precision/recall/F1/AlignedAccuracy for all metrics.
# API usage
# ---------
# - load_conllu(file)
# - loads CoNLL-U file from given file object to an internal representation
# - the file object should return str in both Python 2 and Python 3
# - raises UDError exception if the given file cannot be loaded
# - evaluate(gold_ud, system_ud)
# - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
# - raises UDError if the concatenated tokens of gold and system file do not match
# - returns a dictionary with the metrics described above, each metric having
# three fields: precision, recall and f1
# Description of token matching
# -----------------------------
# In order to match tokens of gold file and system file, we consider the text
# resulting from concatenation of gold tokens and text resulting from
# concatenation of system tokens. These texts should match -- if they do not,
# the evaluation fails.
#
# If the texts do match, every token is represented as a range in this original
# text, and tokens are equal only if their range is the same.
# Description of word matching
# ----------------------------
# When matching words of gold file and system file, we first match the tokens.
# The words which are also tokens are matched as tokens, but words in multi-word
# tokens have to be handled differently.
#
# To handle multi-word tokens, we start by finding "multi-word spans".
# Multi-word span is a span in the original text such that
# - it contains at least one multi-word token
# - all multi-word tokens in the span (considering both gold and system ones)
# are completely inside the span (i.e., they do not "stick out")
# - the multi-word span is as small as possible
#
# For every multi-word span, we align the gold and system words completely
# inside this span using LCS on their FORMs. The words not intersecting
# (even partially) any multi-word span are then aligned as tokens.
from __future__ import division
from __future__ import print_function
from readMCD import readMCD
import argparse
import io
import os
import sys
import unicodedata
import unittest
import math
# CoNLL-U column names
# Mapping from column name (e.g. "FORM") to its index within a CoNLL-U line,
# and the inverse mapping; both are (re)filled by readMCD() inside load_conllu().
col2index = {}
index2col = {}
# Maps a metric name (as reported in evaluation results) to the CoNLL-U column
# it is computed from; used to label error types in the Error class.
metric2colname = {
    "UPOS" : "UPOS",
    "Lemmas" : "LEMMA",
}
# The ten standard CoNLL-U columns.
defaultColumns = {
    "ID",
    "FORM",
    "UPOS",
    "XPOS",
    "LEMMA",
    "FEATS",
    "HEAD",
    "DEPREL",
    "DEPS",
    "MISC",
}
# Content and functional relations
CONTENT_DEPRELS = {
    "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
    "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos",
    "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list",
    "parataxis", "orphan", "goeswith", "reparandum", "root", "dep"
}
FUNCTIONAL_DEPRELS = {
    "aux", "cop", "mark", "det", "clf", "case", "cc"
}
# Universal (language-independent) morphological features; FEATS entries
# outside this set are discarded by UDWord.__init__.
UNIVERSAL_FEATURES = {
    "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender",
    "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood",
    "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite"
}
################################################################################
def is_float(value) :
    """Return True when *value* is a str parseable as a float, False otherwise.

    Non-string inputs (including actual numbers) are rejected: this helper is
    used to decide whether CoNLL-U column *strings* hold numeric data.
    """
    if not isinstance(value, str) :
        return False
    try :
        float(value)
    except ValueError :
        return False
    return True
################################################################################
################################################################################
def filter_columns(columns) :
    """Render the most relevant CoNLL-U columns of *columns* for display.

    Keeps ID, FORM, UPOS, HEAD and DEPREL (those present in col2index), each
    padded to a fixed width; values longer than their width are shortened
    around a '…' so the result still fits exactly.
    """
    widths = (("ID", 4), ("FORM", 8), ("UPOS", 8), ("HEAD", 4), ("DEPREL", 8))
    rendered = []
    for name, width in widths :
        if name not in col2index :
            continue
        value = columns[col2index[name]]
        if len(value) > width :
            # Keep the head and tail of the value on each side of the ellipsis.
            head = value[0:math.ceil((width-1)/2)]
            tail = value[-((width-1)//2):]
            value = "{}…{}".format(head, tail)
        rendered.append(("{:"+str(width)+"}").format(value))
    return rendered
################################################################################
################################################################################
# UD Error is used when raising exceptions in this module
class UDError(Exception) :
    """Raised for malformed CoNLL-U input or gold/system mismatches."""
    pass
################################################################################
################################################################################
# Conversion methods handling `str` <-> `unicode` conversions in Python2
def _decode(text) :
return text if sys.version_info[0] >= 3 or not isinstance(text, str) else text.decode("utf-8")
################################################################################
################################################################################
def _encode(text) :
return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8")
################################################################################
################################################################################
# Load given CoNLL-U file into internal representation
def load_conllu(file) :
    """Parse an open CoNLL-U file object into a UDRepresentation.

    Side effect: refills the module-level col2index/index2col mappings with
    the default column layout, possibly overridden by a
    "# global.columns = ..." comment line inside the file.
    Raises UDError on malformed input.
    """
    global col2index
    global index2col
    # Internal representation classes
    class UDRepresentation :
        def __init__(self) :
            # Characters of all the tokens in the whole file.
            # Whitespace between tokens is not included.
            self.characters = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.tokens = []
            # List of UDWord instances.
            self.words = []
            # List of UDSpan instances with start&end indices into `characters`.
            self.sentences = []
            # List of UDSpan instances with start&end indices into `words`.
            self.sentences_words = []
            # Name of the file this representation has been extracted from.
            self.filename = ""
    class UDSpan :
        def __init__(self, start, end) :
            self.start = start
            # Note that self.end marks the first position **after the end** of span,
            # so we can use characters[start:end] or range(start, end).
            self.end = end
    class UDWord :
        def __init__(self, span, columns, is_multiword) :
            # Index of the sentence this word is part of, within ud_representation.sentences.
            self.sentence = None
            # Span of this word (or MWT, see below) within ud_representation.characters.
            self.span = span
            # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
            self.columns = columns
            # is_multiword==True means that this word is part of a multi-word token.
            # In that case, self.span marks the span of the whole multi-word token.
            self.is_multiword = is_multiword
            # Reference to the UDWord instance representing the HEAD (or None if root).
            self.parent = None
            # List of references to UDWord instances representing functional-deprel children.
            self.functional_children = []
            # Only consider universal FEATS.
            # TODO consider all feats
            if "FEATS" in col2index :
                self.columns[col2index["FEATS"]] = "|".join(sorted(feat for feat in columns[col2index["FEATS"]].split("|")
                                                                   if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
            if "DEPREL" in col2index :
                # Let's ignore language-specific deprel subtypes.
                self.columns[col2index["DEPREL"]] = columns[col2index["DEPREL"]].split(":")[0]
                # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS
                self.is_content_deprel = self.columns[col2index["DEPREL"]] in CONTENT_DEPRELS
                self.is_functional_deprel = self.columns[col2index["DEPREL"]] in FUNCTIONAL_DEPRELS
    ud = UDRepresentation()
    ud.filename = file.name
    # Default CoNLL-U column layout; may be replaced by a global.columns comment.
    col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
    # Load the CoNLL-U file
    index, sentence_start = 0, None
    # Set when a sentence numbers its words from 0 instead of the usual 1.
    id_starts_at_zero = False
    while True :
        line = file.readline()
        if not line :
            break
        line = _decode(line.rstrip("\r\n"))
        # Handle sentence start boundaries
        if sentence_start is None :
            # Skip comments
            if line.startswith("#") :
                # A "# global.columns = ..." comment redefines the column layout.
                splited = line.split("global.columns =")
                if len(splited) > 1 :
                    col2index, index2col = readMCD(splited[-1].strip())
                continue
            # Start a new sentence
            sentence_start = len(ud.words)
            # End offsets (0 here) are filled in when the sentence is closed below.
            ud.sentences.append(UDSpan(index, 0))
            ud.sentences_words.append(UDSpan(sentence_start, 0))
        if not line :
            # Empty line terminates the current sentence.
            # Add parent and children UDWord links and check there are no cycles
            def process_word(word) :
                if "HEAD" in col2index :
                    # "remapping" is a sentinel marking a word currently being resolved.
                    if word.parent == "remapping" :
                        raise UDError("There is a cycle in a sentence")
                    if word.parent is None :
                        head = int(word.columns[col2index["HEAD"]])
                        if head < 0 or head > len(ud.words) - sentence_start :
                            raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[col2index["HEAD"]])))
                        if head :
                            parent = ud.words[sentence_start + head - 1]
                            word.parent = "remapping"
                            process_word(parent)
                            word.parent = parent
            for word in ud.words[sentence_start:] :
                process_word(word)
            # func_children cannot be assigned within process_word
            # because it is called recursively and may result in adding one child twice.
            for word in ud.words[sentence_start:] :
                if "HEAD" in col2index and word.parent and word.is_functional_deprel :
                    word.parent.functional_children.append(word)
            # Check there is a single root node
            if "HEAD" in col2index and len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1 :
                raise UDError("There are multiple roots in a sentence")
            # End the sentence
            ud.sentences[-1].end = index
            ud.sentences_words[-1].end = len(ud.words)
            sentence_start = None
            continue
        # Read next token/word
        columns = line.split("\t")
        # Skip empty nodes
        if "ID" in col2index and "." in columns[col2index["ID"]] :
            continue
        # Delete spaces from FORM, so gold.characters == system.characters
        # even if one of them tokenizes the space. Use any Unicode character
        # with category Zs.
        if "FORM" in col2index :
            columns[col2index["FORM"]] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[col2index["FORM"]]))
            if not columns[col2index["FORM"]] :
                raise UDError("There is an empty FORM in the CoNLL-U file")
        # Save token
        form_value = columns[col2index["FORM"]] if "FORM" in col2index else "_"
        ud.characters.extend(form_value)
        ud.tokens.append(UDSpan(index, index + len(form_value)))
        index += len(form_value)
        # Handle multi-word tokens to save word(s)
        if "ID" in col2index and "-" in columns[col2index["ID"]] :
            try :
                start, end = map(int, columns[col2index["ID"]].split("-"))
            except :
                raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[col2index["ID"]])))
            # Every word of the range shares the span of the whole multi-word token.
            for _ in range(start, end + 1) :
                word_line = _decode(file.readline().rstrip("\r\n"))
                word_columns = word_line.split("\t")
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
                ud.words[-1].sentence = len(ud.sentences)-1
        # Basic tokens/words
        else :
            try :
                word_id = int(columns[col2index["ID"]]) if "ID" in col2index else "_"
                if word_id == 0 :
                    id_starts_at_zero = True
            except :
                raise UDError("Cannot parse word ID '{}'".format(_encode(columns[col2index["ID"]])))
            # Word IDs must be consecutive within a sentence (starting at 0 or 1).
            if word_id != len(ud.words) - sentence_start + (0 if id_starts_at_zero else 1) :
                raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(
                    _encode(columns[col2index["ID"]]), _encode(columns[col2index["FORM"]]), len(ud.words) - sentence_start + 1))
            try :
                head_id = int(columns[col2index["HEAD"]]) if "HEAD" in col2index else 0
            except :
                raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[col2index["HEAD"]])))
            if head_id < 0 :
                raise UDError("HEAD cannot be negative")
            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
            ud.words[-1].sentence = len(ud.sentences)-1
    if sentence_start is not None :
        raise UDError("The CoNLL-U file does not end with empty line")
    return ud
################################################################################
################################################################################
# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, extraColumns) :
    """Compare *system_ud* against *gold_ud* and return a dict of results.

    Keys are metric names ("Tokens", "Words", "UPOS", ..., plus every column
    of *extraColumns* present in col2index); each value is a list whose first
    element is a Score, and — for per-word metrics — whose second element is
    the list of mismatching aligned word pairs.
    Raises UDError when the concatenated token characters of the two files differ.
    """
    class Score :
        def __init__(self, gold_total, system_total, correct, aligned_total=None, isNumeric=False, R2=None) :
            # *correct* is a 2-element list; correct[1] is only meaningful for
            # numeric columns (see alignment_score below).
            self.correct = correct[0]
            self.gold_total = gold_total
            self.system_total = system_total
            self.aligned_total = aligned_total
            if isNumeric :
                # For numeric columns the fields are reused as regression
                # metrics: recall=R2, f1=-MSE, aligned_accuracy=-MAE.
                self.precision = 0
                self.recall = R2
                self.f1 = correct[1]
                self.aligned_accuracy = correct[0]
            else :
                self.precision = 100*correct[0] / system_total if system_total else 0.0
                self.recall = 100*correct[0] / gold_total if gold_total else 0.0
                self.f1 = 2 * 100*correct[0] / (system_total + gold_total) if system_total + gold_total else 0.0
                self.aligned_accuracy = 100*correct[0] / aligned_total if aligned_total else aligned_total
    class AlignmentWord :
        # One aligned (gold word, system word) pair.
        def __init__(self, gold_word, system_word) :
            self.gold_word = gold_word
            self.system_word = system_word
    class Alignment :
        # The full gold/system word alignment built by align_words().
        def __init__(self, gold_words, system_words) :
            self.gold_words = gold_words
            self.system_words = system_words
            self.matched_words = []
            # Maps a system UDWord to its aligned gold UDWord.
            self.matched_words_map = {}
        def append_aligned_words(self, gold_word, system_word) :
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word
    def spans_score(gold_spans, system_spans) :
        # Score span agreement (tokens or sentences): a span is correct when
        # both its start and end match; spans are iterated in parallel by start.
        correct, gi, si = 0, 0, 0
        while gi < len(gold_spans) and si < len(system_spans) :
            if system_spans[si].start < gold_spans[gi].start :
                si += 1
            elif gold_spans[gi].start < system_spans[si].start :
                gi += 1
            else :
                correct += gold_spans[gi].end == system_spans[si].end
                si += 1
                gi += 1
        return [Score(len(gold_spans), len(system_spans), [correct])]
    def alignment_score(alignment, key_fn=None, filter_fn=None) :
        # Score aligned words. key_fn(word, gold_aligned_fn) extracts the value
        # compared between gold and system; filter_fn restricts which gold
        # words are counted (e.g. content deprels only).
        if filter_fn is not None :
            gold = sum(1 for gold in alignment.gold_words if filter_fn(gold))
            system = sum(1 for system in alignment.system_words if filter_fn(system))
            aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word))
        else :
            gold = len(alignment.gold_words)
            system = len(alignment.system_words)
            aligned = len(alignment.matched_words)
        if key_fn is None :
            # Return score for whole aligned words
            return [Score(gold, system, [aligned])]
        def gold_aligned_gold(word) :
            return word
        def gold_aligned_system(word) :
            # Map a system word to its aligned gold word (or a sentinel when unaligned).
            return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None
        # First pass: if every compared value parses as a float, switch to
        # regression scoring (MAE/MSE/R2) instead of exact-match accuracy.
        isNumericOnly = True
        for words in alignment.matched_words :
            if filter_fn is None or filter_fn(words.gold_word) :
                goldItem = key_fn(words.gold_word, gold_aligned_gold)
                systemItem = key_fn(words.system_word, gold_aligned_system)
                if (not is_float(systemItem)) or (not is_float(goldItem)) :
                    isNumericOnly = False
                    break
        correct = [0,0]
        errors = []
        goldValues = []
        predictedValues = []
        for words in alignment.matched_words :
            if filter_fn is None or filter_fn(words.gold_word) :
                goldItem = key_fn(words.gold_word, gold_aligned_gold)
                systemItem = key_fn(words.system_word, gold_aligned_system)
                if not isNumericOnly :
                    if goldItem == systemItem :
                        correct[0] += 1
                    else :
                        errors.append(words)
                else :
                    # Accumulate negated absolute and squared differences;
                    # averaged below, they become -MAE and -MSE.
                    correct[0] -= abs(float(goldItem) - float(systemItem))**1
                    correct[1] -= abs(float(goldItem) - float(systemItem))**2
                    goldValues.append(float(goldItem))
                    predictedValues.append(float(systemItem))
        R2 = 0.0
        if isNumericOnly and len(goldValues) > 0 :
            correct[0] /= len(goldValues)
            correct[1] /= len(goldValues)
            # R2 is computed as the squared Pearson correlation between
            # predicted and gold values.
            goldMean = sum(goldValues) / len(goldValues)
            predMean = sum(predictedValues) / len(predictedValues)
            numerator = 0.0
            denom1 = 0.0
            denom2 = 0.0
            for i in range(len(predictedValues)) :
                numerator += (predictedValues[i]-predMean)*(goldValues[i]-goldMean)
                denom1 += (predictedValues[i]-predMean)**2
                denom2 += (goldValues[i]-goldMean)**2
            pearson = 0.0
            if denom1 > 0.0 and denom2 > 0.0 :
                pearson = numerator/((denom1**0.5)*(denom2**0.5))
            R2 = pearson**2
        return [Score(gold, system, correct, aligned, isNumeric=isNumericOnly, R2=R2), errors]
    def beyond_end(words, i, multiword_span_end) :
        # True when words[i] lies entirely past the current multiword span.
        if i >= len(words) :
            return True
        if words[i].is_multiword :
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end
    def extend_end(word, multiword_span_end) :
        # Grow the span end to cover a multiword token sticking out of it.
        if word.is_multiword and word.span.end > multiword_span_end :
            return word.span.end
        return multiword_span_end
    def find_multiword_span(gold_words, system_words, gi, si) :
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword :
            multiword_span_end = gold_words[gi].span.end
            if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start :
                si += 1
        else : # if system_words[si].is_multiword
            multiword_span_end = system_words[si].span.end
            if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start :
                gi += 1
        gs, ss = gi, si
        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or \
              not beyond_end(system_words, si, multiword_span_end) :
            if gi < len(gold_words) and (si >= len(system_words) or
                                         gold_words[gi].span.start <= system_words[si].span.start) :
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else :
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si
    def compute_lcs(gold_words, system_words, gi, si, gs, ss) :
        # Longest-common-subsequence table over case-insensitive FORMs of the
        # words in the multiword span gold[gs:gi] x system[ss:si].
        lcs = [[0] * (si - ss) for i in range(gi - gs)]
        for g in reversed(range(gi - gs)) :
            for s in reversed(range(si - ss)) :
                if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() :
                    lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs
    def align_words(gold_words, system_words) :
        alignment = Alignment(gold_words, system_words)
        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words) :
            if gold_words[gi].is_multiword or system_words[si].is_multiword :
                # A: Multi-word tokens => align via LCS within the whole "multiword span".
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)
                if si > ss and gi > gs :
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)
                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss :
                        if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() :
                            alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0) :
                            g += 1
                        else :
                            s += 1
            else :
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end) :
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start :
                    gi += 1
                else :
                    si += 1
        return alignment
    # Check that the underlying character sequences do match.
    if gold_ud.characters != system_ud.characters :
        index = 0
        while index < len(gold_ud.characters) and index < len(system_ud.characters) and \
              gold_ud.characters[index] == system_ud.characters[index] :
            index += 1
        raise UDError(
            "The concatenation of tokens in gold file and in system file differ!\n" +
            "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
                "".join(map(_encode, gold_ud.characters[index:index + 20])),
                "".join(map(_encode, system_ud.characters[index:index + 20]))
            )
        )
    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)
    # Compute the F1-scores
    result = {}
    if "FORM" in col2index :
        result["Tokens"] = spans_score(gold_ud.tokens, system_ud.tokens)
        result["Words"] = alignment_score(alignment)
    if "UPOS" in col2index :
        result["UPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["UPOS"]])
    if "XPOS" in col2index :
        result["XPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["XPOS"]])
    if "FEATS" in col2index :
        result["UFeats"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["FEATS"]])
    if "LEMMA" in col2index :
        result["Lemmas"] = alignment_score(alignment, lambda w, ga : w.columns[col2index["LEMMA"]] if ga(w).columns[col2index["LEMMA"]] != "_" else "_")
    if "HEAD" in col2index :
        result["UAS"] = alignment_score(alignment, lambda w, ga : ga(w.parent))
        if "DEPREL" in col2index :
            result["LAS"] = alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[col2index["DEPREL"]]))
            if "DEPREL" in col2index and "UPOS" in col2index and "FEATS" in col2index :
                result["MLAS"] = alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[col2index["DEPREL"]], w.columns[col2index["UPOS"]], w.columns[col2index["FEATS"]], [(ga(c), c.columns[col2index["DEPREL"]], c.columns[col2index["UPOS"]], c.columns[col2index["FEATS"]]) for c in w.functional_children]), filter_fn=lambda w: w.is_content_deprel)
    if "ID" in col2index :
        result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences)
    # Score any extra (non-standard) columns requested by the caller.
    for colName in col2index :
        if colName in extraColumns and colName != "_" :
            result[colName] = alignment_score(alignment, lambda w, _ : w.columns[col2index[colName]])
    return result
################################################################################
################################################################################
def load_conllu_file(path) :
    """Open *path*, parse it as CoNLL-U and return the internal representation.

    Raises UDError on malformed input (propagated from load_conllu).
    """
    # Bug fix: the file handle was previously never closed; load_conllu reads
    # the whole file within the call, so a context manager is safe here.
    with open(path, mode="r", **({"encoding" : "utf-8"} if sys.version_info >= (3, 0) else {})) as _file :
        return load_conllu(_file)
################################################################################
################################################################################
def evaluate_wrapper(args) :
    """Load the gold file and every given system file, then evaluate each
    system against the gold.

    Returns (gold_ud, [(system_ud, evaluation_dict), ...]).
    """
    # Load CoNLL-U files. All files must be loaded *before* any evaluation:
    # load_conllu_file rewrites the module-level column mappings.
    gold_ud = load_conllu_file(args.gold_file)
    system_uds = [load_conllu_file(args.system_file)]
    if args.system_file2 is not None :
        system_uds.append(load_conllu_file(args.system_file2))
    extra = set(args.extra.split(','))
    return gold_ud, [(system_ud, evaluate(gold_ud, system_ud, extra)) for system_ud in system_uds]
################################################################################
################################################################################
class Error :
    """One evaluation error: a gold word whose aligned system word disagrees
    on the column associated with a given metric."""
    def __init__(self, gold_file, system_file, gold_word, system_word, metric) :
        # The disagreeing gold and predicted UDWord instances.
        self.gold = gold_word
        self.pred = system_word
        # Full sentences (slices of UDWord lists) containing each word,
        # kept for the side-by-side rendering in __str__.
        self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
        self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
        # Error type label, e.g. "NOUN->VERB" for a UPOS confusion.
        self.type = self.gold.columns[col2index[metric2colname[metric]]]+"->"+self.pred.columns[col2index[metric2colname[metric]]]
    def __str__(self) :
        # Render the gold and predicted sentences side by side, one word per
        # line, marking the offending word on each side with '>'.
        result = []
        gold_lines = []
        pred_lines = []
        for word in self.gold_sentence :
            gold_lines.append((">" if word == self.gold else " ") + " ".join(filter_columns(word.columns)))
        for word in self.pred_sentence :
            pred_lines.append((">" if word == self.pred else " ") + " ".join(filter_columns(word.columns)))
        # When one sentence is longer, pad the other side with spaces of the
        # opposite line's width so the '|' separators stay aligned.
        for index in range(max(len(gold_lines), len(pred_lines))) :
            result.append("{} | {}".format(gold_lines[index] if index < len(gold_lines) else " "*len(pred_lines[index]), pred_lines[index] if index < len(pred_lines) else " "*len(gold_lines[index])))
        return "\n".join(result)
class Errors :
    """Collection of Error objects for one metric, grouped into ErrorType
    buckets by their type label."""
    def __init__(self, metric, errors1=None, errors2=None) :
        self.types = []
        self.nb_errors = 0
        self.metric = metric
        # When two collections are supplied, build the difference: keep only
        # the errors of errors1 that do not also appear in errors2.
        if errors1 is not None and errors2 is not None :
            for error_type in errors1.types :
                for error in error_type.errors :
                    if not errors2.has(error) :
                        self.add(error)
    def __len__(self) :
        return self.nb_errors
    def add(self, error) :
        """Insert *error* into the bucket of its type, creating it on demand."""
        self.nb_errors += 1
        for bucket in self.types :
            if bucket.type == error.type :
                bucket.add(error)
                return
        bucket = ErrorType(error.type)
        bucket.add(error)
        self.types.append(bucket)
    def has(self, error) :
        """Whether an equivalent error was recorded (None when no bucket of
        that type exists, which callers treat as False)."""
        for bucket in self.types :
            if bucket.type == error.type :
                return bucket.has(error)
    def sort(self) :
        """Order buckets by decreasing number of errors."""
        self.types.sort(key=len, reverse=True)
class ErrorType :
    """All recorded errors sharing one type label (e.g. "NOUN->VERB")."""
    def __init__(self, error_type) :
        self.type = error_type
        self.errors = []
    def __len__(self) :
        return len(self.errors)
    def add(self, error) :
        self.errors.append(error)
    def has(self, error) :
        """Return True when an error with an equal gold word is already recorded."""
        return any(recorded.gold == error.gold for recorded in self.errors)
################################################################################
################################################################################
def compute_errors(gold_file, system_file, evaluation, metric) :
    """Build an Errors collection from the mismatching aligned word pairs
    recorded for *metric* in *evaluation* (the second element returned by
    alignment_score)."""
    collected = Errors(metric)
    for pair in evaluation[metric][1] :
        collected.add(Error(gold_file, system_file, pair.gold_word, pair.system_word, metric))
    return collected
################################################################################
################################################################################
def main() :
    """Command-line entry point.

    Evaluates one or two system CoNLL-U files against a gold file, prints a
    metric table per system file, optionally enumerates the most frequent
    errors for requested metrics, and, when two system files are given,
    prints the errors present in one system but absent from the other.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("gold_file", type=str,
                        help="Name of the CoNLL-U file with the gold data.")
    parser.add_argument("system_file", type=str,
                        help="Name of the CoNLL-U file with the predicted data.")
    parser.add_argument("--counts", "-c", default=False, action="store_true",
                        help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
    parser.add_argument("--system_file2",
                        help="Name of another CoNLL-U file with predicted data, for error comparison.")
    parser.add_argument("--enumerate_errors", "-e", default=None,
                        help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").")
    parser.add_argument("--extra", "-x", default="",
                        help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").")
    args = parser.parse_args()
    errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')
    global col2index
    global index2col
    # Evaluate
    gold_ud, evaluations = evaluate_wrapper(args)
    errors_by_file = []
    examples_list = []
    for id1 in range(len(evaluations)) :
        (system_ud, evaluation) = evaluations[id1]
        # Banner with the system file name, centered on an 80-column line.
        fnamelen = len(system_ud.filename)
        print("*"*math.ceil((80-2-fnamelen)/2),system_ud.filename,"*"*math.floor((80-2-fnamelen)/2))
        # Compute errors
        errors_list = [compute_errors(gold_ud, system_ud, evaluation, metric) for metric in errors_metrics]
        errors_by_file.append(errors_list)
        maxColNameSize = 1 + max([len(colName) for colName in evaluation])
        # Print the evaluation
        if args.counts :
            print("{:^{}}| Correct | Gold | Predicted | Aligned".format("Metric", maxColNameSize))
        else :
            print("{:^{}}| Precision | Recall | F1 Score | AligndAcc".format("Metric", maxColNameSize))
        print("{}+-----------+-----------+-----------+-----------".format("-"*maxColNameSize))
        for metric in evaluation :
            if args.counts :
                print("{:{}}|{:10} |{:10} |{:10} |{:10}".format(
                    metric,
                    maxColNameSize,
                    evaluation[metric][0].correct,
                    evaluation[metric][0].gold_total,
                    evaluation[metric][0].system_total,
                    evaluation[metric][0].aligned_total or (evaluation[metric][0].correct if metric == "Words" else "")
                ))
            else :
                # Use 4 decimals for values in [-1, 1] (numeric/regression
                # metrics), 2 decimals for ordinary percentage scores.
                precision = ("{:10.2f}" if abs(evaluation[metric][0].precision) > 1.0 else "{:10.4f}").format(evaluation[metric][0].precision)
                recall = ("{:10.2f}" if abs(evaluation[metric][0].recall) > 1.0 else "{:10.4f}").format(evaluation[metric][0].recall)
                f1 = ("{:10.2f}" if abs(evaluation[metric][0].f1) > 1.0 else "{:10.4f}").format(evaluation[metric][0].f1)
                print("{:{}}|{} |{} |{} |{}".format(
                    metric,
                    maxColNameSize,
                    precision,
                    recall,
                    f1,
                    "{:10.2f}".format(evaluation[metric][0].aligned_accuracy) if evaluation[metric][0].aligned_accuracy is not None else ""
                ))
        # Enumerate the ten most frequent error types for each requested metric.
        for id2 in range(len(errors_list)) :
            errors = errors_list[id2]
            errors.sort()
            print("Most frequent errors for metric '{}' :".format(errors.metric))
            print("{:>12} {:>5} {:>6} {}\n {:->37}".format("ID", "NB", "%AGE", "GOLD->SYSTEM", ""))
            print("{:>12} {:5} {:6.2f}%".format("Total", len(errors), 100))
            for id3 in range(len(errors.types[:10])) :
                error_type = errors.types[:10][id3]
                t = error_type.type
                nb = len(error_type)
                percent = 100.0*nb/len(errors)
                id = ":".join(map(str,[id1,id2,id3,"*"]))
                print("{:>12} {:5} {:6.2f}% {}".format(id, nb, percent, t))
                for id4 in range(len(error_type)) :
                    examples_list.append((":".join(map(str,[id1,id2,id3,id4])), error_type.errors[id4]))
            print("")
    # Pairwise comparison: errors present in one system file but not the other.
    for id1 in range(len(evaluations)) :
        (system1_ud, evaluation) = evaluations[id1]
        for id2 in range(len(evaluations)) :
            if id1 == id2 :
                continue
            (system2_ud, evaluation) = evaluations[id2]
            errors1 = errors_by_file[id1]
            errors2 = errors_by_file[id2]
            if len(errors1) > 0 :
                print("{} Error comparison {}".format("*"*31, "*"*31))
                print("{:>30} : {}".format("These errors are present in", system1_ud.filename))
                print("{:>30} : {}".format("and not in", system2_ud.filename))
                for id3 in range(len(errors1)) :
                    metric = errors1[id3].metric
                    errors_diff = Errors(metric, errors1[id3], errors2[id3])
                    errors_diff.sort()
                    print("{:>12} {:5} {:6.2f}%".format("Total", len(errors_diff), 100))
                    for id4 in range(len(errors_diff.types[:10])) :
                        error_type = errors_diff.types[:10][id4]
                        t = error_type.type
                        nb = len(error_type)
                        # Bug fix: the percentage base was `len(errors)`, a stale
                        # variable leaked from the per-file loop above; it must be
                        # the size of the diff collection being reported here.
                        percent = 100.0*nb/len(errors_diff)
                        id = ":".join(map(str,["d"+str(id1),id3,id4,"*"]))
                        print("{:>12} {:5} {:6.2f}% {}".format(id, nb, percent, t))
                        for id5 in range(len(error_type)) :
                            examples_list.append((":".join(map(str,["d"+str(id1),id3,id4,id5])), error_type.errors[id5]))
                    print("")
    # Dump every collected example, addressable by the IDs printed above.
    if len(examples_list) > 0 :
        print("{}List of all errors by their ID{}".format("*"*25,"*"*25))
        print("{}{:^30}{}\n".format("*"*25,"Format is GOLD | PREDICTED","*"*25))
        for (id,error) in examples_list :
            print("ID="+id)
            print(error)
            print("")
################################################################################
################################################################################
# Run the command-line interface only when executed as a script
# (not when imported as a module).
if __name__ == "__main__" :
    main()
################################################################################