Skip to content
Snippets Groups Projects
Select Git revision
  • 573829099befb6fed149aead2ba11159139c1c6e
  • master default protected
  • erased
  • states
  • negatives
  • temp
  • negativeExamples
  • Rl
8 results

conll18_ud_eval.py

Blame
  • conll18_ud_eval.py 34.59 KiB
    #!/usr/bin/env python3
    
    # Compatible with Python 2.7 and 3.2+, can be used either as a module
    # or a standalone executable.
    #
    # Copyright 2017, 2018 Institute of Formal and Applied Linguistics (UFAL),
    # Faculty of Mathematics and Physics, Charles University, Czech Republic.
    #
    # This Source Code Form is subject to the terms of the Mozilla Public
    # License, v. 2.0. If a copy of the MPL was not distributed with this
    # file, You can obtain one at http://mozilla.org/MPL/2.0/.
    #
    # Authors: Milan Straka, Martin Popel <surname@ufal.mff.cuni.cz>
    #
    # Changelog:
    # - [12 Apr 2018] Version 0.9: Initial release.
    # - [19 Apr 2018] Version 1.0: Fix bug in MLAS (duplicate entries in functional_children).
    #                              Add --counts option.
    # - [02 May 2018] Version 1.1: When removing spaces to match gold and system characters,
    #                              consider all Unicode characters of category Zs instead of
    #                              just ASCII space.
    # - [25 Jun 2018] Version 1.2: Use python3 in the she-bang (instead of python).
    #                              In Python2, make the whole computation use `unicode` strings.
    #
    # Updated by Franck Dary for Macaon
    
    # Command line usage
    # ------------------
    # conll18_ud_eval.py gold_conllu_file system_conllu_file
    #
    #   Metrics printed (as precision, recall, F1 score,
    #   and in case the metric is computed on aligned words also accuracy on these):
    #   - Tokens: how well do the gold tokens match system tokens
    #   - Sentences: how well do the gold sentences match system sentences
    #   - Words: how well can the gold words be aligned to system words
    #   - UPOS: using aligned words, how well does UPOS match
    #   - XPOS: using aligned words, how well does XPOS match
    #   - UFeats: using aligned words, how well does universal FEATS match
    #   - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match
    #   - Lemmas: using aligned words, how well does LEMMA match
    #   - UAS: using aligned words, how well does HEAD match
    #   - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match
    #   - CLAS: using aligned words with content DEPREL, how well does
    #       HEAD+DEPREL(ignoring subtypes) match
    #   - MLAS: using aligned words with content DEPREL, how well does
    #       HEAD+DEPREL(ignoring subtypes)+UPOS+UFEATS+FunctionalChildren(DEPREL+UPOS+UFEATS) match
    #   - BLEX: using aligned words with content DEPREL, how well does
    #       HEAD+DEPREL(ignoring subtypes)+LEMMAS match
    # - if -c is given, raw counts of correct/gold_total/system_total/aligned words are printed
    #   instead of precision/recall/F1/AlignedAccuracy for all metrics.
    
    # API usage
    # ---------
    # - load_conllu(file)
    #   - loads CoNLL-U file from given file object to an internal representation
    #   - the file object should return str in both Python 2 and Python 3
    #   - raises UDError exception if the given file cannot be loaded
    # - evaluate(gold_ud, system_ud)
    #   - evaluate the given gold and system CoNLL-U files (loaded with load_conllu)
    #   - raises UDError if the concatenated tokens of gold and system file do not match
    #   - returns a dictionary with the metrics described above, each metric having
    #     three fields: precision, recall and f1
    
    # Description of token matching
    # -----------------------------
    # In order to match tokens of gold file and system file, we consider the text
    # resulting from concatenation of gold tokens and text resulting from
    # concatenation of system tokens. These texts should match -- if they do not,
    # the evaluation fails.
    #
    # If the texts do match, every token is represented as a range in this original
    # text, and tokens are equal only if their range is the same.
    
    # Description of word matching
    # ----------------------------
    # When matching words of gold file and system file, we first match the tokens.
    # The words which are also tokens are matched as tokens, but words in multi-word
    # tokens have to be handled differently.
    #
    # To handle multi-word tokens, we start by finding "multi-word spans".
    # Multi-word span is a span in the original text such that
    # - it contains at least one multi-word token
    # - all multi-word tokens in the span (considering both gold and system ones)
    #   are completely inside the span (i.e., they do not "stick out")
    # - the multi-word span is as small as possible
    #
    # For every multi-word span, we align the gold and system words completely
    # inside this span using LCS on their FORMs. The words not intersecting
    # (even partially) any multi-word span are then aligned as tokens.
    
    
    from __future__ import division
    from __future__ import print_function
    
    from readMCD import readMCD
    
    import argparse
    import io
    import os
    import sys
    import unicodedata
    import unittest
    import math
    
    # CoNLL-U column names
    # Mappings between a column name and its index within a CoNLL-U line.
    # Filled in by load_conllu via readMCD, either from the default column order
    # or from a "# global.columns =" comment found in the file.
    col2index = {}
    index2col = {}

    # Maps a metric name (as used in evaluation results) to the CoNLL-U column it scores.
    metric2colname = {
      "UPOS" : "UPOS",
      "Lemmas" : "LEMMA",
    }

    # The 10 standard CoNLL-U column names.
    defaultColumns = {
    "ID",
    "FORM",
    "UPOS",
    "XPOS",
    "LEMMA",
    "FEATS",
    "HEAD",
    "DEPREL",
    "DEPS",
    "MISC",
    }

    # Content and functional relations
    # UD dependency relations considered content-bearing (used as the filter for MLAS).
    CONTENT_DEPRELS = {
      "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
      "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos",
      "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list",
      "parataxis", "orphan", "goeswith", "reparandum", "root", "dep"
    }

    # UD dependency relations considered functional; words bearing them are
    # collected as functional_children of their head (scored inside MLAS).
    FUNCTIONAL_DEPRELS = {
      "aux", "cop", "mark", "det", "clf", "case", "cc"
    }

    # Features from the universal FEATS inventory; all other features are
    # stripped from the FEATS column when a word is loaded (see UDWord).
    UNIVERSAL_FEATURES = {
      "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender",
      "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood",
      "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite"
    }
    
    ################################################################################
    def is_float(value) :
      """Return True when value is a str that parses as a float, False otherwise."""
      if isinstance(value, str) :
        try :
          float(value)
        except ValueError :
          return False
        return True
      # Non-string values (including actual floats) are rejected on purpose.
      return False
    ################################################################################
    
    ################################################################################
    def filter_columns(columns) :
      """Render selected CoNLL-U columns of a word as fixed-width strings.

      Only the columns present in the module-level col2index mapping are kept;
      values longer than their column width are ellipsized in the middle.
      Returns the list of padded strings, in ID/FORM/UPOS/HEAD/DEPREL order.
      """
      selected = [("ID", 4), ("FORM", 8), ("UPOS", 8), ("HEAD", 4), ("DEPREL", 8)]
      rendered = []
      for name, width in selected :
        if name not in col2index :
          continue
        value = columns[col2index[name]]
        if len(value) > width :
          # Keep the head and tail of the value around a "…" so the result is exactly `width` chars.
          head_len = math.ceil((width - 1) / 2)
          tail_len = (width - 1) // 2
          value = "{}…{}".format(value[0:head_len], value[-tail_len:])
        rendered.append(("{:" + str(width) + "}").format(value))
      return rendered
    ################################################################################
    
    ################################################################################
    # UD Error is used when raising exceptions in this module
    class UDError(Exception) :
      """Raised for malformed CoNLL-U input or mismatched gold/system data."""
      pass
    ################################################################################
    
    ################################################################################
    # Conversion methods handling `str` <-> `unicode` conversions in Python2
    def _decode(text) :
      return text if sys.version_info[0] >= 3 or not isinstance(text, str) else text.decode("utf-8")
    ################################################################################
    
    
    ################################################################################
    def _encode(text) :
      return text if sys.version_info[0] >= 3 or not isinstance(text, unicode) else text.encode("utf-8")
    ################################################################################
    
    
    ################################################################################
    # Load given CoNLL-U file into internal representation
    def load_conllu(file) :
      """Parse a CoNLL-U file object into an internal UDRepresentation.

      Side effect: rebinds the module-level col2index/index2col mappings via
      readMCD, first from the default column order and again whenever a
      "# global.columns =" comment is encountered.
      Raises UDError on malformed input (bad IDs, bad HEADs, cycles,
      multiple roots, empty FORM, or a file not ending with an empty line).
      """
      global col2index
      global index2col
      # Internal representation classes
      class UDRepresentation :
        def __init__(self) :
          # Characters of all the tokens in the whole file.
          # Whitespace between tokens is not included.
          self.characters = []
          # List of UDSpan instances with start&end indices into `characters`.
          self.tokens = []
          # List of UDWord instances.
          self.words = []
          # List of UDSpan instances with start&end indices into `characters`.
          self.sentences = []
          # List of UDSpan instances with start&end indices into `words`.
          self.sentences_words = []
          # Name of the file this representation has been extracted from.
          self.filename = ""
      class UDSpan :
        def __init__(self, start, end) :
          self.start = start
          # Note that self.end marks the first position **after the end** of span,
          # so we can use characters[start:end] or range(start, end).
          self.end = end
      class UDWord :
        def __init__(self, span, columns, is_multiword) :
          # Index of the sentence this word is part of, within ud_representation.sentences.
          self.sentence = None
          # Span of this word (or MWT, see below) within ud_representation.characters.
          self.span = span
          # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
          self.columns = columns
          # is_multiword==True means that this word is part of a multi-word token.
          # In that case, self.span marks the span of the whole multi-word token.
          self.is_multiword = is_multiword
          # Reference to the UDWord instance representing the HEAD (or None if root).
          self.parent = None
          # List of references to UDWord instances representing functional-deprel children.
          self.functional_children = []

          # Only consider universal FEATS.
          # TODO consider all feats
          if "FEATS" in col2index :
            self.columns[col2index["FEATS"]] = "|".join(sorted(feat for feat in columns[col2index["FEATS"]].split("|")
                               if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
          if "DEPREL" in col2index :
            # Let's ignore language-specific deprel subtypes.
            self.columns[col2index["DEPREL"]] = columns[col2index["DEPREL"]].split(":")[0]
            # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS
            self.is_content_deprel = self.columns[col2index["DEPREL"]] in CONTENT_DEPRELS
            self.is_functional_deprel = self.columns[col2index["DEPREL"]] in FUNCTIONAL_DEPRELS

      ud = UDRepresentation()
      ud.filename = file.name

      # Default CoNLL-U column layout; may be overridden below by a "global.columns" comment.
      col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")

      # Load the CoNLL-U file
      index, sentence_start = 0, None
      id_starts_at_zero = False
      while True :
        line = file.readline()
        if not line :
          break
        line = _decode(line.rstrip("\r\n"))

        # Handle sentence start boundaries
        if sentence_start is None :
          # Skip comments
          if line.startswith("#") :
            splited = line.split("global.columns =")
            if len(splited) > 1 :
              col2index, index2col = readMCD(splited[-1].strip())
            continue
          # Start a new sentence
          sentence_start = len(ud.words)
          # The span ends (0 here) are patched once the sentence is complete.
          ud.sentences.append(UDSpan(index, 0))
          ud.sentences_words.append(UDSpan(sentence_start, 0))

        if not line :
          # Add parent and children UDWord links and check there are no cycles
          def process_word(word) :
            if "HEAD" in col2index :
              # "remapping" is a temporary marker used for cycle detection.
              if word.parent == "remapping" :
                raise UDError("There is a cycle in a sentence")
              if word.parent is None :
                head = int(word.columns[col2index["HEAD"]])
                if head < 0 or head > len(ud.words) - sentence_start :
                  raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[col2index["HEAD"]])))
                if head :
                  parent = ud.words[sentence_start + head - 1]
                  word.parent = "remapping"
                  process_word(parent)
                  word.parent = parent

          for word in ud.words[sentence_start:] :
            process_word(word)
          # func_children cannot be assigned within process_word
          # because it is called recursively and may result in adding one child twice.
          for word in ud.words[sentence_start:] :
            if "HEAD" in col2index and word.parent and word.is_functional_deprel :
              word.parent.functional_children.append(word)

          # Check there is a single root node
          if "HEAD" in col2index and len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1 :
            raise UDError("There are multiple roots in a sentence")

          # End the sentence
          ud.sentences[-1].end = index
          ud.sentences_words[-1].end = len(ud.words)
          sentence_start = None
          continue

        # Read next token/word
        columns = line.split("\t")

        # Skip empty nodes
        if "ID" in col2index and "." in columns[col2index["ID"]] :
          continue

        # Delete spaces from FORM, so gold.characters == system.characters
        # even if one of them tokenizes the space. Use any Unicode character
        # with category Zs.
        if "FORM" in col2index :
          columns[col2index["FORM"]] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[col2index["FORM"]]))
          if not columns[col2index["FORM"]] :
            raise UDError("There is an empty FORM in the CoNLL-U file")

        # Save token
        form_value = columns[col2index["FORM"]] if "FORM" in col2index else "_"
        ud.characters.extend(form_value)
        ud.tokens.append(UDSpan(index, index + len(form_value)))
        index += len(form_value)

        # Handle multi-word tokens to save word(s)
        if "ID" in col2index and "-" in columns[col2index["ID"]] :
          try :
            start, end = map(int, columns[col2index["ID"]].split("-"))
          except :
            raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[col2index["ID"]])))

          # Read the member word lines of the multi-word token; each shares the MWT's span.
          for _ in range(start, end + 1) :
            word_line = _decode(file.readline().rstrip("\r\n"))
            word_columns = word_line.split("\t")

            ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
            ud.words[-1].sentence = len(ud.sentences)-1
        # Basic tokens/words
        else :
          try :
            word_id = int(columns[col2index["ID"]]) if "ID" in col2index else "_"
            if word_id == 0 :
              id_starts_at_zero = True
            # Files numbering words from 0 are tolerated (see the check below).
          except :
            raise UDError("Cannot parse word ID '{}'".format(_encode(columns[col2index["ID"]])))
          if word_id != len(ud.words) - sentence_start + (0 if id_starts_at_zero else 1) :
            raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(
              _encode(columns[col2index["ID"]]), _encode(columns[col2index["FORM"]]), len(ud.words) - sentence_start + 1))

          try :
            head_id = int(columns[col2index["HEAD"]]) if "HEAD" in col2index else 0
          except :
            raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[col2index["HEAD"]])))
          if head_id < 0 :
            raise UDError("HEAD cannot be negative")

          ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))
          ud.words[-1].sentence = len(ud.sentences)-1

      if sentence_start is not None :
        raise UDError("The CoNLL-U file does not end with empty line")

      return ud
    ################################################################################
    
    
    ################################################################################
    # Evaluate the gold and system treebanks (loaded using load_conllu).
    def evaluate(gold_ud, system_ud, extraColumns) :
      """Compare two UDRepresentations (from load_conllu) and return metric scores.

      Returns a dict mapping metric name to a list: [Score] for span metrics
      (Tokens, Sentences) and plain word alignment (Words), or [Score, errors]
      for keyed metrics, where errors is the list of mismatched AlignmentWord
      pairs. extraColumns is a set of additional column names to score.
      Raises UDError when the concatenated token characters of the two files differ.
      """
      class Score :
        def __init__(self, gold_total, system_total, correct, aligned_total=None, isNumeric=False, R2=None) :
          self.correct = correct[0]
          self.gold_total = gold_total
          self.system_total = system_total
          self.aligned_total = aligned_total
          if isNumeric :
            # Numeric columns repurpose the fields: recall holds R²,
            # f1 holds correct[1] (negated mean squared error) and
            # aligned_accuracy holds correct[0] (negated mean absolute error);
            # the values are accumulated in alignment_score below.
            self.precision = 0
            self.recall = R2
            self.f1 = correct[1]
            self.aligned_accuracy = correct[0]

          else :
            self.precision = 100*correct[0] / system_total if system_total else 0.0
            self.recall = 100*correct[0] / gold_total if gold_total else 0.0
            self.f1 = 2 * 100*correct[0] / (system_total + gold_total) if system_total + gold_total else 0.0
            self.aligned_accuracy = 100*correct[0] / aligned_total if aligned_total else aligned_total

      class AlignmentWord :
        # A single (gold word, system word) pair produced by align_words.
        def __init__(self, gold_word, system_word) :
          self.gold_word = gold_word
          self.system_word = system_word
      class Alignment :
        # Full gold/system word alignment, plus a system->gold lookup map.
        def __init__(self, gold_words, system_words) :
          self.gold_words = gold_words
          self.system_words = system_words
          self.matched_words = []
          self.matched_words_map = {}
        def append_aligned_words(self, gold_word, system_word) :
          self.matched_words.append(AlignmentWord(gold_word, system_word))
          self.matched_words_map[system_word] = gold_word

      # Score spans (tokens or sentences): a span is correct when both its
      # start and end character offsets match exactly.
      def spans_score(gold_spans, system_spans) :
        correct, gi, si = 0, 0, 0
        while gi < len(gold_spans) and si < len(system_spans) :
          if system_spans[si].start < gold_spans[gi].start :
            si += 1
          elif gold_spans[gi].start < system_spans[si].start :
            gi += 1
          else :
            correct += gold_spans[gi].end == system_spans[si].end
            si += 1
            gi += 1

        return [Score(len(gold_spans), len(system_spans), [correct])]

      # Score aligned words. key_fn extracts the compared value from a word;
      # filter_fn restricts which gold words are counted. When every compared
      # value parses as a float, the column is scored as a regression
      # (negated MAE/MSE plus R²) instead of exact match.
      def alignment_score(alignment, key_fn=None, filter_fn=None) :
        if filter_fn is not None :
          gold = sum(1 for gold in alignment.gold_words if filter_fn(gold))
          system = sum(1 for system in alignment.system_words if filter_fn(system))
          aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word))
        else :
          gold = len(alignment.gold_words)
          system = len(alignment.system_words)
          aligned = len(alignment.matched_words)

        if key_fn is None :
          # Return score for whole aligned words
          return [Score(gold, system, [aligned])]

        def gold_aligned_gold(word) :
          return word
        def gold_aligned_system(word) :
          return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None
        # First pass: decide whether every compared value is numeric.
        isNumericOnly = True
        for words in alignment.matched_words :
          if filter_fn is None or filter_fn(words.gold_word) :
            goldItem = key_fn(words.gold_word, gold_aligned_gold)
            systemItem = key_fn(words.system_word, gold_aligned_system)
            if (not is_float(systemItem)) or  (not is_float(goldItem)) :
              isNumericOnly = False
              break

        # correct[0]/correct[1]: exact-match count in categorical mode;
        # negated absolute/squared error sums (later divided to means) in numeric mode.
        correct = [0,0]
        errors = []
        goldValues = []
        predictedValues = []
        for words in alignment.matched_words :
          if filter_fn is None or filter_fn(words.gold_word) :
            goldItem = key_fn(words.gold_word, gold_aligned_gold)
            systemItem = key_fn(words.system_word, gold_aligned_system)
            if not isNumericOnly :
              if goldItem == systemItem :
                correct[0] += 1
              else :
                errors.append(words)
            else :
              correct[0] -= abs(float(goldItem) - float(systemItem))**1
              correct[1] -= abs(float(goldItem) - float(systemItem))**2
              goldValues.append(float(goldItem))
              predictedValues.append(float(systemItem))

        # Numeric mode: compute R² as the squared Pearson correlation between
        # gold and predicted values.
        R2 = 0.0
        if isNumericOnly and len(goldValues) > 0 :
          correct[0] /= len(goldValues)
          correct[1] /= len(goldValues)
          goldMean = sum(goldValues) / len(goldValues)
          predMean = sum(predictedValues) / len(predictedValues)
          numerator = 0.0
          denom1 = 0.0
          denom2 = 0.0
          for i in range(len(predictedValues)) :
            numerator += (predictedValues[i]-predMean)*(goldValues[i]-goldMean)
            denom1 += (predictedValues[i]-predMean)**2
            denom2 += (goldValues[i]-goldMean)**2

          pearson = 0.0
          if denom1 > 0.0 and denom2 > 0.0 :
            pearson = numerator/((denom1**0.5)*(denom2**0.5))
          R2 = pearson**2
        # NOTE: unlike the key_fn is None branch, this returns a 2-element list.
        return [Score(gold, system, correct, aligned, isNumeric=isNumericOnly, R2=R2), errors]

      # True when words[i] lies entirely past multiword_span_end (or is exhausted).
      def beyond_end(words, i, multiword_span_end) :
        if i >= len(words) :
          return True
        if words[i].is_multiword :
          return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

      def extend_end(word, multiword_span_end) :
        if word.is_multiword and word.span.end > multiword_span_end :
          return word.span.end
        return multiword_span_end

      def find_multiword_span(gold_words, system_words, gi, si) :
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword :
          multiword_span_end = gold_words[gi].span.end
          if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start :
            si += 1
        else : # if system_words[si].is_multiword
          multiword_span_end = system_words[si].span.end
          if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start :
            gi += 1
        gs, ss = gi, si

        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or \
            not beyond_end(system_words, si, multiword_span_end) :
          if gi < len(gold_words) and (si >= len(system_words) or
                         gold_words[gi].span.start <= system_words[si].span.start) :
            multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
            gi += 1
          else :
            multiword_span_end = extend_end(system_words[si], multiword_span_end)
            si += 1
        return gs, ss, gi, si

      # Longest-common-subsequence table over lowercased FORMs of the two
      # word ranges; filled bottom-up from the end of each range.
      def compute_lcs(gold_words, system_words, gi, si, gs, ss) :
        lcs = [[0] * (si - ss) for i in range(gi - gs)]
        for g in reversed(range(gi - gs)) :
          for s in reversed(range(si - ss)) :
            if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() :
              lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
            lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
            lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
        return lcs

      def align_words(gold_words, system_words) :
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words) :
          if gold_words[gi].is_multiword or system_words[si].is_multiword :
            # A: Multi-word tokens => align via LCS within the whole "multiword span".
            gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

            if si > ss and gi > gs :
              lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

              # Store aligned words
              s, g = 0, 0
              while g < gi - gs and s < si - ss :
                if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() :
                  alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
                  g += 1
                  s += 1
                elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0) :
                  g += 1
                else :
                  s += 1
          else :
            # B: No multi-word token => align according to spans.
            if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end) :
              alignment.append_aligned_words(gold_words[gi], system_words[si])
              gi += 1
              si += 1
            elif gold_words[gi].span.start <= system_words[si].span.start :
              gi += 1
            else :
              si += 1

        return alignment

      # Check that the underlying character sequences do match.
      if gold_ud.characters != system_ud.characters :
        index = 0
        while index < len(gold_ud.characters) and index < len(system_ud.characters) and \
            gold_ud.characters[index] == system_ud.characters[index] :
          index += 1

        raise UDError(
          "The concatenation of tokens in gold file and in system file differ!\n" +
          "First 20 differing characters in gold file: '{}' and system file: '{}'".format(
            "".join(map(_encode, gold_ud.characters[index:index + 20])),
            "".join(map(_encode, system_ud.characters[index:index + 20]))
          )
        )

      # Align words
      alignment = align_words(gold_ud.words, system_ud.words)

      # Compute the F1-scores
      result = {}
      if "FORM" in col2index :
        result["Tokens"] = spans_score(gold_ud.tokens, system_ud.tokens)
        result["Words"] = alignment_score(alignment)
      if "UPOS" in col2index :
        result["UPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["UPOS"]])
      if "XPOS" in col2index :
        result["XPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["XPOS"]])
      if "FEATS" in col2index :
        result["UFeats"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["FEATS"]])
      if "LEMMA" in col2index :
        result["Lemmas"] = alignment_score(alignment, lambda w, ga : w.columns[col2index["LEMMA"]] if ga(w).columns[col2index["LEMMA"]] != "_" else "_")
      if "HEAD" in col2index :
        result["UAS"] = alignment_score(alignment, lambda w, ga : ga(w.parent))
      if "DEPREL" in col2index :
        result["LAS"] = alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[col2index["DEPREL"]]))
      if "DEPREL" in col2index and "UPOS" in col2index and "FEATS" in col2index :
        result["MLAS"] = alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[col2index["DEPREL"]], w.columns[col2index["UPOS"]], w.columns[col2index["FEATS"]], [(ga(c), c.columns[col2index["DEPREL"]], c.columns[col2index["UPOS"]], c.columns[col2index["FEATS"]]) for c in w.functional_children]), filter_fn=lambda w: w.is_content_deprel)
      if "ID" in col2index :
        result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences)

      # Score any extra user-requested columns by exact value match.
      for colName in col2index :
        if colName in extraColumns and colName != "_" :
          result[colName] = alignment_score(alignment, lambda w, _ : w.columns[col2index[colName]])

      return result
    ################################################################################
    
    
    ################################################################################
    def load_conllu_file(path) :
      """Open the CoNLL-U file at path (UTF-8 on Python 3) and parse it with load_conllu.

      Returns the UDRepresentation produced by load_conllu.
      Raises UDError (from load_conllu) on malformed content, IOError/OSError if
      the file cannot be opened.
      """
      kwargs = {"encoding" : "utf-8"} if sys.version_info >= (3, 0) else {}
      # Fix: the original leaked the file handle. load_conllu consumes the file
      # to completion and only keeps its name, so it is safe to close it here.
      with open(path, mode="r", **kwargs) as _file :
        return load_conllu(_file)
    ################################################################################
    
    
    ################################################################################
    def evaluate_wrapper(args) :
      """Load the gold file and one or two system files, then evaluate each system.

      Returns (gold_ud, evaluations) where evaluations is a list of
      (system_ud, evaluate_result) pairs, in the order the systems were given.
      """
      gold_ud = load_conllu_file(args.gold_file)
      extra = set(args.extra.split(','))

      systems = [load_conllu_file(args.system_file)]
      if args.system_file2 is not None :
        systems.append(load_conllu_file(args.system_file2))

      evaluations = []
      for system_ud in systems :
        evaluations.append((system_ud, evaluate(gold_ud, system_ud, extra)))
      return gold_ud, evaluations
    ################################################################################
    
    
    ################################################################################
    class Error :
      """One gold/system disagreement on a single aligned word, for a given metric.

      Stores the two words, the full gold and predicted sentences they belong to
      (for side-by-side display) and a "goldValue->predictedValue" type string
      built from the metric's column (via metric2colname).
      """
      def __init__(self, gold_file, system_file, gold_word, system_word, metric) :
        self.gold = gold_word
        self.pred = system_word
        # Slice out the whole sentence each word belongs to, using the
        # sentences_words spans recorded by load_conllu.
        self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
        self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
        self.type = self.gold.columns[col2index[metric2colname[metric]]]+"->"+self.pred.columns[col2index[metric2colname[metric]]]
      def __str__(self) :
        # Render the gold and predicted sentences side by side, one word per
        # line, marking the erroneous word in each column with ">".
        result = []
        gold_lines = []
        pred_lines = []
        for word in self.gold_sentence :
          gold_lines.append((">" if word == self.gold else " ") + " ".join(filter_columns(word.columns)))
        for word in self.pred_sentence :
          pred_lines.append((">" if word == self.pred else " ") + " ".join(filter_columns(word.columns)))

        for index in range(max(len(gold_lines), len(pred_lines))) :
          result.append("{} | {}".format(gold_lines[index] if index < len(gold_lines) else " "*len(pred_lines[index]), pred_lines[index] if index < len(pred_lines) else " "*len(gold_lines[index])))
        return "\n".join(result)
    
    class Errors :
      """Collection of Error instances for one metric, bucketed by error type.

      When both errors1 and errors2 are given, the new collection holds only
      the errors of errors1 that are absent from errors2 (set difference).
      """
      def __init__(self, metric, errors1=None, errors2=None) :
        self.types = []
        self.nb_errors = 0
        self.metric = metric
        if errors1 is not None and errors2 is not None :
          for bucket in errors1.types :
            for error in bucket.errors :
              if not errors2.has(error) :
                self.add(error)
      def __len__(self) :
        return self.nb_errors
      def add(self, error) :
        """Append error to the bucket matching its type, creating the bucket if needed."""
        self.nb_errors += 1
        for bucket in self.types :
          if bucket.type == error.type :
            bucket.add(error)
            return
        new_bucket = ErrorType(error.type)
        new_bucket.add(error)
        self.types.append(new_bucket)
      def has(self, error) :
        # NOTE: returns None (falsy) rather than False when no bucket matches error.type.
        for bucket in self.types :
          if bucket.type == error.type :
            return bucket.has(error)
      def sort(self) :
        """Order buckets by decreasing number of errors."""
        self.types.sort(key=len, reverse=True)
    
    class ErrorType :
      """All errors sharing one type label (e.g. "NOUN->VERB")."""
      def __init__(self, error_type) :
        self.type = error_type
        self.errors = []
      def __len__(self) :
        return len(self.errors)
      def add(self, error) :
        """Record one more error of this type."""
        self.errors.append(error)
      def has(self, error) :
        """Return True when an error with the same gold word is already recorded."""
        return any(other.gold == error.gold for other in self.errors)
    ################################################################################
    
    
    ################################################################################
    def compute_errors(gold_file, system_file, evaluation, metric) :
      """Collect every aligned-word error of the given metric into an Errors container."""
      errors = Errors(metric)
      for aligned in evaluation[metric][1] :
        errors.add(Error(gold_file, system_file, aligned.gold_word, aligned.system_word, metric))
      return errors
    ################################################################################
    
    
    ################################################################################
    def main() :
      """Command line entry point.

      Parses arguments, evaluates every system file against the gold file,
      prints the metric tables, enumerates the most frequent errors for each
      requested metric and, when several system files are given, prints the
      errors present in one system's output but not in the other's. All
      recorded errors are finally dumped, addressable by the IDs shown in
      the frequency tables."""
      # Parse arguments
      parser = argparse.ArgumentParser()
      parser.add_argument("gold_file", type=str,
        help="Name of the CoNLL-U file with the gold data.")
      parser.add_argument("system_file", type=str,
        help="Name of the CoNLL-U file with the predicted data.")
      parser.add_argument("--counts", "-c", default=False, action="store_true",
        help="Print raw counts of correct/gold/system/aligned words instead of prec/rec/F1 for all metrics.")
      parser.add_argument("--system_file2",
        help="Name of another CoNLL-U file with predicted data, for error comparison.")
      parser.add_argument("--enumerate_errors", "-e", default=None,
        help="Comma separated list of column names for which to enumerate errors (e.g. \"UPOS,FEATS\").")
      parser.add_argument("--extra", "-x", default="",
        help="Comma separated list of column names for which to compute score (e.g. \"TIME,EOS\").")
      args = parser.parse_args()

      errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')

      # NOTE(review): these globals are declared but never assigned here;
      # kept for safety in case of later assignment elsewhere.
      global col2index
      global index2col

      # Evaluate
      gold_ud, evaluations = evaluate_wrapper(args)
      errors_by_file = []  # errors_by_file[i][j] : Errors of metric j for system file i
      examples_list = []   # (error id, Error) pairs, printed at the very end

      for id1 in range(len(evaluations)) :
        (system_ud, evaluation) = evaluations[id1]
        # Banner centered on 80 columns around the system file name
        fnamelen = len(system_ud.filename)
        print("*"*math.ceil((80-2-fnamelen)/2),system_ud.filename,"*"*math.floor((80-2-fnamelen)/2))
        # Compute errors
        errors_list = [compute_errors(gold_ud, system_ud, evaluation, metric) for metric in errors_metrics]
        errors_by_file.append(errors_list)

        maxColNameSize = 1 + max([len(colName) for colName in evaluation])

        # Print the evaluation, either as raw counts or as precision/recall/F1
        if args.counts :
          print("{:^{}}| Correct   |      Gold | Predicted | Aligned".format("Metric", maxColNameSize))
        else :
          print("{:^{}}| Precision |    Recall |  F1 Score | AligndAcc".format("Metric", maxColNameSize))
        print("{}+-----------+-----------+-----------+-----------".format("-"*maxColNameSize))
        for metric in evaluation :
          if args.counts :
            print("{:{}}|{:10} |{:10} |{:10} |{:10}".format(
              metric,
              maxColNameSize,
              evaluation[metric][0].correct,
              evaluation[metric][0].gold_total,
              evaluation[metric][0].system_total,
              evaluation[metric][0].aligned_total or (evaluation[metric][0].correct if metric == "Words" else "")
            ))
          else :
            # Values in [-1,1] get 4 decimals, larger magnitudes get 2
            precision = ("{:10.2f}" if abs(evaluation[metric][0].precision) > 1.0 else "{:10.4f}").format(evaluation[metric][0].precision)
            recall = ("{:10.2f}" if abs(evaluation[metric][0].recall) > 1.0 else "{:10.4f}").format(evaluation[metric][0].recall)
            f1 = ("{:10.2f}" if abs(evaluation[metric][0].f1) > 1.0 else "{:10.4f}").format(evaluation[metric][0].f1)
            print("{:{}}|{} |{} |{} |{}".format(
              metric,
              maxColNameSize,
              precision,
              recall,
              f1,
              "{:10.2f}".format(evaluation[metric][0].aligned_accuracy) if evaluation[metric][0].aligned_accuracy is not None else ""
            ))

        # Enumerate the ten most frequent error types for every requested metric
        for id2 in range(len(errors_list)) :
          errors = errors_list[id2]
          errors.sort()
          print("Most frequent errors for metric '{}' :".format(errors.metric))
          print("{:>12} {:>5} {:>6} {}\n {:->37}".format("ID", "NB", "%AGE", "GOLD->SYSTEM", ""))

          print("{:>12} {:5} {:6.2f}%".format("Total", len(errors), 100))
          for id3 in range(len(errors.types[:10])) :
            error_type = errors.types[:10][id3]
            t = error_type.type
            nb = len(error_type)
            percent = 100.0*nb/len(errors)
            id = ":".join(map(str,[id1,id2,id3,"*"]))
            print("{:>12} {:5} {:6.2f}% {}".format(id, nb, percent, t))
            for id4 in range(len(error_type)) :
              examples_list.append((":".join(map(str,[id1,id2,id3,id4])), error_type.errors[id4]))
          print("")

      # Pairwise comparison: errors present in file id1 but not in file id2
      for id1 in range(len(evaluations)) :
        (system1_ud, evaluation) = evaluations[id1]
        for id2 in range(len(evaluations)) :
          if id1 == id2 :
            continue
          # Only the file handle is needed; do not clobber 'evaluation' above.
          (system2_ud, _) = evaluations[id2]
          errors1 = errors_by_file[id1]
          errors2 = errors_by_file[id2]

          if len(errors1) > 0 :
            print("{} Error comparison {}".format("*"*31, "*"*31))
            print("{:>30} : {}".format("These errors are present in", system1_ud.filename))
            print("{:>30} : {}".format("and not in", system2_ud.filename))
          for id3 in range(len(errors1)) :
            metric = errors1[id3].metric
            errors_diff = Errors(metric, errors1[id3], errors2[id3])
            errors_diff.sort()
            print("{:>12} {:5} {:6.2f}%".format("Total", len(errors_diff), 100))
            for id4 in range(len(errors_diff.types[:10])) :
              error_type = errors_diff.types[:10][id4]
              t = error_type.type
              nb = len(error_type)
              # Bug fix: percentage must be relative to the diff being printed,
              # not to the stale 'errors' variable left over from the loop above.
              percent = 100.0*nb/len(errors_diff)
              id = ":".join(map(str,["d"+str(id1),id3,id4,"*"]))
              print("{:>12} {:5} {:6.2f}% {}".format(id, nb, percent, t))
              for id5 in range(len(error_type)) :
                examples_list.append((":".join(map(str,["d"+str(id1),id3,id4,id5])), error_type.errors[id5]))
            print("")

      # Finally print every recorded error, addressable by the ID shown above
      if len(examples_list) > 0 :
        print("{}List of all errors by their ID{}".format("*"*25,"*"*25))
        print("{}{:^30}{}\n".format("*"*25,"Format is GOLD | PREDICTED","*"*25))

      for (id,error) in examples_list :
        print("ID="+id)
        print(error)
        print("")
    ################################################################################
    
    
    ################################################################################
    # Run the evaluation only when executed as a script, not when imported as a module.
    if __name__ == "__main__" :
      main()
    ################################################################################