From 4369a7551e83bb311f141df51a4e4c76e5589914 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 3 Jun 2020 17:57:36 +0200
Subject: [PATCH] scripts now use conllu mcd by default, and can read mcd from
 file metadata

---
 UD_any/data/Makefile             |  18 +--
 UD_any/data/getTransitionSets.py | 215 +++++++++++++++----------------
 UD_any/evaluate.sh               |  13 +-
 UD_any/train.sh                  |  15 +--
 scripts/conll18_ud_eval.py       | 153 ++++++++++++----------
 scripts/conllu2fplm.py           |  13 +-
 scripts/conllu2splits.py         |  16 ++-
 scripts/readMCD.py               |  15 +--
 8 files changed, 225 insertions(+), 233 deletions(-)

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index ba99664..86d3e5c 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -2,7 +2,6 @@ include ../config
 
 SCRIPTS=../../../../scripts
 CONLL2TXT=$(SCRIPTS)/conllu_to_text.pl
-MCD=conllu.mcd
 
 TRAIN_FILES=$(shell find $(CORPUS) -type f -name '*train*.conllu')
 DEV_FILES=$(shell find $(CORPUS) -type f -name '*dev*.conllu')
@@ -12,16 +11,16 @@ TEST_FILES=$(shell find $(CORPUS) -type f -name '*test*.conllu')
 THRESHOLD=10
 FPLM_FILENAME=fplm
 
-all: tokenizer.ts segmenter.ts texts all_no_test.conllu columns pretrain
+all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
 	rm -f col_*\.txt
 	rm -f all_no_test.conllu
 
 all_no_test.conllu:
 	cat $(TRAIN_FILES) $(DEV_FILES) > $@
 
-tokenizer.ts: all_no_test.conllu $(MCD)
+tokenizer.ts: all_no_test.conllu
 	echo "ENDWORD" > $@
-	$(SCRIPTS)/conllu2splits.py $< $(MCD) > splitwords.ts 2> ambiguities.txt
+	$(SCRIPTS)/conllu2splits.py $< > splitwords.ts 2> ambiguities.txt
 	echo "SPLIT 0" >> $@
 	echo "SPLIT 1" >> $@
 	echo "SPLIT 2" >> $@
@@ -40,11 +39,8 @@ segmenter.ts:
 	echo "NOTHING" >> $@
 	sed -i -e 's/^/<segmenter> /' $@
  
-columns: all_no_test.conllu $(MCD)
-	for number in 1 2 3 4 5 6 7 8 9 10 ; do \
-		cat all_no_test.conllu | sed '/^#/ d' | cut -f$$number | sort --unique > col_$$number.txt ; \
-	done
-	./getTransitionSets.py $(MCD) col_*\.txt
+transitions: all_no_test.conllu
+	./getTransitionSets.py $<
 
 texts:
 	./getRawText.py $(CONLL2TXT) $(TRAIN_FILES) $(DEV_FILES) $(TEST_FILES)
@@ -52,8 +48,8 @@ texts:
 pretrain: texts
 	./pretrainEmbeddings.py $(shell find $(CORPUS) -type f -name '*train*.txt') 64
 
-$(FPLM_FILENAME): all_no_test.conllu $(MCD)
-	$(SCRIPTS)/conllu2fplm.py $< $(MCD) > $@
+$(FPLM_FILENAME): all_no_test.conllu
+	$(SCRIPTS)/conllu2fplm.py $< > $@
 
 clean:
 	- rm -f *\.txt
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index 8041e43..39bf020 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -7,125 +7,118 @@ sys.path.insert(1, '../../../../scripts')
 from readMCD import readMCD
 
 def printUsageAndExit() :
-  print("USAGE : %s mcd column_1.txt columns_2.txt..."%sys.argv[0], file=sys.stderr)
+  print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
   exit(1)
 
 if __name__ == "__main__" :
 
   sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
 
-  if len(sys.argv) < 3 :
+  if len(sys.argv) != 2 :
     printUsageAndExit()
 
-  conllMCD, conllMCDr = readMCD(sys.argv[1])
+  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
 
-  for colFile in sys.argv[2:] :
-    numCol = int(colFile.split('.')[0].split('_')[-1]) -1
-    if not numCol in conllMCD :
+  fileContent = []
+
+  for line in open(sys.argv[1], "r", encoding="utf8") :
+    if len(line.strip()) < 3 :
+      continue
+    if line.strip()[0] == '#' :
+      splited = line.split("global.columns =")
+      if len(splited) > 1 :
+        col2index, index2col = readMCD(splited[-1].strip())
       continue
-    nameCol = conllMCD[numCol]
-
-    if nameCol == "UPOS" :
-      output = open("tagger.ts", 'w', encoding='utf-8')
-      for line in open(colFile, "r", encoding='utf-8') :
-        striped = line.strip()
-        if len(striped) == 0 :
-          continue
-        print("<tagger> WRITE b.0 UPOS " + striped, file=output)
-      output.close()
-
-    elif nameCol == "XPOS" :
-      output = open("taggerx.ts", 'w', encoding='utf-8')
-      for line in open(colFile, "r", encoding='utf-8') :
-        striped = line.strip()
-        if len(striped) == 0 :
-          continue
-        print("WRITE b.0 XPOS " + striped, file=output)
-      output.close()
-
-    elif nameCol == "FEATS" :
-      output = open("morpho_whole.ts", 'w', encoding='utf-8')
-      for line in open(colFile, "r", encoding='utf-8') :
-        striped = line.strip()
-        if len(striped) == 0 :
-          continue
-        print("<morpho> WRITE b.0 FEATS " + striped, file=output)
-      output.close()
-      output = open("morpho_parts.ts", 'w', encoding='utf-8')
-      allParts = set()
-      allPartsList = []
-      for line in open(colFile, "r", encoding='utf-8') :
-        striped = line.strip()
-        if len(striped) == 0 :
-          continue
-        parts = striped.split('|')
-        for part in parts :
-          allParts.add(part)
-      for part in allParts :
-        allPartsList.append(part)
-      allPartsList.sort()
-      for part in allPartsList :
-        print("<morpho> ADD b.0 FEATS " + part, file=output)
-      print("<morpho> NOTHING", file=output)
-      output.close()
-
-    elif nameCol == "DEPREL" :
-      output = open("parser_eager_rel_strict.ts", 'w', encoding='utf-8')
-      print("<parser> REDUCE_strict", file=output)
-      labels = set()
-      labelsList = []
-      for line in open(colFile, "r", encoding='utf-8') :
-        striped = line.strip()
-        if len(striped) == 0 or striped == "root" or striped == "_" :
-          continue
-        label = striped
-        if label not in labels :
-          labels.add(striped)
-          labelsList.append(striped)
-      labelsList.sort()
-      for label in labelsList :
-        print("<parser> eager_LEFT_rel " + label, file=output)
-        print("<parser> eager_RIGHT_rel " + label, file=output)
-      print("<parser> eager_SHIFT", file=output)
-      output.close()
-
-      output = open("parser_eager_rel_relaxed.ts", 'w', encoding='utf-8')
-      print("<parser> REDUCE_relaxed", file=output)
-      for label in labelsList :
-        print("<parser> eager_LEFT_rel " + label, file=output)
-        print("<parser> eager_RIGHT_rel " + label, file=output)
-      print("<parser> eager_SHIFT", file=output)
-      output.close()
-
-      output = open("parser_eager_strict.ts", 'w', encoding='utf-8')
-      print("<parser> REDUCE_strict", file=output)
-      print("<parser> eager_LEFT", file=output)
-      print("<parser> eager_RIGHT", file=output)
-      print("<parser> eager_SHIFT", file=output)
-      output.close()
-
-      output = open("parser_eager_relaxed.ts", 'w', encoding='utf-8')
-      print("<parser> REDUCE_relaxed", file=output)
-      print("<parser> eager_LEFT", file=output)
-      print("<parser> eager_RIGHT", file=output)
-      print("<parser> eager_SHIFT", file=output)
-      output.close()
-
-      output = open("parser_standard_rel.ts", 'w', encoding='utf-8')
-      for label in labelsList :
-        print("<parser> standard_LEFT_rel " + label, file=output)
-        print("<parser> standard_RIGHT_rel " + label, file=output)
-      print("<parser> standard_SHIFT", file=output)
-      output.close()
-
-      output = open("parser_standard.ts", 'w', encoding='utf-8')
-      print("<parser> standard_LEFT_rel", file=output)
-      print("<parser> standard_RIGHT_rel", file=output)
-      print("<parser> standard_SHIFT", file=output)
-      output.close()
-
-      output = open("deprel.ts", 'w', encoding='utf-8')
-      for label in labelsList :
-        print("deprel " + label, file=output)
-      output.close()
+
+    columns = line.strip().split('\t')
+    fileContent.append(columns)
+
+  if "UPOS" in col2index :
+    # One WRITE transition per distinct UPOS value; sorting keeps the generated
+    # file identical from one run to the next.
+    upos = col2index["UPOS"]
+    values = sorted({columns[upos] for columns in fileContent})
+    # The context manager closes tagger.ts even if a write raises.
+    with open("tagger.ts", 'w', encoding='utf-8') as output :
+      for value in values :
+        print("<tagger> WRITE b.0 UPOS " + value, file=output)
+
+  if "XPOS" in col2index :
+    # One WRITE transition per distinct XPOS value; sorting keeps the generated
+    # file identical from one run to the next.
+    xpos = col2index["XPOS"]
+    values = sorted({columns[xpos] for columns in fileContent})
+    # The context manager closes taggerx.ts even if a write raises.
+    with open("taggerx.ts", 'w', encoding='utf-8') as output :
+      for value in values :
+        print("<taggerx> WRITE b.0 XPOS " + value, file=output)
+
+  if "FEATS" in col2index :
+    # Distinct complete FEATS values, sorted so the generated files are identical
+    # from one run to the next.
+    feats = col2index["FEATS"]
+    values = sorted({columns[feats] for columns in fileContent})
+    # Distinct '|'-separated components of those values.
+    parts = sorted({part for value in values for part in value.split("|")})
+    # Both files are written inside 'with' blocks so they are closed even if a
+    # write raises.
+    # morpho_whole.ts : one WRITE transition per complete FEATS value.
+    with open("morpho_whole.ts", 'w', encoding='utf-8') as output :
+      for value in values :
+        print("<morpho> WRITE b.0 FEATS " + value, file=output)
+    # morpho_parts.ts : one ADD transition per component, followed by a single
+    # NOTHING transition.
+    with open("morpho_parts.ts", 'w', encoding='utf-8') as output :
+      for part in parts :
+        print("<morpho> ADD b.0 FEATS " + part, file=output)
+      print("<morpho> NOTHING", file=output)
+
+  if "DEPREL" in col2index :
+    # Distinct dependency labels, sorted for reproducible output. "_" and "root"
+    # are left out of the label list, exactly as before.
+    deprel = col2index["DEPREL"]
+    labelsList = sorted({columns[deprel] for columns in fileContent} - {"_", "root"})
+    # Every file is written inside a 'with' block so that it is closed even if a
+    # write raises.
+    # eager_* transitions, one LEFT/RIGHT pair per label, REDUCE_strict.
+    with open("parser_eager_rel_strict.ts", 'w', encoding='utf-8') as output :
+      print("<parser> REDUCE_strict", file=output)
+      for label in labelsList :
+        print("<parser> eager_LEFT_rel " + label, file=output)
+        print("<parser> eager_RIGHT_rel " + label, file=output)
+      print("<parser> eager_SHIFT", file=output)
+    # eager_* transitions, one LEFT/RIGHT pair per label, REDUCE_relaxed.
+    with open("parser_eager_rel_relaxed.ts", 'w', encoding='utf-8') as output :
+      print("<parser> REDUCE_relaxed", file=output)
+      for label in labelsList :
+        print("<parser> eager_LEFT_rel " + label, file=output)
+        print("<parser> eager_RIGHT_rel " + label, file=output)
+      print("<parser> eager_SHIFT", file=output)
+    # eager_* transitions without labels, REDUCE_strict.
+    with open("parser_eager_strict.ts", 'w', encoding='utf-8') as output :
+      print("<parser> REDUCE_strict", file=output)
+      print("<parser> eager_LEFT", file=output)
+      print("<parser> eager_RIGHT", file=output)
+      print("<parser> eager_SHIFT", file=output)
+    # eager_* transitions without labels, REDUCE_relaxed.
+    with open("parser_eager_relaxed.ts", 'w', encoding='utf-8') as output :
+      print("<parser> REDUCE_relaxed", file=output)
+      print("<parser> eager_LEFT", file=output)
+      print("<parser> eager_RIGHT", file=output)
+      print("<parser> eager_SHIFT", file=output)
+    # standard_* transitions, one LEFT/RIGHT pair per label.
+    with open("parser_standard_rel.ts", 'w', encoding='utf-8') as output :
+      for label in labelsList :
+        print("<parser> standard_LEFT_rel " + label, file=output)
+        print("<parser> standard_RIGHT_rel " + label, file=output)
+      print("<parser> standard_SHIFT", file=output)
+    # standard_* transitions without labels.
+    with open("parser_standard.ts", 'w', encoding='utf-8') as output :
+      print("<parser> standard_LEFT_rel", file=output)
+      print("<parser> standard_RIGHT_rel", file=output)
+      print("<parser> standard_SHIFT", file=output)
+    # Plain list of the labels, one "deprel <label>" line each.
+    with open("deprel.ts", 'w', encoding='utf-8') as output :
+      for label in labelsList :
+        print("deprel " + label, file=output)
 
diff --git a/UD_any/evaluate.sh b/UD_any/evaluate.sh
index 59bbd18..4fcaa38 100755
--- a/UD_any/evaluate.sh
+++ b/UD_any/evaluate.sh
@@ -40,17 +40,15 @@ DEV=$(find $CORPUS -type f -name '*dev*.conllu')
 DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
 TEST=$(find $CORPUS -type f -name '*test*.conllu')
 TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
-MCD=$(find $CORPUS -type f -name '*.mcd')
 
 REF=$TEST
 REFRAW=$TESTRAW
 
-if has_space "$REF" || has_space "$REFRAW" || has_space "$MCD";
+if has_space "$REF" || has_space "$REFRAW";
 then
   >&2 echo "ERROR : more than 1 match"
   >&2 echo "REF : " $REF
   >&2 echo "REFRAW : " $REFRAW
-  >&2 echo "MCD : " $MCD
   print_usage_and_exit
 fi
 
@@ -67,21 +65,16 @@ then
   print_usage_and_exit
 fi
 
-if test -z $MCD;
-then
-	MCD=$EXPPATH"/data/*\.mcd"
-fi
-
 EVALCONLL="../scripts/conll18_ud_eval.py"
 OUTPUT=$EXPPATH"/predicted_eval.tsv"
 
 if [ "$MODE" = "tsv" ]; then
-macaon decode --model $EXPPATH --mcd $MCD --inputTSV $REF $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1
+macaon decode --model $EXPPATH --inputTSV $REF "$@" > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon decode --model $EXPPATH --mcd $MCD --inputTXT $REFRAW $@ > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1
+macaon decode --model $EXPPATH --inputTXT $REFRAW "$@" > $OUTPUT && $EVALCONLL $REF $OUTPUT || exit 1
 exit 0
 fi
 
diff --git a/UD_any/train.sh b/UD_any/train.sh
index 8747a28..3bc401b 100755
--- a/UD_any/train.sh
+++ b/UD_any/train.sh
@@ -43,16 +43,14 @@ DEV=$(find $CORPUS -type f -name '*dev*.conllu')
 DEVRAW=$(find $CORPUS -type f -name '*dev*.txt')
 TEST=$(find $CORPUS -type f -name '*test*.conllu')
 TESTRAW=$(find $CORPUS -type f -name '*test*.txt')
-MCD=$(find $CORPUS -type f -name '*.mcd')
 W2V=$(find $CORPUS -type f -name '*.w2v')
 
-if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST" || has_space "$MCD";
+if has_space "$TRAIN" || has_space "$DEV" || has_space "$TEST";
 then
   >&2 echo "ERROR : more than 1 match with keyword" $KEYWORD
   >&2 echo "TRAIN : " $TRAIN
   >&2 echo "DEV : " $DEV
   >&2 echo "TEST : " $TEST
-  >&2 echo "MCD : " $MCD
   print_usage_and_exit
 fi
 
@@ -72,13 +70,6 @@ then
 fi
 fi
 
-if test -z $MCD;
-then
-	MCD=$EXPPATH"/data/*\.mcd"
-fi
-
->&2 echo "Using MCD :" $MCD
-
 if test -f $W2V;
 then
 	>&2 echo "Using W2V :" $W2V
@@ -86,12 +77,12 @@ then
 fi
 
 if [ "$MODE" = "tsv" ]; then
-macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1
+macaon train --model $EXPPATH --trainTSV $TRAIN --devTSV $DEV $W2V "$@" || exit 1
 exit 0
 fi
 
 if [ "$MODE" = "txt" ]; then
-macaon train --model $EXPPATH --mcd $MCD --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1
+macaon train --model $EXPPATH --trainTSV $TRAIN --trainTXT $TRAINRAW --devTSV $DEV --devTXT $DEVRAW $W2V "$@" || exit 1
 exit 0
 fi
 
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index 2554a2b..0866241 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -92,6 +92,8 @@
 from __future__ import division
 from __future__ import print_function
 
+from readMCD import readMCD
+
 import argparse
 import io
 import os
@@ -101,7 +103,8 @@ import unittest
 import math
 
 # CoNLL-U column names
-ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
+col2index = {}
+index2col = {}
 
 # Content and functional relations
 CONTENT_DEPRELS = {
@@ -125,10 +128,10 @@ UNIVERSAL_FEATURES = {
 ################################################################################
 def filter_columns(columns) :
   res = []
-  indexes = [0, 1, 3, 6, 7]
-  lengths = [4, 8, 8, 4, 8]
+  cols = [("ID",4), ("FORM",8), ("UPOS",8), ("HEAD",4), ("DEPREL", 8)]  # (column name, display width) pairs
+  contents = [(columns[col2index[col]], max_size) for (col, max_size) in cols if col in col2index]  # names missing from col2index are skipped
 
-  for (content, max_len) in [(columns[indexes[index]], lengths[index]) for index in range(len(indexes))] :
+  for (content, max_len) in contents :  # pad each value to its width; longer values keep both ends around '…'
     res.append(("{:"+str(max_len)+"}").format(content if len(content) <= max_len else "{}…{}".format(content[0:math.ceil((max_len-1)/2)],content[-((max_len-1)//2):])))
 
   return res
@@ -158,6 +161,8 @@ def _encode(text) :
 ################################################################################
 # Load given CoNLL-U file into internal representation
 def load_conllu(file) :
+  global col2index
+  global index2col
   # Internal representation classes
   class UDRepresentation :
     def __init__(self) :
@@ -195,15 +200,18 @@ def load_conllu(file) :
       self.parent = None
       # List of references to UDWord instances representing functional-deprel children.
       self.functional_children = []
+
       # Only consider universal FEATS.
       # TODO consider all feats
-      self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|")
-                          if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
-      # Let's ignore language-specific deprel subtypes.
-      self.columns[DEPREL] = columns[DEPREL].split(":")[0]
-      # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS
-      self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS
-      self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS
+      if "FEATS" in col2index :
+        self.columns[col2index["FEATS"]] = "|".join(sorted(feat for feat in columns[col2index["FEATS"]].split("|")
+                           if feat.split("=", 1)[0] in UNIVERSAL_FEATURES))
+      if "DEPREL" in col2index :
+        # Let's ignore language-specific deprel subtypes.
+        self.columns[col2index["DEPREL"]] = columns[col2index["DEPREL"]].split(":")[0]
+        # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS
+        self.is_content_deprel = self.columns[col2index["DEPREL"]] in CONTENT_DEPRELS
+        self.is_functional_deprel = self.columns[col2index["DEPREL"]] in FUNCTIONAL_DEPRELS
 
   ud = UDRepresentation()
   ud.filename = file.name
@@ -220,36 +228,41 @@ def load_conllu(file) :
     if sentence_start is None :
       # Skip comments
       if line.startswith("#") :
+        splited = line.split("global.columns =")
+        if len(splited) > 1 :
+          col2index, index2col = readMCD(splited[-1].strip())
         continue
       # Start a new sentence
       sentence_start = len(ud.words)
       ud.sentences.append(UDSpan(index, 0))
       ud.sentences_words.append(UDSpan(sentence_start, 0))
+
     if not line :
       # Add parent and children UDWord links and check there are no cycles
       def process_word(word) :
-        if word.parent == "remapping" :
-          raise UDError("There is a cycle in a sentence")
-        if word.parent is None :
-          head = int(word.columns[HEAD])
-          if head < 0 or head > len(ud.words) - sentence_start :
-            raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[HEAD])))
-          if head :
-            parent = ud.words[sentence_start + head - 1]
-            word.parent = "remapping"
-            process_word(parent)
-            word.parent = parent
+        if "HEAD" in col2index :
+          if word.parent == "remapping" :
+            raise UDError("There is a cycle in a sentence")
+          if word.parent is None :
+            head = int(word.columns[col2index["HEAD"]])
+            if head < 0 or head > len(ud.words) - sentence_start :
+              raise UDError("HEAD '{}' points outside of the sentence".format(_encode(word.columns[col2index["HEAD"]])))
+            if head :
+              parent = ud.words[sentence_start + head - 1]
+              word.parent = "remapping"
+              process_word(parent)
+              word.parent = parent
 
       for word in ud.words[sentence_start:] :
         process_word(word)
       # func_children cannot be assigned within process_word
       # because it is called recursively and may result in adding one child twice.
       for word in ud.words[sentence_start:] :
-        if word.parent and word.is_functional_deprel :
+        if "HEAD" in col2index and word.parent and word.is_functional_deprel :
           word.parent.functional_children.append(word)
 
       # Check there is a single root node
-      if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1 :
+      if "HEAD" in col2index and len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1 :
         raise UDError("There are multiple roots in a sentence")
 
       # End the sentence
@@ -260,53 +273,52 @@ def load_conllu(file) :
 
     # Read next token/word
     columns = line.split("\t")
-    if len(columns) < 10 :
-      raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(line)))
 
     # Skip empty nodes
-    if "." in columns[ID] :
+    if "ID" in col2index and "." in columns[col2index["ID"]] :
       continue
 
     # Delete spaces from FORM, so gold.characters == system.characters
     # even if one of them tokenizes the space. Use any Unicode character
     # with category Zs.
-    columns[FORM] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[FORM]))
-    if not columns[FORM] :
-      raise UDError("There is an empty FORM in the CoNLL-U file")
+    if "FORM" in col2index :
+      columns[col2index["FORM"]] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[col2index["FORM"]]))
+      if not columns[col2index["FORM"]] :
+        raise UDError("There is an empty FORM in the CoNLL-U file")
 
     # Save token
-    ud.characters.extend(columns[FORM])
-    ud.tokens.append(UDSpan(index, index + len(columns[FORM])))
-    index += len(columns[FORM])
+    form_value = columns[col2index["FORM"]] if "FORM" in col2index else "_"
+    ud.characters.extend(form_value)
+    ud.tokens.append(UDSpan(index, index + len(form_value)))
+    index += len(form_value)
 
     # Handle multi-word tokens to save word(s)
-    if "-" in columns[ID] :
+    if "ID" in col2index and "-" in columns[col2index["ID"]] :
       try :
-        start, end = map(int, columns[ID].split("-"))
+        start, end = map(int, columns[col2index["ID"]].split("-"))
       except :
-        raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[ID])))
+        raise UDError("Cannot parse multi-word token ID '{}'".format(_encode(columns[col2index["ID"]])))
 
       for _ in range(start, end + 1) :
         word_line = _decode(file.readline().rstrip("\r\n"))
         word_columns = word_line.split("\t")
-        if len(word_columns) < 10 :
-          raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line)))
+
         ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
         ud.words[-1].sentence = len(ud.sentences)-1
     # Basic tokens/words
     else :
       try :
-        word_id = int(columns[ID])
+        word_id = int(columns[col2index["ID"]]) if "ID" in col2index else len(ud.words) - sentence_start + 1  # no ID column: use the expected ID so the check below passes instead of hitting a KeyError
       except :
-        raise UDError("Cannot parse word ID '{}'".format(_encode(columns[ID])))
+        raise UDError("Cannot parse word ID '{}'".format(_encode(columns[col2index["ID"]])))
       if word_id != len(ud.words) - sentence_start + 1 :
         raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(
-          _encode(columns[ID]), _encode(columns[FORM]), len(ud.words) - sentence_start + 1))
+          _encode(columns[col2index["ID"]]), _encode(columns[col2index["FORM"]]), len(ud.words) - sentence_start + 1))
 
       try :
-        head_id = int(columns[HEAD])
+        head_id = int(columns[col2index["HEAD"]]) if "HEAD" in col2index else 0
       except :
-        raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[HEAD])))
+        raise UDError("Cannot parse HEAD '{}'".format(_encode(columns[col2index["HEAD"]])))
       if head_id < 0 :
         raise UDError("HEAD cannot be negative")
 
@@ -433,7 +445,7 @@ def evaluate(gold_ud, system_ud) :
     lcs = [[0] * (si - ss) for i in range(gi - gs)]
     for g in reversed(range(gi - gs)) :
       for s in reversed(range(si - ss)) :
-        if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower() :
+        if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() :
           lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0)
         lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0)
         lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0)
@@ -454,7 +466,7 @@ def evaluate(gold_ud, system_ud) :
           # Store aligned words
           s, g = 0, 0
           while g < gi - gs and s < si - ss :
-            if gold_words[gs + g].columns[FORM].lower() == system_words[ss + s].columns[FORM].lower() :
+            if gold_words[gs + g].columns[col2index["FORM"]].lower() == system_words[ss + s].columns[col2index["FORM"]].lower() :
               alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s])
               g += 1
               s += 1
@@ -494,27 +506,26 @@ def evaluate(gold_ud, system_ud) :
   alignment = align_words(gold_ud.words, system_ud.words)
 
   # Compute the F1-scores
-  return {
-    "Tokens" : spans_score(gold_ud.tokens, system_ud.tokens),
-    "Sentences" : spans_score(gold_ud.sentences, system_ud.sentences),
-    "Words" : alignment_score(alignment),
-    "UPOS" : alignment_score(alignment, lambda w, _ : w.columns[UPOS]),
-    "XPOS" : alignment_score(alignment, lambda w, _ : w.columns[XPOS]),
-    "UFeats" : alignment_score(alignment, lambda w, _ : w.columns[FEATS]),
-    "AllTags" : alignment_score(alignment, lambda w, _ : (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])),
-    "Lemmas" : alignment_score(alignment, lambda w, ga : w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
-    "UAS" : alignment_score(alignment, lambda w, ga : ga(w.parent)),
-    "LAS" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL])),
-    "CLAS" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL]),
-                filter_fn=lambda w : w.is_content_deprel),
-    "MLAS" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS],
-                             [(ga(c), c.columns[DEPREL], c.columns[UPOS], c.columns[FEATS])
-                              for c in w.functional_children]),
-                filter_fn=lambda w : w.is_content_deprel),
-    "BLEX" : alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[DEPREL],
-                              w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"),
-                filter_fn=lambda w : w.is_content_deprel),
-  }
+  result = {}
+  if "FORM" in col2index :
+    result["Tokens"] = spans_score(gold_ud.tokens, system_ud.tokens)
+    result["Words"] = alignment_score(alignment)
+  if "UPOS" in col2index :
+    result["UPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["UPOS"]])
+  if "XPOS" in col2index :
+    result["XPOS"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["XPOS"]])
+  if "FEATS" in col2index :
+    result["UFeats"] = alignment_score(alignment, lambda w, _ : w.columns[col2index["FEATS"]])
+  if "LEMMA" in col2index :
+    result["Lemmas"] = alignment_score(alignment, lambda w, ga : w.columns[col2index["LEMMA"]] if ga(w).columns[col2index["LEMMA"]] != "_" else "_")
+  if "HEAD" in col2index :
+    result["UAS"] = alignment_score(alignment, lambda w, ga : ga(w.parent))
+  if "DEPREL" in col2index :
+    result["LAS"] = alignment_score(alignment, lambda w, ga : (ga(w.parent), w.columns[col2index["DEPREL"]]))
+  if "ID" in col2index :
+    result["Sentences"] = spans_score(gold_ud.sentences, system_ud.sentences)
+
+  return result
 ################################################################################
 
 
@@ -546,7 +557,7 @@ class Error :
     self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
     self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
     # TODO : do it for other than UPOS
-    self.type = self.gold.columns[UPOS]+"->"+self.pred.columns[UPOS]
+    self.type = self.gold.columns[col2index["UPOS"]]+"->"+self.pred.columns[col2index["UPOS"]]
   def __str__(self) :
     result = []
     gold_lines = []
@@ -557,7 +568,7 @@ class Error :
       pred_lines.append((">" if word == self.pred else " ") + " ".join(filter_columns(word.columns)))
          
     for index in range(max(len(gold_lines), len(pred_lines))) :
-      result.append("{} | {}".format(gold_lines[index] if index < len(gold_lines) else "", pred_lines[index] if index < len(pred_lines) else ""))
+      result.append("{} | {}".format(gold_lines[index] if index < len(gold_lines) else " "*len(pred_lines[index]), pred_lines[index] if index < len(pred_lines) else " "*len(gold_lines[index])))
     return "\n".join(result)
 
 class Errors :
@@ -635,6 +646,10 @@ def main() :
 
   errors_metrics = [] if args.enumerate_errors is None else args.enumerate_errors.split(',')
 
+  global col2index
+  global index2col
+  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
+
   # Evaluate
   gold_ud, evaluations = evaluate_wrapper(args)
   errors_by_file = []
@@ -654,7 +669,7 @@ def main() :
     else :
       print("Metric     | Precision |    Recall |  F1 Score | AligndAcc")
     print("-----------+-----------+-----------+-----------+-----------")
-    for metric in["Tokens", "Sentences", "Words", "UPOS", "XPOS", "UFeats", "AllTags", "Lemmas", "UAS", "LAS", "CLAS", "MLAS", "BLEX"] :
+    for metric in evaluation :
       if args.counts :
         print("{:11}|{:10} |{:10} |{:10} |{:10}".format(
           metric,
diff --git a/scripts/conllu2fplm.py b/scripts/conllu2fplm.py
index d5d1613..c42ef39 100755
--- a/scripts/conllu2fplm.py
+++ b/scripts/conllu2fplm.py
@@ -4,7 +4,7 @@ import sys
 from readMCD import readMCD
 
 def printUsageAndExit() :
-  print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr)
+  print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
   exit(1)
 
 def sameLineWithoutLemma(l1, l2) :
@@ -16,10 +16,10 @@ if __name__ == "__main__" :
 
   sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
 
-  if len(sys.argv) != 3 :
+  if len(sys.argv) != 2 :
     printUsageAndExit()
 
-  conllMCD, conllMCDr = readMCD(sys.argv[2])
+  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
 
   entriesCount = {}
   entriesList = []
@@ -28,15 +28,18 @@ if __name__ == "__main__" :
     if len(line.strip()) < 3 :
       continue
     if line.strip()[0] == '#' :
+      splited = line.split("global.columns =")
+      if len(splited) > 1 :
+        col2index, index2col = readMCD(splited[-1].strip())
       continue
 
     columns = line.strip().split('\t')
-    if len(columns[int(conllMCDr["ID"])].split('-')) > 1 :
+    if len(columns[col2index["ID"]].split('-')) > 1 :
       continue
 
     entry = ""
     for col in ["FORM", "UPOS", "LEMMA", "FEATS"] :
-      entry = entry + columns[int(conllMCDr[col])] + '\t'
+      entry = entry + (columns[col2index[col]] if col in col2index else "_") + '\t'
     entry = entry[:-1]
 
     if entry not in entriesCount :
diff --git a/scripts/conllu2splits.py b/scripts/conllu2splits.py
index 253160c..d4d939c 100755
--- a/scripts/conllu2splits.py
+++ b/scripts/conllu2splits.py
@@ -7,7 +7,7 @@ rules = {}
 prefix = "SPLITWORD "
 
 def printUsageAndExit() :
-  print("USAGE : %s file.conllu conllu.mcd"%sys.argv[0], file=sys.stderr)
+  print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
   exit(1)
 
 def computeRules(sentence) :
@@ -34,26 +34,28 @@ def computeRules(sentence) :
         rules[word[1]][rule] = 1
 
 def main() :
-
   sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
 
-  if len(sys.argv) != 3 :
+  if len(sys.argv) != 2 :
     printUsageAndExit()
 
-  conllMCD, conllMCDr = readMCD(sys.argv[2])
-
-  idId = int(conllMCDr["ID"])
-  idForm = int(conllMCDr["FORM"])
+  col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")
 
   sentence = []
 
   for line in open(sys.argv[1], "r", encoding="utf8") :
     if len(line.strip()) < 2 or line[0] == '#' :
+      splited = line.split("global.columns =")
+      if len(splited) > 1 :
+        col2index, index2col = readMCD(splited[-1].strip())
       if len(sentence) > 0 :
         computeRules(sentence)
       sentence = []
       continue
 
+    idId = int(col2index["ID"])
+    idForm = int(col2index["FORM"])
+
     splited = line.strip().split('\t')
     sentence += [[splited[idId], splited[idForm]]]
 
diff --git a/scripts/readMCD.py b/scripts/readMCD.py
index fa5c5a0..2ced082 100644
--- a/scripts/readMCD.py
+++ b/scripts/readMCD.py
@@ -1,10 +1,9 @@
-def readMCD(mcdFilename) :
-  mcd = {}
-  for line in open(mcdFilename, "r", encoding="utf8") :
-    clean = line.strip()
-    if len(line) == 2 or line[0] == '#' :
-      continue
-    mcd[len(mcd)] = clean
+def readMCD(mcd) :
+  """Return (col2index, index2col) for a string of whitespace-separated column names.
+  Indexes are 0-based positions; tabs and repeated blanks are accepted as separators."""
 
-  return mcd, {v: k for k, v in mcd.items()}
+  columns = mcd.split()
+  col2index = {col : index for index, col in enumerate(columns)}
+  index2col = dict(enumerate(columns))
 
+  return col2index, index2col
-- 
GitLab