From 6f89810ba94e3dda3b94a2a7d6f2d4ef4ca6bcab Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 7 May 2020 10:04:26 +0200
Subject: [PATCH] Add addMissingDepTree.py and deleteColumns.py; relax CoNLL-U line checks

---
 scripts/addMissingDepTree.py   | 32 ++++++++++++++++++++++++++++++++
 scripts/conll18_ud_eval.py     |  4 ++--
 scripts/conlluCheckProblems.py | 15 ++++++++-------
 scripts/deleteColumns.py       | 28 ++++++++++++++++++++++++++++
 4 files changed, 70 insertions(+), 9 deletions(-)
 create mode 100755 scripts/addMissingDepTree.py
 create mode 100755 scripts/deleteColumns.py

diff --git a/scripts/addMissingDepTree.py b/scripts/addMissingDepTree.py
new file mode 100755
index 0000000..9dad6aa
--- /dev/null
+++ b/scripts/addMissingDepTree.py
@@ -0,0 +1,32 @@
+#! /usr/bin/python3
+
+import sys
+
+def printUsageAndExit() :  # Print the usage line on stderr and exit with status 1.
+  print("USAGE : %s file.tsv"%sys.argv[0], file=sys.stderr)
+  sys.exit(1)  # sys.exit, not the site-provided exit(), which 'python -S' lacks
+
+if __name__ == "__main__" :
+  if len(sys.argv) != 2 :
+    printUsageAndExit()
+
+  # Give every sentence a flat tree: the first token is attached to the root
+  # and every following token is attached to that first token.
+  curId = 1
+  # 'with' closes the input file, even if a malformed line raises below.
+  with open(sys.argv[1], 'r') as tsvFile :
+    for line in tsvFile :
+      # Lines shorter than 3 characters and '#' comments restart the token
+      # counter; short lines are emitted as an empty line, comments verbatim.
+      if len(line) < 3 or line[0] == '#' :
+        curId = 1
+        print(line if len(line) >= 3 else "\n", end="")
+        continue
+      splited = line.strip().split('\t')
+      # Columns 6 and 7 (0-based) are overwritten: HEAD and DEPREL in CoNLL-U.
+      splited[6] = "0" if curId == 1 else "1"
+      if curId == 1 :
+        splited[7] = "root"
+      print('\t'.join(splited))
+      curId += 1
+
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index afbf9a5..8419298 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -224,7 +224,7 @@ def load_conllu(file):
 
         # Read next token/word
         columns = line.split("\t")
-        if len(columns) != 10:
+        if len(columns) < 10:
             raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(line)))
 
         # Skip empty nodes
@@ -253,7 +253,7 @@ def load_conllu(file):
             for _ in range(start, end + 1):
                 word_line = _decode(file.readline().rstrip("\r\n"))
                 word_columns = word_line.split("\t")
-                if len(word_columns) != 10:
+                if len(word_columns) < 10:
                     raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line)))
                 ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
         # Basic tokens/words
diff --git a/scripts/conlluCheckProblems.py b/scripts/conlluCheckProblems.py
index 03b0a75..1c7399d 100755
--- a/scripts/conlluCheckProblems.py
+++ b/scripts/conlluCheckProblems.py
@@ -133,14 +133,15 @@ if __name__ == "__main__" :
   for line in open(sys.argv[1], "r", encoding="utf8") :
     fileLineIndex += 1
     clean = line.strip()
-    if len(clean) == 0 :
+
+    if len(clean) < 3 :
+      if sentFirstLine == -1 :
+        exit(1)
       checkSentence(sentFirstLine, sentence, conllMCD, conllMCDr)
       sentence = []
       sentFirstLine = -1
-      continue
-    if clean[0] == '#' :
-      continue
-    if sentFirstLine == -1 :
-      sentFirstLine = fileLineIndex
-    sentence.append(clean.split('\t'))
+    elif clean[0] != '#' :
+      if sentFirstLine == -1 :
+        sentFirstLine = fileLineIndex
+      sentence.append(clean.split('\t'))
 
diff --git a/scripts/deleteColumns.py b/scripts/deleteColumns.py
new file mode 100755
index 0000000..9706063
--- /dev/null
+++ b/scripts/deleteColumns.py
@@ -0,0 +1,28 @@
+#! /usr/bin/python3
+
+import sys
+
+def printUsageAndExit() :  # Print the usage line on stderr and exit with status 1.
+  print("USAGE : %s file.tsv column1 column2..."%sys.argv[0], file=sys.stderr)
+  sys.exit(1)  # sys.exit, not the site-provided exit(), which 'python -S' lacks
+
+if __name__ == "__main__" :
+  if len(sys.argv) < 2 :
+    printUsageAndExit()
+
+  # Parse the column indexes once, up front, instead of once per input line;
+  # a non-integer argument is a usage error rather than a mid-run traceback.
+  try :
+    columns = {int(arg) for arg in sys.argv[2:]}
+  except ValueError :
+    printUsageAndExit()
+
+  with open(sys.argv[1], 'r') as tsvFile :  # 'with' closes the input file
+    for line in tsvFile :
+      # Lines under 3 characters become an empty line; '#' comments pass through.
+      if len(line) < 3 or line[0] == '#' :
+        print(line if len(line) >= 3 else "\n", end="")
+        continue
+      splited = line.strip().split('\t')  # indexes outside the row are ignored
+      print('\t'.join("_" if i in columns else v for i, v in enumerate(splited)))
+
-- 
GitLab