From 3de41e7ac47f2f9916c8e147a10fd4aa71efc25d Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Fri, 31 Jul 2020 18:27:34 +0200
Subject: [PATCH] conllu2horizontal: replace every number with 42 and every
 digit with 0

---
 scripts/conllu2horizontal.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/scripts/conllu2horizontal.py b/scripts/conllu2horizontal.py
index b0997c0..a6ef9c4 100755
--- a/scripts/conllu2horizontal.py
+++ b/scripts/conllu2horizontal.py
@@ -3,6 +3,15 @@
 import sys
 from readMCD import readMCD
 
+def isNumber(s) :
+  """Tell whether s has at least one digit and no alphabetic character."""
+  # Any letter disqualifies the token, wherever it appears.
+  if any(char.isalpha() for char in s) :
+    return False
+  # Other symbols ('.', ',', '-') are ignored, so "3.14" and "1-2" qualify.
+  foundDigit = any(char.isdigit() for char in s)
+  return foundDigit
+
 def printUsageAndExit() :
   print("USAGE : %s file.conllu (columnName | LETTERS)"%sys.argv[0], file=sys.stderr)
   sys.exit(1)
@@ -36,7 +45,10 @@ if __name__ == "__main__" :
         print("ERROR : column %s not found in line '%s'"%(index, line.strip()))
         exit(1)
 
-      print(splited[index].replace(" ", "◌"), end=" ")
+      value = splited[index].replace(" ", "◌")
+      if isNumber(splited[index].replace(" ", "").strip()) :
+        value = 42
+      print(value, end=" ")
   else :
     for line in open(sys.argv[1], "r") :
       if line.startswith("#") :
@@ -45,6 +57,9 @@ if __name__ == "__main__" :
           col2index, index2col = readMCD(splited[-1].strip())
         splited = line.split("text =")
         if len(splited) > 1 :
-          text = splited[-1].replace("\n", " ").replace(" ", "◌")
-          print(" ".join(list(text)))
-
+          text = list(splited[-1].replace("\n", " ").replace(" ", "◌"))
+          for elem in text :
+            if isNumber(str(elem)) :
+              elem = '0'
+            print(elem, end=" ")
+          print("")
-- 
GitLab