From 3de41e7ac47f2f9916c8e147a10fd4aa71efc25d Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Fri, 31 Jul 2020 18:27:34 +0200
Subject: [PATCH] conllu2horizontal replace every number by 42 and every digit by 0

---

Review notes (this section is ignored when the patch is applied):

* The line structure of this patch was restored from a copy in which the
  whole mail had been collapsed onto a single line. Leading whitespace could
  not be recovered from that copy; two-space indentation is ASSUMED below.
  Confirm it against scripts/conllu2horizontal.py before applying, or apply
  with whitespace-tolerant options.
* isNumber(s) returns True when s contains at least one digit and no
  alphabetic character, so strings mixing digits and punctuation (for
  example "3,5" or "12%") count as numbers while "3rd" does not.
* Column mode: a cell recognised by isNumber is printed as 42. The value is
  assigned as an int, not a str; print() renders it as "42".
* LETTERS mode: every digit character of the "text =" comment is printed as
  0. Characters are now printed one by one with end=" ", so each output line
  ends with a trailing space, which the previous " ".join(...) form did not
  produce.

 scripts/conllu2horizontal.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/scripts/conllu2horizontal.py b/scripts/conllu2horizontal.py
index b0997c0..a6ef9c4 100755
--- a/scripts/conllu2horizontal.py
+++ b/scripts/conllu2horizontal.py
@@ -3,6 +3,15 @@
 import sys
 from readMCD import readMCD
 
+def isNumber(s) :
+  hasDigit = False
+  for c in s :
+    if c.isdigit() :
+      hasDigit = True
+    if c.isalpha() :
+      return False
+  return hasDigit
+
 def printUsageAndExit() :
   print("USAGE : %s file.conllu (columnName | LETTERS)"%sys.argv[0], file=sys.stderr)
   sys.exit(1)
@@ -36,7 +45,10 @@ if __name__ == "__main__" :
         print("ERROR : column %s not found in line '%s'"%(index, line.strip()))
         exit(1)
 
-      print(splited[index].replace(" ", "◌"), end=" ")
+      value = splited[index].replace(" ", "◌")
+      if isNumber(splited[index].replace(" ", "").strip()) :
+        value = 42
+      print(value, end=" ")
   else :
     for line in open(sys.argv[1], "r") :
       if line.startswith("#") :
@@ -45,6 +57,9 @@ if __name__ == "__main__" :
           col2index, index2col = readMCD(splited[-1].strip())
         splited = line.split("text =")
         if len(splited) > 1 :
-          text = splited[-1].replace("\n", " ").replace(" ", "◌")
-          print(" ".join(list(text)))
-
+          text = list(splited[-1].replace("\n", " ").replace(" ", "◌"))
+          for elem in text :
+            if isNumber(str(elem)) :
+              elem = '0'
+            print(elem, end=" ")
+          print("")
-- 
GitLab