From 6f89810ba94e3dda3b94a2a7d6f2d4ef4ca6bcab Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Thu, 7 May 2020 10:04:26 +0200 Subject: [PATCH] updated scripts --- scripts/addMissingDepTree.py | 32 ++++++++++++++++++++++++++++++++ scripts/conll18_ud_eval.py | 4 ++-- scripts/conlluCheckProblems.py | 15 ++++++++------- scripts/deleteColumns.py | 28 ++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+), 9 deletions(-) create mode 100755 scripts/addMissingDepTree.py create mode 100755 scripts/deleteColumns.py diff --git a/scripts/addMissingDepTree.py b/scripts/addMissingDepTree.py new file mode 100755 index 0000000..9dad6aa --- /dev/null +++ b/scripts/addMissingDepTree.py @@ -0,0 +1,32 @@ +#! /usr/bin/python3 + +import sys + +def printUsageAndExit() : + print("USAGE : %s file.tsv"%sys.argv[0], file=sys.stderr) + exit(1) + +if __name__ == "__main__" : + if len(sys.argv) != 2 : + printUsageAndExit() + + curId = 1 + for line in open(sys.argv[1], 'r') : + if len(line) < 3 or line[0] == '#' : + curId = 1 + if len(line) >= 3 : + print(line, end="") + else : + print("") + continue + + splited = line.strip().split('\t') + if curId == 1 : + splited[6] = "0" + splited[7] = "root" + else : + splited[6] = "1" + + print('\t'.join(splited)) + curId += 1 + diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py index afbf9a5..8419298 100755 --- a/scripts/conll18_ud_eval.py +++ b/scripts/conll18_ud_eval.py @@ -224,7 +224,7 @@ def load_conllu(file): # Read next token/word columns = line.split("\t") - if len(columns) != 10: + if len(columns) < 10: raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(line))) # Skip empty nodes @@ -253,7 +253,7 @@ def load_conllu(file): for _ in range(start, end + 1): word_line = _decode(file.readline().rstrip("\r\n")) word_columns = word_line.split("\t") - if len(word_columns) != 10: + if len(word_columns) < 10: raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(_encode(word_line))) ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) # Basic tokens/words diff --git a/scripts/conlluCheckProblems.py b/scripts/conlluCheckProblems.py index 03b0a75..1c7399d 100755 --- a/scripts/conlluCheckProblems.py +++ b/scripts/conlluCheckProblems.py @@ -133,14 +133,15 @@ if __name__ == "__main__" : for line in open(sys.argv[1], "r", encoding="utf8") : fileLineIndex += 1 clean = line.strip() - if len(clean) == 0 : + + if len(clean) < 3 : + if sentFirstLine == -1 : + exit(1) checkSentence(sentFirstLine, sentence, conllMCD, conllMCDr) sentence = [] sentFirstLine = -1 - continue - if clean[0] == '#' : - continue - if sentFirstLine == -1 : - sentFirstLine = fileLineIndex - sentence.append(clean.split('\t')) + elif clean[0] != '#' : + if sentFirstLine == -1 : + sentFirstLine = fileLineIndex + sentence.append(clean.split('\t')) diff --git a/scripts/deleteColumns.py b/scripts/deleteColumns.py new file mode 100755 index 0000000..9706063 --- /dev/null +++ b/scripts/deleteColumns.py @@ -0,0 +1,28 @@ +#! /usr/bin/python3 + +import sys + +def printUsageAndExit() : + print("USAGE : %s file.tsv column1 column2..."%sys.argv[0], file=sys.stderr) + exit(1) + +if __name__ == "__main__" : + if len(sys.argv) < 2 : + printUsageAndExit() + + for line in open(sys.argv[1], 'r') : + if len(line) < 3 or line[0] == '#' : + if len(line) >= 3 : + print(line, end="") + else : + print("") + continue + + splited = line.strip().split('\t') + for i in sys.argv[2:] : + col = int(i) + if col in range(len(splited)) : + splited[col] = "_" + + print('\t'.join(splited)) + -- GitLab