#! /usr/bin/python3
"""scripts/addMissingDepTree.py: write a placeholder dependency tree into a TSV.

For every sentence (a run of token rows delimited by blank or comment lines)
the first token row gets column 6 set to "0" and column 7 set to "root";
every following token row gets column 6 set to "1".  Columns are 0-based.
NOTE(review): columns 6 and 7 are presumably HEAD and DEPREL of a CoNLL-U
style file -- confirm against the data this script is run on.
"""

import sys

# Index of the last column this script writes to; shorter rows are padded.
_LAST_WRITTEN_COLUMN = 7


def printUsageAndExit():
    """Print the command-line usage on stderr and terminate with status 1."""
    print("USAGE : %s file.tsv" % sys.argv[0], file=sys.stderr)
    # sys.exit rather than the bare exit(): the latter is injected by the
    # `site` module and is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)


def addMissingDepTree(lines):
    """Yield the rewritten text for each input line.

    Args:
        lines: iterable of raw lines, each normally ending with a newline
            (an open text file works).

    Yields:
        The text to emit for each input line, newline included:
        * comment lines (starting with '#') are passed through untouched;
        * lines shorter than 3 characters are treated as blank and become
          a bare newline;
        * token rows are stripped, split on tabs, padded with "_" up to
          8 columns if needed, and get the placeholder tree described in
          the module docstring.

    Comment and blank lines both restart the per-sentence token counter.
    """
    tokenIndex = 1
    for line in lines:
        # Comments are tested first so that a very short comment (a bare
        # "#") is kept instead of being turned into a blank line, which
        # would act as a spurious sentence separator.
        if line.startswith('#'):
            tokenIndex = 1
            yield line
            continue
        if len(line) < 3:
            tokenIndex = 1
            yield "\n"
            continue

        columns = line.strip().split('\t')
        # Pad short rows so the assignments below cannot raise IndexError.
        missing = _LAST_WRITTEN_COLUMN + 1 - len(columns)
        if missing > 0:
            columns.extend(["_"] * missing)

        if tokenIndex == 1:
            columns[6] = "0"
            columns[7] = "root"
        else:
            # Every non-first token is attached to token 1.
            columns[6] = "1"

        yield '\t'.join(columns) + "\n"
        tokenIndex += 1


if __name__ == "__main__":
    if len(sys.argv) != 2:
        printUsageAndExit()

    # The context manager guarantees the input file is closed.
    with open(sys.argv[1], 'r') as tsvFile:
        sys.stdout.writelines(addMissingDepTree(tsvFile))
#! /usr/bin/python3
"""scripts/deleteColumns.py: blank out selected columns of a TSV file.

Every token row has the requested 0-based columns replaced by "_".
Comment lines (starting with '#') are copied through unchanged, and lines
shorter than 3 characters are treated as blank lines.
"""

import sys


def printUsageAndExit():
    """Print the command-line usage on stderr and terminate with status 1."""
    print("USAGE : %s file.tsv column1 column2..." % sys.argv[0], file=sys.stderr)
    # sys.exit rather than the bare exit(): the latter is injected by the
    # `site` module and is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)


def deleteColumns(lines, columns):
    """Yield the rewritten text for each input line.

    Args:
        lines: iterable of raw lines, each normally ending with a newline
            (an open text file works).
        columns: iterable of 0-based integer column indexes to blank out.
            Indexes that are negative or beyond the end of a row are
            ignored for that row.

    Yields:
        The text to emit for each input line, newline included:
        * comment lines are passed through untouched;
        * lines shorter than 3 characters become a bare newline;
        * token rows are stripped, split on tabs, and re-joined with the
          selected columns set to "_".
    """
    # Materialise once so a one-shot iterator can be reused for every row.
    targets = tuple(columns)
    for line in lines:
        # Comments are tested first so that a very short comment (a bare
        # "#") is kept instead of being turned into a blank line.
        if line.startswith('#'):
            yield line
            continue
        if len(line) < 3:
            yield "\n"
            continue

        fields = line.strip().split('\t')
        for col in targets:
            # Explicit bounds check: negative indexes must NOT wrap around.
            if 0 <= col < len(fields):
                fields[col] = "_"

        yield '\t'.join(fields) + "\n"


if __name__ == "__main__":
    if len(sys.argv) < 2:
        printUsageAndExit()

    # Parse the column arguments once, up front, so a malformed argument is
    # reported as a usage error before any output has been produced.
    try:
        targetColumns = [int(arg) for arg in sys.argv[2:]]
    except ValueError:
        printUsageAndExit()

    # The context manager guarantees the input file is closed.
    with open(sys.argv[1], 'r') as tsvFile:
        sys.stdout.writelines(deleteColumns(tsvFile, targetColumns))