#! /usr/bin/python3
"""Check the sentences of a CoNLL-U style file for structural problems.

Usage: conlluCheckProblems.py file.conllu mcd

The mcd file describes the columns of the input: each non-comment line holds a
column index followed by the column name.  The columns named ID, GOV and LABEL
are the ones inspected.  For every sentence the script verifies that the IDs
are well formed and sequential, that exactly one token is labelled "root" and
that every governor is a non-negative integer not greater than the last ID.
The first problem found is printed together with the sentence and the script
exits with status 1.
"""

import re
import sys

# Accepted shapes of the ID column: plain word ("3"), multiword token range
# ("3-4") and empty node ("3.1").  Only ASCII digits are accepted so that
# int() can never fail on a string that matched.
WORD_ID_RE = re.compile(r"[0-9]+")
RANGE_ID_RE = re.compile(r"([0-9]+)-([0-9]+)")
EMPTY_ID_RE = re.compile(r"([0-9]+)\.([0-9]+)")


def printUsageAndExit():
    """Print the command line usage on stderr and exit with status 1."""
    print("USAGE : %s file.conllu mcd" % sys.argv[0], file=sys.stderr)
    sys.exit(1)


def readMCD(mcdFilename):
    """Read a column description file.

    Blank lines and lines whose first non-blank character is '#' are ignored.
    Every other line must contain exactly two whitespace-separated fields.

    Returns a dict mapping the first field of each line to the second one.
    Exits with status 1 (after a message on stderr) on an invalid line.
    """
    mcd = {}
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            clean = line.strip()
            if not clean or clean[0] == '#':
                continue
            splited = clean.split()
            if len(splited) != 2:
                print("ERROR : invalid mcd line \'%s\'. Aborting" % clean,
                      file=sys.stderr)
                sys.exit(1)
            mcd[splited[0]] = splited[1]

    return mcd


def errorAndExit(message, sentence):
    """Print message, then the sentence (tab-separated columns), and exit 1."""
    print(message)
    for line in sentence:
        # Every column is followed by a tab, including the last one.
        print("".join("%s\t" % col for col in line))
    sys.exit(1)


def checkSentence(sentence, conllMCD, conllMCDr):
    """Validate one sentence; report the first problem and exit on failure.

    sentence  -- list of token lines, each a list of column strings.
    conllMCD  -- column description as returned by readMCD (unused here, kept
                 for interface compatibility).
    conllMCDr -- reverse mapping, column name -> column index (as a string or
                 int); must contain "ID", "GOV" and "LABEL".

    Returns None when the sentence is valid.  Calls errorAndExit (which exits
    with status 1) otherwise.  Raises KeyError if conllMCDr lacks one of the
    required column names.
    """
    idIndex = int(conllMCDr["ID"])
    govIndex = int(conllMCDr["GOV"])
    labelIndex = int(conllMCDr["LABEL"])

    # Make sure every line is wide enough before indexing into it.
    nbColsNeeded = max(idIndex, govIndex, labelIndex) + 1
    for i, line in enumerate(sentence):
        if len(line) < nbColsNeeded:
            errorAndExit("ERROR line %d has too few columns :" % i, sentence)

    curId = 0      # ID of the last plain word seen
    curDigit = 1   # expected minor index of the next empty node
    multiWordEmptyNodes = set()  # line indexes that carry no governor

    # Verifying IDS
    for i, line in enumerate(sentence):
        idStr = line[idIndex]
        rangeMatch = RANGE_ID_RE.fullmatch(idStr)
        emptyMatch = EMPTY_ID_RE.fullmatch(idStr)
        if WORD_ID_RE.fullmatch(idStr):
            curId += 1
            curDigit = 1
            idOk = int(idStr) == curId
        elif rangeMatch:
            curDigit = 1
            multiWordEmptyNodes.add(i)
            first, last = (int(part) for part in rangeMatch.groups())
            # A range must start at the next word and span at least two words.
            idOk = first == curId + 1 and first < last
        elif emptyMatch:
            multiWordEmptyNodes.add(i)
            major, minor = (int(part) for part in emptyMatch.groups())
            # An empty node follows word `major` and is numbered from 1.
            idOk = major == curId and minor == curDigit
            curDigit += 1
        else:
            idOk = False
        if not idOk:
            errorAndExit("ERROR in IDs :", sentence)

    # Verifying root
    nbRoot = sum(1 for line in sentence if line[labelIndex] == "root")
    if nbRoot != 1:
        errorAndExit("ERROR %d root in sentence :" % nbRoot, sentence)

    # Verifying govs
    # IDs were verified to be 1..curId, so curId is the largest valid target.
    for i, line in enumerate(sentence):
        if i in multiWordEmptyNodes:
            continue
        govStr = line[govIndex]
        if not WORD_ID_RE.fullmatch(govStr):
            errorAndExit("ERROR line %d gov \'%s\' is not positive integer :"
                         % (i, govStr), sentence)
        if int(govStr) > curId:
            errorAndExit("ERROR line %d gov \'%s\' is out of sentence :"
                         % (i, govStr), sentence)


def iterSentences(lines):
    """Yield the sentences found in an iterable of text lines.

    A sentence is a list of token lines, each split on tabs.  Blank lines
    separate sentences and lines starting with '#' are skipped.  Empty
    sentences (e.g. consecutive blank lines) are not yielded, and a final
    sentence that is not followed by a blank line is still yielded.
    """
    sentence = []
    for line in lines:
        clean = line.strip()
        if not clean:
            if sentence:
                yield sentence
                sentence = []
            continue
        if clean[0] == '#':
            continue
        sentence.append(clean.split('\t'))
    if sentence:
        yield sentence


def main():
    """Entry point: check every sentence of the file given on the command line."""
    if len(sys.argv) != 3:
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    conllMCDr = {v: k for k, v in conllMCD.items()}

    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for sentence in iterSentences(conlluFile):
            checkSentence(sentence, conllMCD, conllMCDr)


if __name__ == "__main__":
    main()