#! /usr/bin/python3
"""Sanity-check the sentences of a tab-separated CoNLL-U style file.

Usage: conlluCheckProblems.py file.conllu mcd

The mcd file lists one column name per line, in column order.  It must name
the ID, HEAD and DEPREL columns.  Each sentence of the input is checked for
empty columns, badly numbered IDs, a root count different from one, and heads
that are not integers or point past the end of the sentence.  The script
stops at the first problem found, prints the offending sentence and exits
with status 1.
"""

import sys

headColName = "HEAD"
deprelColName = "DEPREL"
idColName = "ID"


def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s file.conllu mcd" % sys.argv[0], file=sys.stderr)
    sys.exit(1)


def readMCD(mcdFilename):
    """Read a column-description file.

    Blank lines and lines starting with '#' are ignored.  Every other line
    must hold exactly one space-free token, the name of a column.

    Args:
        mcdFilename: path of the UTF-8 encoded description file.

    Returns:
        A dict mapping the 0-based column position (order of appearance in
        the file) to the column name.

    Exits with status 1 when a line contains more than one token.
    """
    mcd = {}
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            # Work on the stripped text so that surrounding blanks and the
            # trailing newline never count as part of the column name.
            clean = line.strip()
            if not clean or clean[0] == '#':
                continue
            if len(clean.split(' ')) != 1:
                print("ERROR : invalid mcd line '%s'. Aborting" % clean,
                      file=sys.stderr)
                sys.exit(1)
            mcd[len(mcd)] = clean

    return mcd


def errorAndExit(message, sentence):
    """Print `message`, then the sentence, and exit with status 1.

    Args:
        message: text printed first, on its own line.
        sentence: list of rows, each row a list of column strings.  Every
            column is printed followed by a tab, one row per line.
    """
    print(message)
    for line in sentence:
        for col in line:
            print(col, end="\t")
        print("")
    sys.exit(1)


def _parseIndex(text):
    """Return `text` as a non-negative int, or None if it is not one.

    str.isdigit() accepts some characters (e.g. superscript digits) that
    int() rejects, hence the extra ValueError guard.
    """
    if not text.isdigit():
        return None
    try:
        return int(text)
    except ValueError:
        return None


def checkSentence(sentence, conllMCD, conllMCDr):
    """Validate one sentence, exiting through errorAndExit on any problem.

    Args:
        sentence: list of rows, each row a list of column strings.
        conllMCD: column position -> column name (not used here, kept for
            interface compatibility).
        conllMCDr: column name -> column position; must contain the ID, HEAD
            and DEPREL column names.

    Returns:
        None when the sentence passes every check.

    Checks, in order:
      * no column is empty and every row reaches the needed columns;
      * plain IDs count 1, 2, 3, ...; a range "a-b" must start at the next
        plain ID with a < b; a decimal "a.b" must follow plain ID a with b
        counting 1, 2, ... since that plain ID;
      * exactly one row has the DEPREL value "root";
      * every HEAD of a plain-ID row is a non-negative integer no greater
        than the largest plain ID.
    """
    idIndex = int(conllMCDr[idColName])
    govIndex = int(conllMCDr[headColName])
    labelIndex = int(conllMCDr[deprelColName])
    lastNeeded = max(idIndex, govIndex, labelIndex)

    curId = 0       # last plain ID seen
    curDigit = 1    # expected minor part of the next "a.b" ID
    maxId = 0
    # Rows whose ID is a range or a decimal; their HEAD is not checked.
    multiWordEmptyNodes = set()

    # Verifying IDs
    for i, row in enumerate(sentence):
        if any(len(col) == 0 for col in row):
            errorAndExit("Empty column", sentence)
        if len(row) <= lastNeeded:
            errorAndExit("ERROR line %d has too few columns :" % (i + 1),
                         sentence)

        idStr = row[idIndex]
        rangeParts = idStr.split('-')
        decimalParts = idStr.split('.')

        if len(rangeParts) == 2:
            curDigit = 1
            multiWordEmptyNodes.add(i)
            first = _parseIndex(rangeParts[0])
            last = _parseIndex(rangeParts[1])
            if (first is None or last is None
                    or first != curId + 1 or first >= last):
                errorAndExit("ERROR in IDs : %s" % idStr, sentence)
        elif len(decimalParts) == 2:
            multiWordEmptyNodes.add(i)
            major = _parseIndex(decimalParts[0])
            minor = _parseIndex(decimalParts[1])
            if major is None or major != curId or minor != curDigit:
                errorAndExit("ERROR in IDs : %s" % idStr, sentence)
            curDigit += 1
        else:
            value = _parseIndex(idStr)
            if value is None:
                errorAndExit("ERROR in IDs : %s" % idStr, sentence)
            curId += 1
            curDigit = 1
            maxId = max(maxId, value)
            if value != curId:
                errorAndExit("ERROR in IDs : %s" % idStr, sentence)

    # Verifying root
    nbRoot = sum(1 for row in sentence if row[labelIndex] == "root")
    if nbRoot != 1:
        errorAndExit("ERROR %d root in sentence :" % nbRoot, sentence)

    # Verifying govs
    for i, row in enumerate(sentence):
        if i in multiWordEmptyNodes:
            continue
        govStr = row[govIndex]
        gov = _parseIndex(govStr)
        if gov is None:
            errorAndExit("ERROR line %d gov '%s' is not positive integer :"
                         % (i + 1, govStr), sentence)
        if gov > maxId:
            errorAndExit("ERROR line %d gov '%s' is out of sentence :"
                         % (i + 1, govStr), sentence)


def main():
    """Check every sentence of the file named on the command line."""
    if len(sys.argv) != 3:
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    conllMCDr = {v: k for k, v in conllMCD.items()}

    sentence = []

    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for line in conlluFile:
            clean = line.strip()
            if len(clean) == 0:
                # A blank line closes the current sentence.
                checkSentence(sentence, conllMCD, conllMCDr)
                sentence = []
                continue
            if clean[0] == '#':
                continue
            sentence.append(clean.split('\t'))

    # The file may end without a final blank line; do not skip the rows
    # collected since the last one.
    if sentence:
        checkSentence(sentence, conllMCD, conllMCDr)


if __name__ == "__main__":
    main()