diff --git a/scripts/conlluCheckProblems.py b/scripts/conlluCheckProblems.py index e912e0a92c0346d530199c1a4aabed59308589c1..c255f9a86169a462553ff191222c57b1c15634bf 100755 --- a/scripts/conlluCheckProblems.py +++ b/scripts/conlluCheckProblems.py @@ -41,6 +41,7 @@ def checkSentence(sentence, conllMCD, conllMCDr) : curDigit = 1 maxId = 0 multiWordEmptyNodes = set() + id2index = {} # Verifying IDS for i in range(len(sentence)) : @@ -50,6 +51,10 @@ def checkSentence(sentence, conllMCD, conllMCDr) : errorAndExit("Empty column", sentence) idStr = sentence[i][idIndex] + if idStr in id2index : + errorAndExit("ERROR in IDs : '%s' already seen"%idStr, sentence) + + id2index[idStr] = i if len(idStr.split('-')) == 2 : curDigit = 1 @@ -92,6 +97,24 @@ def checkSentence(sentence, conllMCD, conllMCDr) : if int(govStr) > maxId : errorAndExit("ERROR line %d gov \'%s\' is out of sentence :"%(i+1,govStr), sentence) +# Verifying cycles + for i in range(len(sentence)) : + if i in multiWordEmptyNodes : + continue + alreadySeen = {} + currentNode = i + while True : + alreadySeen[currentNode] = True + govStr = sentence[currentNode][govIndex] + if govStr == "0" : + break + currentNode = id2index[govStr] + if currentNode in alreadySeen : + errorAndExit("ERROR line %d loop in governors :"%(i+1), sentence) +################################################################################ + + +################################################################################ if __name__ == "__main__" : if len(sys.argv) != 3 : printUsageAndExit()