diff --git a/scripts/conlluCheckProblems.py b/scripts/conlluCheckProblems.py index abb13d3f0b504451f4fdc8dc9c95e0bba95cd22e..03b0a75998dd1e938cb440dbadb263c202002d16 100755 --- a/scripts/conlluCheckProblems.py +++ b/scripts/conlluCheckProblems.py @@ -24,15 +24,14 @@ def readMCD(mcdFilename) : return mcd -def errorAndExit(message, sentence) : +def logError(message, sentence) : print(message) for line in sentence : for col in line : print(col,end="\t") print("") - exit(1) -def checkSentence(sentence, conllMCD, conllMCDr) : +def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : idIndex = int(conllMCDr[idColName]) govIndex = int(conllMCDr[headColName]) labelIndex = int(conllMCDr[deprelColName]) @@ -48,11 +47,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) : for col in sentence[i] : if len(col) == 0 : - errorAndExit("Empty column", sentence) + logError("Empty column on line %s"%(fileLineIndex+i), sentence) + return idStr = sentence[i][idIndex] if idStr in id2index : - errorAndExit("ERROR in IDs : '%s' already seen"%idStr, sentence) + logError("ERROR in IDs : line %s '%s' already seen"%(fileLineIndex+i,idStr), sentence) id2index[idStr] = i @@ -61,21 +61,21 @@ def checkSentence(sentence, conllMCD, conllMCDr) : splited = idStr.split('-') multiWordEmptyNodes.add(i) if int(splited[0]) != curId+1 or int(splited[0]) >= int(splited[1]) : - errorAndExit("ERROR in IDs : %s"%idStr, sentence) + logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) elif len(idStr.split('.')) == 2 : multiWordEmptyNodes.add(i) splited = idStr.split('.') if int(splited[0]) != curId or int(splited[1]) != curDigit : - errorAndExit("ERROR in IDs : %s"%idStr, sentence) + logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) curDigit += 1 elif idStr.isdigit() : curId += 1 curDigit = 1 maxId = max(maxId, int(idStr)) if int(idStr) != curId : - errorAndExit("ERROR in IDs : %s"%idStr, sentence) + logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) else : - errorAndExit("ERROR in IDs : %s"%idStr, sentence) + logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) nbRoot = 0 # Verifying root @@ -85,7 +85,7 @@ def checkSentence(sentence, conllMCD, conllMCDr) : nbRoot += 1 if nbRoot != 1 : - errorAndExit("ERROR %d root in sentence :"%nbRoot, sentence) + logError("ERROR %d root in sentence on line %s:"%(nbRoot,fileLineIndex), sentence) # Verifying govs for i in range(len(sentence)) : @@ -93,11 +93,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) : continue govStr = sentence[i][govIndex] if not govStr.isdigit() : - errorAndExit("ERROR line %d gov \'%s\' is not positive integer :"%(i+1,govStr), sentence) + logError("ERROR line %d gov \'%s\' is not positive integer :"%(fileLineIndex+i,govStr), sentence) if int(govStr) > maxId : - errorAndExit("ERROR line %d gov \'%s\' is out of sentence :"%(i+1,govStr), sentence) + logError("ERROR line %d gov \'%s\' is out of sentence :"%(fileLineIndex+i,govStr), sentence) # Verifying cycles + alreadyReported = {} for i in range(len(sentence)) : if i in multiWordEmptyNodes : continue @@ -110,7 +111,10 @@ def checkSentence(sentence, conllMCD, conllMCDr) : break currentNode = id2index[govStr] if currentNode in alreadySeen : - errorAndExit("ERROR line %d (id=%s) loop in governors :"%(currentNode+1, sentence[currentNode][idIndex]), sentence) + if currentNode not in alreadyReported : + logError("ERROR line %d (id=%s) loop in governors :"%(fileLineIndex+currentNode, sentence[currentNode][idIndex]), sentence) + alreadyReported[currentNode] = True + break ################################################################################ @@ -123,14 +127,20 @@ if __name__ == "__main__" : conllMCDr = {v: k for k, v in conllMCD.items()} sentence = [] + fileLineIndex = 0 + sentFirstLine = -1 for line in open(sys.argv[1], "r", encoding="utf8") : + fileLineIndex += 1 clean = line.strip() if len(clean) == 0 : - checkSentence(sentence, conllMCD, conllMCDr) + checkSentence(sentFirstLine, sentence, conllMCD, conllMCDr) sentence = [] + sentFirstLine = -1 continue if clean[0] == '#' : continue + if sentFirstLine == -1 : + sentFirstLine = fileLineIndex sentence.append(clean.split('\t'))