diff --git a/scripts/conlluCheckProblems.py b/scripts/conlluCheckProblems.py index 83307bda807bf27c82d9ce52d95f7216ba983445..affdeb232ec69430eedc34463102a322691249b8 100755 --- a/scripts/conlluCheckProblems.py +++ b/scripts/conlluCheckProblems.py @@ -24,11 +24,11 @@ def checkMCD(mcd) : ################################################################################ def logError(message, sentence) : - print(message) + print(message, file=sys.stderr) for line in sentence : for col in line : - print(col,end="\t") - print("") + print(col,end="\t", file=sys.stderr) + print("", file=sys.stderr) ################################################################################ ################################################################################ @@ -43,17 +43,19 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : multiWordEmptyNodes = set() id2index = {} + hadErr = False # Verifying IDS for i in range(len(sentence)) : for col in sentence[i] : if len(col) == 0 : logError("Empty column on line %s"%(fileLineIndex+i), sentence) - return + return False idStr = sentence[i][idIndex] if idStr in id2index : logError("ERROR in IDs : line %s '%s' already seen"%(fileLineIndex+i,idStr), sentence) + hadErr = True id2index[idStr] = i @@ -63,11 +65,13 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : multiWordEmptyNodes.add(i) if int(splited[0]) != curId+1 or int(splited[0]) >= int(splited[1]) : logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) + hadErr = True elif len(idStr.split('.')) == 2 : multiWordEmptyNodes.add(i) splited = idStr.split('.') if int(splited[0]) != curId or int(splited[1]) != curDigit : logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) + hadErr = True curDigit += 1 elif idStr.isdigit() : curId += 1 @@ -75,8 +79,10 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : maxId = max(maxId, int(idStr)) if int(idStr) != curId : logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) + hadErr = True else : logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence) + hadErr = True nbRoot = 0 # Verifying root @@ -87,6 +93,7 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : if nbRoot != 1 : logError("ERROR %d root in sentence on line %s:"%(nbRoot,fileLineIndex), sentence) + hadErr = True # Verifying govs for i in range(len(sentence)) : @@ -95,8 +102,11 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : govStr = sentence[i][govIndex] if not govStr.isdigit() : logError("ERROR line %d gov \'%s\' is not positive integer :"%(fileLineIndex+i,govStr), sentence) + hadErr = True + if int(govStr) > maxId : logError("ERROR line %d gov \'%s\' is out of sentence :"%(fileLineIndex+i,govStr), sentence) + hadErr = True # Verifying cycles alreadyReported = {} @@ -114,8 +124,11 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : if currentNode in alreadySeen : if currentNode not in alreadyReported : logError("ERROR line %d (id=%s) loop in governors :"%(fileLineIndex+currentNode, sentence[currentNode][idIndex]), sentence) + hadErr = True alreadyReported[currentNode] = True break + + return not hadErr ################################################################################ @@ -139,13 +152,17 @@ if __name__ == "__main__" : if len(clean) < 3 : if sentFirstLine == -1 : exit(1) - checkSentence(sentFirstLine, sentence, conllMCDr, conllMCD) + if checkSentence(sentFirstLine, sentence, conllMCDr, conllMCD) : + for line in sentence : + print("\t".join(line)) + print("") sentence = [] sentFirstLine = -1 continue if clean[0] == '#' : splited = line.split("global.columns =") if len(splited) > 1 : + print(line.strip()) conllMCD, conllMCDr = readMCD(splited[-1].strip()) checkMCD(conllMCD) continue