Skip to content
Snippets Groups Projects
Commit 58445ee5 authored by Franck Dary
Browse files

Improved problem checker script: it now prints all problems along with their line numbers

parent d7f6d0b5
No related branches found
No related tags found
No related merge requests found
......@@ -24,15 +24,14 @@ def readMCD(mcdFilename) :
return mcd
def errorAndExit(message, sentence) :
def logError(message, sentence) :
print(message)
for line in sentence :
for col in line :
print(col,end="\t")
print("")
exit(1)
def checkSentence(sentence, conllMCD, conllMCDr) :
def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) :
idIndex = int(conllMCDr[idColName])
govIndex = int(conllMCDr[headColName])
labelIndex = int(conllMCDr[deprelColName])
......@@ -48,11 +47,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
for col in sentence[i] :
if len(col) == 0 :
errorAndExit("Empty column", sentence)
logError("Empty column on line %s"%(fileLineIndex+i), sentence)
return
idStr = sentence[i][idIndex]
if idStr in id2index :
errorAndExit("ERROR in IDs : '%s' already seen"%idStr, sentence)
logError("ERROR in IDs : line %s '%s' already seen"%(fileLineIndex+i,idStr), sentence)
id2index[idStr] = i
......@@ -61,21 +61,21 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
splited = idStr.split('-')
multiWordEmptyNodes.add(i)
if int(splited[0]) != curId+1 or int(splited[0]) >= int(splited[1]) :
errorAndExit("ERROR in IDs : %s"%idStr, sentence)
logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
elif len(idStr.split('.')) == 2 :
multiWordEmptyNodes.add(i)
splited = idStr.split('.')
if int(splited[0]) != curId or int(splited[1]) != curDigit :
errorAndExit("ERROR in IDs : %s"%idStr, sentence)
logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
curDigit += 1
elif idStr.isdigit() :
curId += 1
curDigit = 1
maxId = max(maxId, int(idStr))
if int(idStr) != curId :
errorAndExit("ERROR in IDs : %s"%idStr, sentence)
logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
else :
errorAndExit("ERROR in IDs : %s"%idStr, sentence)
logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
nbRoot = 0
# Verifying root
......@@ -85,7 +85,7 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
nbRoot += 1
if nbRoot != 1 :
errorAndExit("ERROR %d root in sentence :"%nbRoot, sentence)
logError("ERROR %d root in sentence on line %s:"%(nbRoot,fileLineIndex), sentence)
# Verifying govs
for i in range(len(sentence)) :
......@@ -93,11 +93,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
continue
govStr = sentence[i][govIndex]
if not govStr.isdigit() :
errorAndExit("ERROR line %d gov \'%s\' is not positive integer :"%(i+1,govStr), sentence)
logError("ERROR line %d gov \'%s\' is not positive integer :"%(fileLineIndex+i,govStr), sentence)
if int(govStr) > maxId :
errorAndExit("ERROR line %d gov \'%s\' is out of sentence :"%(i+1,govStr), sentence)
logError("ERROR line %d gov \'%s\' is out of sentence :"%(fileLineIndex+i,govStr), sentence)
# Verifying cycles
alreadyReported = {}
for i in range(len(sentence)) :
if i in multiWordEmptyNodes :
continue
......@@ -110,7 +111,10 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
break
currentNode = id2index[govStr]
if currentNode in alreadySeen :
errorAndExit("ERROR line %d (id=%s) loop in governors :"%(currentNode+1, sentence[currentNode][idIndex]), sentence)
if currentNode not in alreadyReported :
logError("ERROR line %d (id=%s) loop in governors :"%(fileLineIndex+currentNode, sentence[currentNode][idIndex]), sentence)
alreadyReported[currentNode] = True
break
################################################################################
......@@ -123,14 +127,20 @@ if __name__ == "__main__" :
conllMCDr = {v: k for k, v in conllMCD.items()}
sentence = []
fileLineIndex = 0
sentFirstLine = -1
for line in open(sys.argv[1], "r", encoding="utf8") :
fileLineIndex += 1
clean = line.strip()
if len(clean) == 0 :
checkSentence(sentence, conllMCD, conllMCDr)
checkSentence(sentFirstLine, sentence, conllMCD, conllMCDr)
sentence = []
sentFirstLine = -1
continue
if clean[0] == '#' :
continue
if sentFirstLine == -1 :
sentFirstLine = fileLineIndex
sentence.append(clean.split('\t'))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment