Skip to content
Snippets Groups Projects
Commit 58445ee5 authored by Franck Dary
Browse files

Improved the problem checker script: it now reports all problems along with their line numbers instead of exiting at the first one

parent d7f6d0b5
No related branches found
No related tags found
No related merge requests found
...@@ -24,15 +24,14 @@ def readMCD(mcdFilename) : ...@@ -24,15 +24,14 @@ def readMCD(mcdFilename) :
return mcd return mcd
def errorAndExit(message, sentence) : def logError(message, sentence) :
print(message) print(message)
for line in sentence : for line in sentence :
for col in line : for col in line :
print(col,end="\t") print(col,end="\t")
print("") print("")
exit(1)
def checkSentence(sentence, conllMCD, conllMCDr) : def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) :
idIndex = int(conllMCDr[idColName]) idIndex = int(conllMCDr[idColName])
govIndex = int(conllMCDr[headColName]) govIndex = int(conllMCDr[headColName])
labelIndex = int(conllMCDr[deprelColName]) labelIndex = int(conllMCDr[deprelColName])
...@@ -48,11 +47,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) : ...@@ -48,11 +47,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
for col in sentence[i] : for col in sentence[i] :
if len(col) == 0 : if len(col) == 0 :
errorAndExit("Empty column", sentence) logError("Empty column on line %s"%(fileLineIndex+i), sentence)
return
idStr = sentence[i][idIndex] idStr = sentence[i][idIndex]
if idStr in id2index : if idStr in id2index :
errorAndExit("ERROR in IDs : '%s' already seen"%idStr, sentence) logError("ERROR in IDs : line %s '%s' already seen"%(fileLineIndex+i,idStr), sentence)
id2index[idStr] = i id2index[idStr] = i
...@@ -61,21 +61,21 @@ def checkSentence(sentence, conllMCD, conllMCDr) : ...@@ -61,21 +61,21 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
splited = idStr.split('-') splited = idStr.split('-')
multiWordEmptyNodes.add(i) multiWordEmptyNodes.add(i)
if int(splited[0]) != curId+1 or int(splited[0]) >= int(splited[1]) : if int(splited[0]) != curId+1 or int(splited[0]) >= int(splited[1]) :
errorAndExit("ERROR in IDs : %s"%idStr, sentence) logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
elif len(idStr.split('.')) == 2 : elif len(idStr.split('.')) == 2 :
multiWordEmptyNodes.add(i) multiWordEmptyNodes.add(i)
splited = idStr.split('.') splited = idStr.split('.')
if int(splited[0]) != curId or int(splited[1]) != curDigit : if int(splited[0]) != curId or int(splited[1]) != curDigit :
errorAndExit("ERROR in IDs : %s"%idStr, sentence) logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
curDigit += 1 curDigit += 1
elif idStr.isdigit() : elif idStr.isdigit() :
curId += 1 curId += 1
curDigit = 1 curDigit = 1
maxId = max(maxId, int(idStr)) maxId = max(maxId, int(idStr))
if int(idStr) != curId : if int(idStr) != curId :
errorAndExit("ERROR in IDs : %s"%idStr, sentence) logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
else : else :
errorAndExit("ERROR in IDs : %s"%idStr, sentence) logError("ERROR in line %s, IDs : %s"%(fileLineIndex+i,idStr), sentence)
nbRoot = 0 nbRoot = 0
# Verifying root # Verifying root
...@@ -85,7 +85,7 @@ def checkSentence(sentence, conllMCD, conllMCDr) : ...@@ -85,7 +85,7 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
nbRoot += 1 nbRoot += 1
if nbRoot != 1 : if nbRoot != 1 :
errorAndExit("ERROR %d root in sentence :"%nbRoot, sentence) logError("ERROR %d root in sentence on line %s:"%(nbRoot,fileLineIndex), sentence)
# Verifying govs # Verifying govs
for i in range(len(sentence)) : for i in range(len(sentence)) :
...@@ -93,11 +93,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) : ...@@ -93,11 +93,12 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
continue continue
govStr = sentence[i][govIndex] govStr = sentence[i][govIndex]
if not govStr.isdigit() : if not govStr.isdigit() :
errorAndExit("ERROR line %d gov \'%s\' is not positive integer :"%(i+1,govStr), sentence) logError("ERROR line %d gov \'%s\' is not positive integer :"%(fileLineIndex+i,govStr), sentence)
if int(govStr) > maxId : if int(govStr) > maxId :
errorAndExit("ERROR line %d gov \'%s\' is out of sentence :"%(i+1,govStr), sentence) logError("ERROR line %d gov \'%s\' is out of sentence :"%(fileLineIndex+i,govStr), sentence)
# Verifying cycles # Verifying cycles
alreadyReported = {}
for i in range(len(sentence)) : for i in range(len(sentence)) :
if i in multiWordEmptyNodes : if i in multiWordEmptyNodes :
continue continue
...@@ -110,7 +111,10 @@ def checkSentence(sentence, conllMCD, conllMCDr) : ...@@ -110,7 +111,10 @@ def checkSentence(sentence, conllMCD, conllMCDr) :
break break
currentNode = id2index[govStr] currentNode = id2index[govStr]
if currentNode in alreadySeen : if currentNode in alreadySeen :
errorAndExit("ERROR line %d (id=%s) loop in governors :"%(currentNode+1, sentence[currentNode][idIndex]), sentence) if currentNode not in alreadyReported :
logError("ERROR line %d (id=%s) loop in governors :"%(fileLineIndex+currentNode, sentence[currentNode][idIndex]), sentence)
alreadyReported[currentNode] = True
break
################################################################################ ################################################################################
...@@ -123,14 +127,20 @@ if __name__ == "__main__" : ...@@ -123,14 +127,20 @@ if __name__ == "__main__" :
conllMCDr = {v: k for k, v in conllMCD.items()} conllMCDr = {v: k for k, v in conllMCD.items()}
sentence = [] sentence = []
fileLineIndex = 0
sentFirstLine = -1
for line in open(sys.argv[1], "r", encoding="utf8") : for line in open(sys.argv[1], "r", encoding="utf8") :
fileLineIndex += 1
clean = line.strip() clean = line.strip()
if len(clean) == 0 : if len(clean) == 0 :
checkSentence(sentence, conllMCD, conllMCDr) checkSentence(sentFirstLine, sentence, conllMCD, conllMCDr)
sentence = [] sentence = []
sentFirstLine = -1
continue continue
if clean[0] == '#' : if clean[0] == '#' :
continue continue
if sentFirstLine == -1 :
sentFirstLine = fileLineIndex
sentence.append(clean.split('\t')) sentence.append(clean.split('\t'))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment