Skip to content
Snippets Groups Projects
Commit a25b9b33 authored by Franck Dary's avatar Franck Dary
Browse files

Added script to check for problems in the CoNLL-U format

parent 03051b00
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/python3
import sys
# Names of the MCD columns that checkSentence looks up in the reversed MCD.
headColName = "HEAD"  # column holding the governor (head) of each token
deprelColName = "DEPREL"  # column holding the dependency label, compared to "root"
idColName = "ID"  # column holding the token ID
def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1.

    The program name shown in the message is taken from ``sys.argv[0]``.

    Raises:
        SystemExit: always, with code 1.
    """
    print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr)
    # sys.exit instead of the bare exit(): the latter is injected by the
    # site module for interactive sessions and is not guaranteed to exist.
    sys.exit(1)
def readMCD(mcdFilename):
    """Read an MCD file and return its columns as an index -> name mapping.

    Every line is stripped of surrounding whitespace first. Empty lines and
    lines whose first character is '#' are skipped. Each remaining line must
    hold exactly one space-free token, the name of a column; columns are
    numbered from 0 in their order of appearance.

    Args:
        mcdFilename: path of the MCD file, read as UTF-8.

    Returns:
        dict mapping the column index (int) to the column name (str).

    Raises:
        SystemExit: with code 1 (after a message on stderr) when a line
            contains more than one space-separated token.
    """
    mcd = {}
    # The context manager guarantees the file is closed, even on early exit.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            # Work on the stripped line so that trailing spaces, indentation
            # and a missing final newline do not affect parsing.
            clean = line.strip()
            if not clean or clean[0] == '#':
                continue
            if ' ' in clean:
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[len(mcd)] = clean
    return mcd
def errorAndExit(message, sentence):
    """Print an error message and the offending sentence, then exit.

    The message is printed on stdout, followed by one output line per line
    of the sentence, in which every column is followed by a tab.

    Args:
        message: text describing the problem.
        sentence: iterable of lines, each an iterable of column values.

    Raises:
        SystemExit: always, with code 1.
    """
    print(message)
    for line in sentence:
        # Every column, including the last one, is followed by a tab.
        print("".join("%s\t" % (col,) for col in line))
    # sys.exit instead of the bare exit(): the latter is injected by the
    # site module for interactive sessions and is not guaranteed to exist.
    sys.exit(1)
def checkSentence(sentence, conllMCD, conllMCDr):
    """Check the IDs, the root and the governors of one sentence.

    The following rules are verified, in this order:

    * every line has enough columns to hold the ID, HEAD and DEPREL columns,
      and no column is empty;
    * plain integer IDs are consecutive, starting at 1;
    * a range ID "a-b" has a equal to the next plain ID, and a < b;
    * a decimal ID "a.b" has a equal to the last plain ID seen, and b counts
      up from 1 (the count restarts after each plain or range ID);
    * exactly one line has the dependency label "root";
    * on every plain-ID line the governor is made of digits only and is not
      greater than the largest plain ID of the sentence.

    On the first violation errorAndExit is called with a message and the
    whole sentence.

    Args:
        sentence: list of lines, each a list of column strings.
        conllMCD: column index -> column name mapping. Not used here; kept so
            that existing callers keep working.
        conllMCDr: column name -> column index mapping; it is indexed with
            idColName, headColName and deprelColName.
    """
    idIndex = int(conllMCDr[idColName])
    govIndex = int(conllMCDr[headColName])
    labelIndex = int(conllMCDr[deprelColName])
    nbRequiredCols = max(idIndex, govIndex, labelIndex) + 1

    curId = 0  # last plain integer ID seen
    curDigit = 1  # decimal part expected for the next "a.b" ID
    maxId = 0  # largest plain integer ID of the sentence
    multiWordEmptyNodes = set()  # indexes of the lines with "a-b" or "a.b" IDs

    # Verifying IDS
    for i, line in enumerate(sentence):
        # Report short lines instead of failing later with an IndexError.
        if len(line) < nbRequiredCols:
            errorAndExit("ERROR line %d has too few columns :"%(i+1), sentence)
        for col in line:
            if len(col) == 0:
                errorAndExit("Empty column", sentence)
        idStr = line[idIndex]
        rangeParts = idStr.split('-')
        decimalParts = idStr.split('.')
        if len(rangeParts) == 2:
            curDigit = 1
            multiWordEmptyNodes.add(i)
            # isdecimal() is tested first so that malformed parts such as
            # "a-b" or "-1" are reported instead of raising in int().
            if (not rangeParts[0].isdecimal() or not rangeParts[1].isdecimal()
                    or int(rangeParts[0]) != curId+1
                    or int(rangeParts[0]) >= int(rangeParts[1])):
                errorAndExit("ERROR in IDs : %s"%idStr, sentence)
        elif len(decimalParts) == 2:
            multiWordEmptyNodes.add(i)
            if (not decimalParts[0].isdecimal() or not decimalParts[1].isdecimal()
                    or int(decimalParts[0]) != curId
                    or int(decimalParts[1]) != curDigit):
                errorAndExit("ERROR in IDs : %s"%idStr, sentence)
            curDigit += 1
        elif idStr.isdecimal():
            # isdecimal() accepts exactly the characters int() can convert,
            # unlike isdigit() which also accepts e.g. superscript digits.
            curId += 1
            curDigit = 1
            maxId = max(maxId, int(idStr))
            if int(idStr) != curId:
                errorAndExit("ERROR in IDs : %s"%idStr, sentence)
        else:
            errorAndExit("ERROR in IDs : %s"%idStr, sentence)

    # Verifying root
    nbRoot = sum(1 for line in sentence if line[labelIndex] == "root")
    if nbRoot != 1:
        errorAndExit("ERROR %d root in sentence :"%nbRoot, sentence)

    # Verifying govs
    for i, line in enumerate(sentence):
        # Lines with range or decimal IDs are not checked for a governor.
        if i in multiWordEmptyNodes:
            continue
        govStr = line[govIndex]
        if not govStr.isdecimal():
            errorAndExit("ERROR line %d gov \'%s\' is not positive integer :"%(i+1,govStr), sentence)
        if int(govStr) > maxId:
            errorAndExit("ERROR line %d gov \'%s\' is out of sentence :"%(i+1,govStr), sentence)
if __name__ == "__main__":
    # Script entry point: expects exactly two arguments, the CoNLL-U file to
    # check and the MCD file describing its columns.
    if len(sys.argv) != 3:
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    # Reverse mapping: column name -> column index.
    conllMCDr = {v: k for k, v in conllMCD.items()}

    sentence = []
    # The context manager guarantees the input file is closed.
    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for line in conlluFile:
            clean = line.strip()
            # A blank line ends the current sentence.
            if len(clean) == 0:
                checkSentence(sentence, conllMCD, conllMCDr)
                sentence = []
                continue
            # Comment lines are ignored.
            if clean[0] == '#':
                continue
            sentence.append(clean.split('\t'))

    # The file may end without a final blank line: the last sentence must
    # still be checked instead of being silently dropped.
    if sentence:
        checkSentence(sentence, conllMCD, conllMCDr)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment