Skip to content
Snippets Groups Projects
Commit 442d956c authored by Franck Dary
Browse files

Script to check problems in conllu files now reads mcd directly from said file

parent 0b8abdb3
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3 #! /usr/bin/env python3
import sys import sys
from readMCD import readMCD
headColName = "HEAD" headColName = "HEAD"
deprelColName = "DEPREL" deprelColName = "DEPREL"
idColName = "ID" idColName = "ID"
################################################################################
def printUsageAndExit() : def printUsageAndExit() :
print("USAGE : %s file.conllu mcd"%sys.argv[0], file=sys.stderr) print("USAGE : %s file.conllu"%sys.argv[0], file=sys.stderr)
exit(1) exit(1)
################################################################################
def readMCD(mcdFilename) : ################################################################################
mcd = {} def checkMCD(mcd) :
for line in open(mcdFilename, "r", encoding="utf8") : for col in [headColName, deprelColName, idColName] :
clean = line.strip() if col not in mcd :
if len(line) < 2 or line[0] == '#' : print("ERROR : column '{}' missing from mcd '{}'"
continue .format(col, " ".join(mcd.keys())), file=sys.stderr)
splited = line.split(' ')
if len(splited) != 1 :
print("ERROR : invalid mcd line \'%s\'. Aborting"%line, file=sys.stderr)
exit(1) exit(1)
mcd[len(mcd)] = splited[0].strip() ################################################################################
return mcd
################################################################################
def logError(message, sentence) :
    """Print an error message followed by the sentence it refers to.

    message  -- text printed first, on its own line.
    sentence -- iterable of rows; each row is an iterable of column values.
                Each row is printed on its own line.
    """
    print(message)
    for line in sentence :
        for col in line :
            # Every column is followed by a tab, so each row ends with a
            # trailing tab before the newline.
            print(col, end="\t")
        print("")
################################################################################
################################################################################
def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) :
idIndex = int(conllMCDr[idColName]) idIndex = int(conllMCDr[idColName])
govIndex = int(conllMCDr[headColName]) govIndex = int(conllMCDr[headColName])
...@@ -120,11 +121,12 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) : ...@@ -120,11 +121,12 @@ def checkSentence(fileLineIndex, sentence, conllMCD, conllMCDr) :
################################################################################ ################################################################################
if __name__ == "__main__" : if __name__ == "__main__" :
if len(sys.argv) != 3 : if len(sys.argv) != 2 :
printUsageAndExit() printUsageAndExit()
conllMCD = readMCD(sys.argv[2]) baseMCD = "ID FORM LEMMA POS XPOS FEATS HEAD DEPREL"
conllMCDr = {v: k for k, v in conllMCD.items()} conllMCD, conllMCDr = readMCD(baseMCD)
checkMCD(conllMCD)
sentence = [] sentence = []
fileLineIndex = 0 fileLineIndex = 0
...@@ -137,11 +139,18 @@ if __name__ == "__main__" : ...@@ -137,11 +139,18 @@ if __name__ == "__main__" :
if len(clean) < 3 : if len(clean) < 3 :
if sentFirstLine == -1 : if sentFirstLine == -1 :
exit(1) exit(1)
checkSentence(sentFirstLine, sentence, conllMCD, conllMCDr) checkSentence(sentFirstLine, sentence, conllMCDr, conllMCD)
sentence = [] sentence = []
sentFirstLine = -1 sentFirstLine = -1
elif clean[0] != '#' : continue
if sentFirstLine == -1 : if clean[0] == '#' :
sentFirstLine = fileLineIndex splited = line.split("global.columns =")
sentence.append(clean.split('\t')) if len(splited) > 1 :
conllMCD, conllMCDr = readMCD(splited[-1].strip())
checkMCD(conllMCD)
continue
if sentFirstLine == -1 :
sentFirstLine = fileLineIndex
sentence.append(clean.split('\t'))
################################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment