Select Git revision
conllu2splits.py 1.93 KiB
#! /usr/bin/python3
import sys
# Split rules collected across all sentences. computeRules() fills it as
# {multiword form: {"@part1@part2...": number of times this split was seen}}.
rules = {}
# Tag written by main() at the start of every emitted rule line.
prefix = "SPLITWORD "
def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print("USAGE : %s file.conllu conllu.mcd"%sys.argv[0], file=sys.stderr)
    # sys.exit rather than the bare exit() helper: the latter is injected by
    # the site module for interactive use and is absent under `python -S`.
    sys.exit(1)
def readMCD(mcdFilename):
    """Read an MCD file and return its content as a dict.

    Every meaningful line holds exactly two whitespace-separated fields; the
    first becomes the key and the second the value (main() uses the values
    as column names and the keys as column indexes). Blank lines and lines
    whose first non-blank character is '#' are ignored.

    Args:
        mcdFilename: path of the UTF-8 encoded MCD file.

    Returns:
        dict mapping the first field of each line to its second field.

    Raises:
        SystemExit: with code 1 when a line does not hold exactly two fields.
    """
    mcd = {}
    # The context manager guarantees the file is closed, even on early exit.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            clean = line.strip()
            # Test the stripped line so that indented comments and
            # whitespace-only lines are skipped instead of being rejected.
            if not clean or clean.startswith("#"):
                continue
            # Split on any run of whitespace: tolerates tabs, repeated
            # spaces and trailing blanks between/after the two fields.
            fields = clean.split()
            if len(fields) != 2:
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[fields[0]] = fields[1]
    return mcd
def computeRules(sentence):
    """Record, in the global `rules`, how each multiword token is split.

    Args:
        sentence: list of [id, form] pairs for one sentence. An id that
            contains '-' (e.g. "3-5") denotes a multiword token; any other
            id denotes a regular word.

    Side effects:
        For every multiword token, increments
        rules[form]["@form1@form2..."] where form1, form2, ... are the forms
        of the words it covers.

    Raises:
        KeyError: if a multiword token refers to a word id that is not
            present in the sentence.
    """
    # Index the regular words (ids without '-') by id.
    wordById = {}
    for word in sentence:
        if "-" not in word[0]:
            wordById[word[0]] = word[1]

    for word in sentence:
        bounds = word[0].split("-")
        if len(bounds) < 2:
            continue

        wordIds = bounds
        if len(bounds) == 2 and bounds[0].isdecimal() and bounds[1].isdecimal():
            # "3-5" is an inclusive range covering words 3, 4 and 5: add the
            # ids strictly between the two bounds. The bounds themselves are
            # kept verbatim so two-word ranges behave exactly as before.
            middle = [str(i) for i in range(int(bounds[0]) + 1, int(bounds[1]))]
            wordIds = [bounds[0], *middle, bounds[1]]

        rule = "".join("@" + wordById[wordId] for wordId in wordIds)
        counts = rules.setdefault(word[1], {})
        counts[rule] = counts.get(rule, 0) + 1
def main():
    """Extract multiword split rules from a CoNLL-U file and print them.

    Expects two command-line arguments: the CoNLL-U file and the MCD file
    describing its columns. Prints one line per multiword form on stdout,
    made of `prefix`, the form and its first recorded split rule; forms with
    several distinct rules trigger a warning on stderr.
    """
    # Rebind stdout on file descriptor 1 so that output is always UTF-8.
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)

    if len(sys.argv) != 3:
        printUsageAndExit()

    # The MCD maps column index -> column name; invert it to locate columns.
    conllMCD = readMCD(sys.argv[2])
    conllMCDr = {v: k for k, v in conllMCD.items()}
    idId = int(conllMCDr["ID"])
    idForm = int(conllMCDr["FORM"])

    sentence = []
    with open(sys.argv[1], "r", encoding="utf8") as conllFile:
        for line in conllFile:
            clean = line.strip()
            # Blank (or near-blank) lines and comment lines end a sentence.
            if len(clean) < 2 or line[0] == '#':
                if sentence:
                    computeRules(sentence)
                sentence = []
                continue
            columns = clean.split('\t')
            sentence.append([columns[idId], columns[idForm]])

    # The file may not end with a blank line: do not lose the last sentence.
    if sentence:
        computeRules(sentence)

    for word, wordRules in rules.items():
        if len(wordRules) > 1:
            print("WARNING : Abiguity detected in \'%s\'"%(word+" "+str(wordRules)), file=sys.stderr)
        # Only the first recorded rule of each form is emitted.
        for rule in wordRules:
            print(prefix+word+rule)
            break
# Script entry point: run the conversion only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()