Skip to content
Snippets Groups Projects
Commit a3a763db authored by Franck Dary's avatar Franck Dary
Browse files

Added script to get rules of word splitting for conllu

parent deab664e
Branches
No related tags found
No related merge requests found
File moved
#! /usr/bin/python3
import sys
# Split rules gathered so far, filled by computeRules():
# rules[multiword form][rule string] -> number of times that rule was seen.
rules = {}
# Literal prepended to every rule line that main() prints on stdout.
prefix = "SPLITWORD "
def printUsageAndExit():
    """Print the command-line usage on stderr and terminate the program.

    Raises:
        SystemExit: always, with exit status 1.
    """
    print("USAGE : %s file.conllu conllu.mcd"%sys.argv[0], file=sys.stderr)
    # sys.exit() rather than the bare exit(): the latter is only injected by
    # the ``site`` module for interactive use and is missing under ``-S``.
    sys.exit(1)
def readMCD(mcdFilename):
    """Read an MCD file and return its content as a dict.

    Each meaningful line must hold exactly two whitespace-separated fields;
    the first becomes the key and the second the value (the caller inverts
    the mapping and converts the keys with ``int``, e.g. a line ``0 ID``).
    Blank lines and lines whose first non-blank character is ``#`` are
    ignored.

    Args:
        mcdFilename: path of the MCD file, read as UTF-8.

    Returns:
        dict mapping the first field of every line to its second field.

    Raises:
        SystemExit: with status 1 when a line does not have two fields.
    """
    mcd = {}
    # The context manager guarantees the file is closed, even on abort.
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            # Work on the stripped line so that indentation, trailing blanks
            # and the end-of-line character never influence the parsing.
            clean = line.strip()
            if not clean or clean[0] == '#':
                continue
            # split() without argument accepts any run of blanks or tabs.
            splited = clean.split()
            if len(splited) != 2:
                print("ERROR : invalid mcd line \'%s\'. Aborting"%clean, file=sys.stderr)
                sys.exit(1)
            mcd[splited[0]] = splited[1]
    return mcd
def computeRules(sentence):
    """Record the split rule of every multiword token of a sentence.

    The observations are accumulated in the module-level ``rules`` dict as
    ``rules[form][rule] -> count``, where ``rule`` is the concatenation of
    ``"@" + form`` for each word the multiword token is made of.

    Args:
        sentence: list of ``[id, form]`` entries. An id containing ``-``
            denotes a multiword token spanning the words ``first-last``.

    Raises:
        KeyError: when a multiword token refers to a word id that is not
            present in ``sentence``.
    """
    # Index the plain words (ids without '-') by their id.
    wordById = {word[0]: word[1] for word in sentence if "-" not in word[0]}

    for word in sentence:
        bounds = word[0].split("-")
        if len(bounds) < 2:
            continue
        if (len(bounds) == 2 and bounds[0].isdigit() and bounds[1].isdigit()
                and int(bounds[0]) <= int(bounds[1])):
            # "i-j" is an inclusive range: a token such as 1-3 is made of the
            # words 1, 2 and 3, not only of its two endpoints.
            memberIds = [str(i) for i in range(int(bounds[0]), int(bounds[1]) + 1)]
        else:
            # Unexpected id shape: fall back to the listed parts as they are.
            memberIds = bounds
        rule = "".join("@" + wordById[memberId] for memberId in memberIds)
        counts = rules.setdefault(word[1], {})
        counts[rule] = counts.get(rule, 0) + 1
def main():
    """Extract the word-splitting rules of a CoNLL-U file and print them.

    Expects two command-line arguments: the CoNLL-U file and the MCD file
    describing its columns. Every sentence is handed to ``computeRules``;
    then, for each multiword form, one line ``prefix + form + rule`` is
    written on stdout. When several rules were seen for the same form a
    warning is written on stderr and only the first recorded rule is printed.

    Raises:
        SystemExit: with status 1 on wrong usage or when the MCD file does
            not describe usable ID and FORM columns.
    """
    if len(sys.argv) != 3:
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    # Invert the mapping to look a column index up by its name.
    conllMCDr = {v: k for k, v in conllMCD.items()}
    try:
        idId = int(conllMCDr["ID"])
        idForm = int(conllMCDr["FORM"])
    except (KeyError, ValueError):
        print("ERROR : mcd file must give integer indexes for ID and FORM. Aborting", file=sys.stderr)
        sys.exit(1)

    sentence = []
    # Same explicit encoding as the MCD file, independent from the locale;
    # the context manager closes the file.
    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for line in conlluFile:
            # Blank and comment lines delimit sentences.
            if len(line.strip()) < 2 or line[0] == '#':
                if sentence:
                    computeRules(sentence)
                sentence = []
                continue
            splited = line.strip().split('\t')
            sentence.append([splited[idId], splited[idForm]])
    # Do not lose the last sentence when the file lacks a final blank line.
    if sentence:
        computeRules(sentence)

    for word in rules:
        if len(rules[word]) > 1:
            print("WARNING : Abiguity detected in \'%s\'"%(word+" "+str(rules[word])), file=sys.stderr)
        # Only the first rule recorded for the form is emitted.
        for rule in rules[word]:
            print(prefix+word+rule)
            break


if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment