#! /usr/bin/python3

# Provenance: this script was introduced as tools/conllu2splits.py by commit
# a3a763dbacd3ce64049b11aeca31551fffe37573 (Franck Dary
# <franck.dary@lis-lab.fr>, Tue, 17 Sep 2019 14:03:06 +0200), subject
# "Added script to get rules of word splitting for conllu".  The same commit
# renamed scripts/conll2text.py to tools/conll2text.py without modifying it.

"""Extract word-splitting rules from a CoNLL-U style file.

Usage: conllu2splits.py file.conllu conllu.mcd

The MCD file maps column indexes to column names, one "<index> <name>" pair
per line.  Every multiword token of the input (a line whose ID is a range
such as "3-4") yields a rule made of the forms of the words it covers, each
preceded by "@".  One line per distinct multiword form is printed on stdout
as "SPLITWORD <form><rule>"; when a form was seen with several different
rules, a warning is printed on stderr and only the first rule seen is kept.
"""

import sys

# Maps the form of a multiword token to {rule: number of times it was seen}.
rules = {}
# Text printed in front of every rule on stdout.
prefix = "SPLITWORD "


def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s file.conllu conllu.mcd" % sys.argv[0], file=sys.stderr)
    sys.exit(1)


def readMCD(mcdFilename):
    """Read an MCD file and return its content as a dict.

    Blank lines and lines starting with '#' are ignored.  Every other line
    must hold exactly two whitespace-separated fields; the first one becomes
    the key and the second one the value (both kept as strings).

    Exits with status 1 after printing an error on stderr when a line does
    not have exactly two fields.
    """
    mcd = {}
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            clean = line.strip()
            if not clean or clean.startswith("#"):
                continue
            fields = clean.split()
            if len(fields) != 2:
                print("ERROR : invalid mcd line '%s'. Aborting" % clean,
                      file=sys.stderr)
                sys.exit(1)
            mcd[fields[0]] = fields[1]

    return mcd


def _componentIds(tokenId):
    """Return the word ids covered by a range id, or None for a plain id.

    A well-formed "first-last" range is expanded to every id in between, so
    that "3-5" covers the words 3, 4 and 5.  Anything else that contains a
    '-' is returned split on '-' without further interpretation.
    """
    bounds = tokenId.split("-")
    if len(bounds) < 2:
        return None
    if len(bounds) == 2 and bounds[0].isdigit() and bounds[1].isdigit():
        first, last = int(bounds[0]), int(bounds[1])
        if first <= last:
            return [str(index) for index in range(first, last + 1)]
    return bounds


def computeRules(sentence):
    """Update the global `rules` with the multiword tokens of one sentence.

    `sentence` is a list of [id, form] pairs.  Pairs whose id contains a '-'
    are multiword tokens; the others are the words they may refer to.  A
    multiword token that refers to an id absent from the sentence is
    reported on stderr and skipped.
    """
    wordById = {}
    for word in sentence:
        if "-" not in word[0]:
            wordById[word[0]] = word[1]

    for word in sentence:
        componentIds = _componentIds(word[0])
        if componentIds is None:
            continue
        missing = [wordId for wordId in componentIds if wordId not in wordById]
        if missing:
            print("WARNING : multiword token '%s' refers to unknown word "
                  "id(s) %s. Skipping" % (word[0], ",".join(missing)),
                  file=sys.stderr)
            continue
        rule = "".join("@" + wordById[wordId] for wordId in componentIds)
        counts = rules.setdefault(word[1], {})
        counts[rule] = counts.get(rule, 0) + 1


def _columnIndex(columnByName, columnName):
    """Return the integer index of `columnName`, or exit with an error."""
    try:
        return int(columnByName[columnName])
    except (KeyError, ValueError):
        print("ERROR : no valid index for column '%s' in mcd. Aborting"
              % columnName, file=sys.stderr)
        sys.exit(1)


def main():
    """Read the files named on the command line and print the rules."""
    if len(sys.argv) != 3:
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    # The MCD maps index -> name; invert it to look columns up by name.
    conllMCDr = {v: k for k, v in conllMCD.items()}

    idId = _columnIndex(conllMCDr, "ID")
    idForm = _columnIndex(conllMCDr, "FORM")
    lastNeeded = max(idId, idForm)

    sentence = []

    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for lineNumber, line in enumerate(conlluFile, start=1):
            # Blank (or one-character) lines and comments end a sentence.
            if len(line.strip()) < 2 or line[0] == '#':
                if sentence:
                    computeRules(sentence)
                    sentence = []
                continue

            columns = line.strip().split('\t')
            if len(columns) <= lastNeeded:
                print("ERROR : not enough columns in line %d of '%s'. "
                      "Aborting" % (lineNumber, sys.argv[1]), file=sys.stderr)
                sys.exit(1)
            sentence.append([columns[idId], columns[idForm]])

    # The last sentence is not necessarily followed by a blank line.
    if sentence:
        computeRules(sentence)

    for word, counts in rules.items():
        if len(counts) > 1:
            print("WARNING : Abiguity detected in '%s'"
                  % (word + " " + str(counts)), file=sys.stderr)
        # Only the first rule seen for this form is printed.
        print(prefix + word + next(iter(counts)))


if __name__ == "__main__":
    main()