#! /usr/bin/python3
"""Extract multiword-token split rules from a CoNLL-U file.

Usage: conllu2splits.py file.conllu conllu.mcd

The MCD file maps column indexes to column names, one ``<index> <name>``
pair per line; it must name the ``ID`` and ``FORM`` columns. For every
multiword token found in the CoNLL-U file (a line whose ID is a range such
as ``3-4``), one line ``SPLITWORD <form>@<word>@<word>...`` is written on
stdout, listing the forms of the words the token expands to.
"""

# Provenance: this script was added as tools/conllu2splits.py by a change
# that also renamed scripts/conll2text.py to tools/conll2text.py unchanged.

import sys

# Maps a multiword surface form to a dict {rule: number of occurrences},
# where a rule is the "@"-prefixed sequence of forms it expands to.
rules = {}
# Tag written in front of every rule printed on stdout.
prefix = "SPLITWORD "


def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s file.conllu conllu.mcd" % sys.argv[0], file=sys.stderr)
    sys.exit(1)


def readMCD(mcdFilename):
    """Parse an MCD file into a dict mapping column index to column name.

    Blank lines and lines starting with ``#`` are ignored. Every other line
    must hold exactly two whitespace-separated fields; both are kept as
    strings. An invalid line is reported on stderr and the program exits
    with status 1.
    """
    mcd = {}
    with open(mcdFilename, "r", encoding="utf8") as mcdFile:
        for line in mcdFile:
            clean = line.strip()
            if len(clean) < 2 or clean[0] == '#':
                continue
            fields = clean.split()
            if len(fields) != 2:
                print("ERROR : invalid mcd line \'%s\'. Aborting" % clean,
                      file=sys.stderr)
                sys.exit(1)
            mcd[fields[0]] = fields[1]

    return mcd


def computeRules(sentence):
    """Record in ``rules`` the expansion of each multiword token of a sentence.

    ``sentence`` is a sequence of ``[id, form]`` pairs. An entry whose id is
    a range ``i-j`` is a multiword token; its rule is built from the forms
    of the words whose ids are i, i+1, ..., j. Raises KeyError when a word
    of the range is missing from the sentence.
    """
    # Only plain ids name actual words; ranges are resolved against these.
    wordById = {}
    for word in sentence:
        if "-" not in word[0]:
            wordById[word[0]] = word[1]

    for word in sentence:
        bounds = word[0].split("-")
        if len(bounds) < 2:
            continue
        # A range covers every word between its two ends, not just the ends.
        first, last = int(bounds[0]), int(bounds[-1])
        rule = "".join(
            "@" + wordById[str(index)] for index in range(first, last + 1)
        )
        # Plain dicts are kept because their str() appears in the warning.
        counts = rules.setdefault(word[1], {})
        counts[rule] = counts.get(rule, 0) + 1


def _readSentences(lines, idId, idForm):
    """Yield each sentence of ``lines`` as a list of ``[id, form]`` pairs.

    A blank line or a line starting with ``#`` ends the current sentence;
    the end of input ends it too, so a final sentence that is not followed
    by a blank line is still returned.
    """
    sentence = []
    for line in lines:
        stripped = line.strip()
        if len(stripped) < 2 or line[0] == '#':
            if sentence:
                yield sentence
                sentence = []
            continue
        columns = stripped.split('\t')
        sentence.append([columns[idId], columns[idForm]])
    if sentence:
        yield sentence


def main():
    """Read the files named on the command line and print the split rules."""
    if len(sys.argv) != 3:
        printUsageAndExit()

    conllMCD = readMCD(sys.argv[2])
    # Invert the mapping to look up a column index from its name.
    conllMCDr = {name: index for index, name in conllMCD.items()}

    idId = int(conllMCDr["ID"])
    idForm = int(conllMCDr["FORM"])

    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for sentence in _readSentences(conlluFile, idId, idForm):
            computeRules(sentence)

    for word, candidates in rules.items():
        if len(candidates) > 1:
            print("WARNING : Abiguity detected in \'%s\'"
                  % (word + " " + str(candidates)), file=sys.stderr)
        # When several expansions exist, the first one encountered is kept.
        print(prefix + word + next(iter(candidates)))


if __name__ == "__main__":
    main()