#! /usr/bin/python3
"""Extract token-splitting rules from a CoNLL-U style file.

Every token whose ID is a range ("a-b") is associated with the sequence of
forms of the tokens numbered a..b in the same sentence. Each distinct
(range-token form, sequence of forms) pair is printed on stdout as one line:

    SPLITWORD <form>@<form a>@...@<form b>

A warning is written to stderr when the same range-token form was seen with
more than one sequence.

Usage: <script> file.conllu conllu.mcd
"""

import sys

from readMCD import readMCD

# Form of a range token -> {rule string -> number of times it was observed}.
rules = {}
# Keyword written in front of every rule on stdout.
prefix = "SPLITWORD "


def printUsageAndExit():
    """Print the command-line usage on stderr and exit with status 1."""
    print("USAGE : %s file.conllu conllu.mcd" % sys.argv[0], file=sys.stderr)
    # sys.exit() instead of the bare exit() helper: the latter is injected by
    # the `site` module and is not guaranteed to exist (e.g. `python -S`).
    sys.exit(1)


def computeRules(sentence):
    """Record in the global ``rules`` the rules found in one sentence.

    Args:
        sentence: sequence of ``[id, form]`` pairs (both strings). An id
            containing ``-`` is a range ``"first-last"``; any other id names
            a single token.

    Raises:
        KeyError: if a range refers to a token id absent from the sentence.
        ValueError: if the bounds of a range are not integers.
    """
    # Forms of the single (non-range) tokens, keyed by their id string.
    form_by_id = {word[0]: word[1] for word in sentence if "-" not in word[0]}

    for word in sentence:
        bounds = word[0].split("-")
        if len(bounds) < 2:
            continue
        first, last = int(bounds[0]), int(bounds[-1])
        # The rule is the '@'-prefixed forms of tokens first..last, in order.
        rule = "".join(
            "@" + form_by_id[str(token_id)]
            for token_id in range(first, last + 1)
        )
        counts = rules.setdefault(word[1], {})
        counts[rule] = counts.get(rule, 0) + 1


def main():
    """Read the input file named on the command line and print its rules."""
    # Re-wrap file descriptor 1 so that output is UTF-8 whatever the locale.
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)

    if len(sys.argv) != 3:
        printUsageAndExit()

    # NOTE(review): readMCD is defined outside this file; its second return
    # value is used here as a mapping from column name to column index.
    _, conllMCDr = readMCD(sys.argv[2])
    id_col = int(conllMCDr["ID"])
    form_col = int(conllMCDr["FORM"])

    sentence = []
    with open(sys.argv[1], "r", encoding="utf8") as conllu:
        for line in conllu:
            stripped = line.strip()
            # Blank (or single-character) lines and comment lines delimit
            # sentences.
            if len(stripped) < 2 or line[0] == '#':
                if sentence:
                    computeRules(sentence)
                    sentence = []
                continue
            columns = stripped.split('\t')
            sentence.append([columns[id_col], columns[form_col]])
    # Flush the last sentence when the file does not end with a blank line;
    # without this its rules would be silently lost.
    if sentence:
        computeRules(sentence)

    for word, counts in rules.items():
        if len(counts) > 1:
            print(
                "WARNING : Ambiguity detected in '%s'"
                % (word + " " + str(counts)),
                file=sys.stderr,
            )
        # Rules with the most components first; ties are broken by the
        # reverse lexicographic order of the printed text.
        ranked = sorted(
            ([len(rule.split('@')), prefix + word + rule] for rule in counts),
            reverse=True,
        )
        for _, text in ranked:
            print(text)


if __name__ == "__main__":
    main()