#! /usr/bin/env python3
"""Generate transition-set (.ts) files from a tab-separated CoNLL-U style corpus.

Usage: <script> file.conllu

The corpus is read once. Depending on which columns are declared, the
following files are written to the current working directory:

* UPOS          -> tagger.ts
* XPOS          -> taggerx.ts
* FEATS         -> morpho_whole.ts, morpho_parts.ts
* FORM + LEMMA  -> lemmatizer_rules.ts, lemmatizer_case.ts
* DEPREL        -> parser_eager_rel_strict.ts, parser_eager_rel_relaxed.ts,
                   parser_eager_strict.ts, parser_eager_relaxed.ts,
                   parser_standard_rel.ts, parser_standard.ts, deprel.ts

The column layout defaults to the ten standard columns and is overridden by
any "# global.columns = ..." comment line found in the corpus.
"""

import sys
sys.path.insert(1, '../../../../scripts')
from readMCD import readMCD

# Column layout assumed until the corpus declares its own.
DEFAULT_COLUMNS = "ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"


def printUsageAndExit():
    """Print the usage line on stderr and terminate with exit status 1."""
    print("USAGE : %s file.conllu" % sys.argv[0], file=sys.stderr)
    # sys.exit rather than the bare exit(): the latter is injected by the
    # `site` module for interactive use and is not guaranteed to exist.
    sys.exit(1)


def readCorpus(path):
    """Read the corpus at *path*.

    Returns a pair (col2index, rows) where rows is the list of token lines,
    each split on tabs, and col2index is the first value returned by the
    project helper readMCD for the active column declaration (used by this
    script as a column name -> list index mapping).
    """
    col2index, _ = readMCD(DEFAULT_COLUMNS)
    rows = []
    # The context manager guarantees the corpus handle is released.
    with open(path, "r", encoding="utf8") as corpus:
        for line in corpus:
            stripped = line.strip()
            # Blank or near-empty lines carry no token.
            if len(stripped) < 3:
                continue
            if stripped[0] == '#':
                # A comment may redefine the column layout for what follows.
                declaration = line.split("global.columns =")
                if len(declaration) > 1:
                    col2index, _ = readMCD(declaration[-1].strip())
                continue
            rows.append(stripped.split('\t'))
    return col2index, rows


def columnValues(rows, index):
    """Return the sorted distinct values found at position *index* of each row."""
    return sorted({row[index] for row in rows})


def writeLines(path, lines):
    """Write each string of *lines* to *path* (UTF-8), one per line."""
    with open(path, 'w', encoding='utf-8') as output:
        for line in lines:
            output.write(line + "\n")


def commonPrefixLength(left, right):
    """Return the number of leading characters shared by *left* and *right*."""
    length = 0
    for a, b in zip(left, right):
        if a != b:
            break
        length += 1
    return length


def lemmatizerRules(rows, formIndex, lemmaIndex):
    """Return the distinct suffix-rewrite rules, in order of first appearance.

    Form and lemma are lowercased, their common prefix is dropped, and the two
    remaining suffixes are packed as "\\t<form suffix>\\t<lemma suffix>\\t".
    """
    # A dict is used as an insertion-ordered set of rules.
    rules = {}
    for row in rows:
        form = row[formIndex].lower()
        lemma = row[lemmaIndex].lower()
        cut = commonPrefixLength(form, lemma)
        rules["\t%s\t%s\t" % (form[cut:], lemma[cut:])] = None
    return list(rules)


def labelledArcs(system, labels):
    """Return the LEFT/RIGHT labelled arc transitions of *system* for each label."""
    lines = []
    for label in labels:
        lines.append("<parser> %s_LEFT_rel %s" % (system, label))
        lines.append("<parser> %s_RIGHT_rel %s" % (system, label))
    return lines


def writeTaggerFiles(rows, col2index):
    """Write the UPOS and XPOS tagger transition sets when those columns exist."""
    for column, state, path in (("UPOS", "tagger", "tagger.ts"),
                                ("XPOS", "taggerx", "taggerx.ts")):
        if column not in col2index:
            continue
        values = columnValues(rows, col2index[column])
        writeLines(path, ["<%s> WRITE b.0 %s %s" % (state, column, value)
                          for value in values])


def writeMorphoFiles(rows, col2index):
    """Write whole-value and per-part FEATS transition sets."""
    values = columnValues(rows, col2index["FEATS"])
    # Individual features are the '|'-separated pieces of each FEATS value.
    parts = set()
    for value in values:
        parts.update(value.split("|"))

    writeLines("morpho_whole.ts",
               ["<morpho> WRITE b.0 FEATS " + value for value in values])
    writeLines("morpho_parts.ts",
               ["<morpho> ADD b.0 FEATS " + part for part in sorted(parts)]
               + ["<morpho> NOTHING"])


def writeLemmatizerFiles(rows, col2index):
    """Write the suffix-rule and case-handling lemmatizer transition sets."""
    rules = lemmatizerRules(rows, col2index["FORM"], col2index["LEMMA"])
    writeLines("lemmatizer_rules.ts",
               ["<lemmatizer_rules> TRANSFORMSUFFIX FORM b.0 LEMMA b.0 %s" % rule
                for rule in rules])
    writeLines("lemmatizer_case.ts", [
        "<lemmatizer_case> UPPERCASEINDEX LEMMA b.0 0",
        "<lemmatizer_case> UPPERCASE LEMMA b.0",
        "<lemmatizer_case> LOWERCASEINDEX LEMMA b.0 0",
        "<lemmatizer_case> LOWERCASE LEMMA b.0",
        "<lemmatizer_case> NOTHING",
    ])


def writeParserFiles(rows, col2index):
    """Write every parser transition set plus the list of dependency labels."""
    deprelIndex = col2index["DEPREL"]
    # "_" and "root" are excluded from the label inventory.
    labels = sorted({row[deprelIndex] for row in rows
                     if row[deprelIndex] not in ("_", "root")})

    eagerLabelled = labelledArcs("eager", labels) + ["<parser> eager_SHIFT"]
    eagerUnlabelled = ["<parser> eager_LEFT",
                       "<parser> eager_RIGHT",
                       "<parser> eager_SHIFT"]

    writeLines("parser_eager_rel_strict.ts",
               ["<parser> REDUCE_strict"] + eagerLabelled)
    writeLines("parser_eager_rel_relaxed.ts",
               ["<parser> REDUCE_relaxed"] + eagerLabelled)
    writeLines("parser_eager_strict.ts",
               ["<parser> REDUCE_strict"] + eagerUnlabelled)
    writeLines("parser_eager_relaxed.ts",
               ["<parser> REDUCE_relaxed"] + eagerUnlabelled)
    writeLines("parser_standard_rel.ts",
               labelledArcs("standard", labels) + ["<parser> standard_SHIFT"])
    writeLines("parser_standard.ts", [
        "<parser> standard_LEFT_rel",
        "<parser> standard_RIGHT_rel",
        "<parser> standard_SHIFT",
    ])
    writeLines("deprel.ts", ["deprel " + label for label in labels])


def main():
    """Parse the command line, read the corpus and emit the transition sets."""
    # Re-wrap file descriptor 1 so that stdout is UTF-8 whatever the locale.
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)
    if len(sys.argv) != 2:
        printUsageAndExit()

    col2index, rows = readCorpus(sys.argv[1])

    writeTaggerFiles(rows, col2index)
    if "FEATS" in col2index:
        writeMorphoFiles(rows, col2index)
    if "FORM" in col2index and "LEMMA" in col2index:
        writeLemmatizerFiles(rows, col2index)
    if "DEPREL" in col2index:
        writeParserFiles(rows, col2index)


if __name__ == "__main__":
    main()