Something went wrong on our end
-
Franck Dary authored
getTransitionSets.py 5.26 KiB
#! /usr/bin/env python3
import sys
sys.path.insert(1, '../../../../scripts')
from readMCD import readMCD
def printUsageAndExit():
    """Print the command-line usage on stderr and terminate with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    print("USAGE : %s file.conllu" % sys.argv[0], file=sys.stderr)
    # sys.exit is used instead of the bare exit() helper: the latter is
    # injected by the `site` module for interactive use and is not
    # guaranteed to exist (e.g. with `python -S`).
    sys.exit(1)
def sortedUnique(values):
    """Return the distinct items of *values* as a sorted list."""
    return sorted(set(values))


def suffixRule(form, lemma):
    """Return the suffix-rewrite rule that turns *form* into *lemma*.

    Both strings are lowercased, their longest common prefix is dropped,
    and the two remaining suffixes are returned in the
    ``"\\t<form suffix>\\t<lemma suffix>\\t"`` layout written to
    ``lemmatizer_rules.ts``.
    """
    form = form.lower()
    lemma = lemma.lower()
    shared = 0
    # zip stops at the shorter string, so no explicit bounds check is needed.
    for formChar, lemmaChar in zip(form, lemma):
        if formChar != lemmaChar:
            break
        shared += 1
    return "\t%s\t%s\t" % (form[shared:], lemma[shared:])


def labelledArcs(system, labels):
    """Return the labelled LEFT/RIGHT arc transitions of *system*.

    For every label, in order, a ``<system>_LEFT_rel`` line is followed by
    the matching ``<system>_RIGHT_rel`` line.
    """
    transitions = []
    for label in labels:
        transitions.append("<parser> %s_LEFT_rel %s" % (system, label))
        transitions.append("<parser> %s_RIGHT_rel %s" % (system, label))
    return transitions


def writeTransitionSet(path, lines):
    """Write every item of *lines* on its own line to *path*, as UTF-8.

    The file is created or truncated, and is closed even if writing fails.
    """
    with open(path, 'w', encoding='utf-8') as output:
        for line in lines:
            print(line, file=output)


if __name__ == "__main__":
    # Re-wrap file descriptor 1 so stdout is UTF-8 regardless of the locale;
    # closefd=False leaves the underlying descriptor open.
    sys.stdout = open(1, 'w', encoding='utf-8', closefd=False)

    if len(sys.argv) != 2:
        printUsageAndExit()

    # Default column layout, used unless the file declares its own through a
    # "global.columns =" comment. col2index is used below as a mapping from
    # column name to position in a tab-split line.
    col2index, index2col = readMCD("ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC")

    fileContent = []
    with open(sys.argv[1], "r", encoding="utf8") as conlluFile:
        for line in conlluFile:
            stripped = line.strip()
            # Skip blank or near-empty lines (fewer than 3 characters).
            if len(stripped) < 3:
                continue
            if stripped[0] == '#':
                # A comment may redefine the column layout; other comments
                # are ignored.
                splited = line.split("global.columns =")
                if len(splited) > 1:
                    col2index, index2col = readMCD(splited[-1].strip())
                continue
            fileContent.append(stripped.split('\t'))

    if "UPOS" in col2index:
        uposIndex = col2index["UPOS"]
        values = sortedUnique(columns[uposIndex] for columns in fileContent)
        writeTransitionSet(
            "tagger.ts",
            ["<tagger> WRITE b.0 UPOS " + value for value in values],
        )

    if "XPOS" in col2index:
        xposIndex = col2index["XPOS"]
        values = sortedUnique(columns[xposIndex] for columns in fileContent)
        writeTransitionSet(
            "taggerx.ts",
            ["<taggerx> WRITE b.0 XPOS " + value for value in values],
        )

    if "FEATS" in col2index:
        featsIndex = col2index["FEATS"]
        values = sortedUnique(columns[featsIndex] for columns in fileContent)
        # Individual features are the "|"-separated pieces of each value.
        parts = sortedUnique(
            part for value in values for part in value.split("|")
        )
        writeTransitionSet(
            "morpho_whole.ts",
            ["<morpho> WRITE b.0 FEATS " + value for value in values],
        )
        writeTransitionSet(
            "morpho_parts.ts",
            ["<morpho> ADD b.0 FEATS " + part for part in parts]
            + ["<morpho> NOTHING"],
        )

    if "FORM" in col2index and "LEMMA" in col2index:
        formIndex = col2index["FORM"]
        lemmaIndex = col2index["LEMMA"]
        # A dict is used as an ordered set: rules are written in order of
        # first appearance in the input.
        rules = {}
        for columns in fileContent:
            rules[suffixRule(columns[formIndex], columns[lemmaIndex])] = None
        writeTransitionSet(
            "lemmatizer_rules.ts",
            ["<lemmatizer_rules> TRANSFORMSUFFIX FORM b.0 LEMMA b.0 %s" % rule
             for rule in rules],
        )
        writeTransitionSet(
            "lemmatizer_case.ts",
            [
                "<lemmatizer_case> UPPERCASEINDEX LEMMA b.0 0",
                "<lemmatizer_case> UPPERCASE LEMMA b.0",
                "<lemmatizer_case> LOWERCASEINDEX LEMMA b.0 0",
                "<lemmatizer_case> LOWERCASE LEMMA b.0",
                "<lemmatizer_case> NOTHING",
            ],
        )

    if "DEPREL" in col2index:
        deprelIndex = col2index["DEPREL"]
        # "_" and "root" are left out of the label inventory.
        labelsList = sortedUnique(
            columns[deprelIndex] for columns in fileContent
            if columns[deprelIndex] not in ("_", "root")
        )

        eagerArcs = labelledArcs("eager", labelsList)
        for mode in ("strict", "relaxed"):
            writeTransitionSet(
                "parser_eager_rel_%s.ts" % mode,
                ["<parser> REDUCE_" + mode]
                + eagerArcs
                + ["<parser> eager_SHIFT"],
            )
        for mode in ("strict", "relaxed"):
            writeTransitionSet(
                "parser_eager_%s.ts" % mode,
                [
                    "<parser> REDUCE_" + mode,
                    "<parser> eager_LEFT",
                    "<parser> eager_RIGHT",
                    "<parser> eager_SHIFT",
                ],
            )

        writeTransitionSet(
            "parser_standard_rel.ts",
            labelledArcs("standard", labelsList) + ["<parser> standard_SHIFT"],
        )
        # NOTE(review): the unlabelled standard set keeps the "_rel" suffix
        # while the unlabelled eager sets do not; kept as in the original,
        # confirm against the transition names the consumer accepts.
        writeTransitionSet(
            "parser_standard.ts",
            [
                "<parser> standard_LEFT_rel",
                "<parser> standard_RIGHT_rel",
                "<parser> standard_SHIFT",
            ],
        )

        writeTransitionSet(
            "deprel.ts",
            ["deprel " + label for label in labelsList],
        )