diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 86d3e5cc58a2c97ee696abef0c521a34d2ff5b13..37640dd6decadc9cfb2a574296369dc4da234803 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -12,7 +12,6 @@ THRESHOLD=10
 FPLM_FILENAME=fplm
 
 all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
-	rm -f col_*\.txt
 	rm -f all_no_test.conllu
 
 all_no_test.conllu:
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index 39bf020b1ad63736dad953507f5841593e6cdc24..421949b23c45ff7bba850134ea10b86ae926939d 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -73,6 +73,30 @@ if __name__ == "__main__" :
     print("<morpho> NOTHING", file=output)
     output.close()
 
+  if "FORM" in col2index and "LEMMA" in col2index :
+    rules = {}
+    for columns in fileContent :
+      form = columns[col2index["FORM"]].lower()
+      lemma = columns[col2index["LEMMA"]].lower()
+      commonIndex = 0
+      while commonIndex in range(min(len(form), len(lemma))) and form[commonIndex] == lemma[commonIndex] :
+        commonIndex += 1
+      rule = "\t%s\t%s\t"%(form[commonIndex:], lemma[commonIndex:])
+      if rule not in rules :
+        rules[rule] = 0
+      rules[rule] += 1
+    output = open("lemmatizer_rules.ts", 'w', encoding='utf-8')
+    for rule in rules :
+      print("<lemmatizer_rules> TRANSFORMSUFFIX FORM b.0 LEMMA b.0 %s"%rule, file=output)
+    output.close()
+    output = open("lemmatizer_case.ts", 'w', encoding='utf-8')
+    print("<lemmatizer_case> UPPERCASEINDEX LEMMA b.0 0", file=output)
+    print("<lemmatizer_case> UPPERCASE LEMMA b.0", file=output)
+    print("<lemmatizer_case> LOWERCASEINDEX LEMMA b.0 0", file=output)
+    print("<lemmatizer_case> LOWERCASE LEMMA b.0", file=output)
+    print("<lemmatizer_case> NOTHING", file=output)
+    output.close()
+
   if "DEPREL" in col2index :
     labelsList = []
     for columns in fileContent :
diff --git a/UD_any/templates/lemmatizer/machine.rm b/UD_any/templates/lemmatizer/machine.rm
new file mode 100644
index 0000000000000000000000000000000000000000..2eca05045438980ff17b214e1bb717db4a11854a
--- /dev/null
+++ b/UD_any/templates/lemmatizer/machine.rm
@@ -0,0 +1,26 @@
+Name : Lemmatizer Machine
+Classifier : lemmatizer
+{
+  Transitions : {lemmatizer_rules,data/lemmatizer_rules.ts, lemmatizer_case,data/lemmatizer_case.ts}
+  LossMultiplier : {}
+  Network type : Modular
+  Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{lower:FORM} LSTM{1 1 0 1} In{64} Out{64}
+  Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64}
+  Context : Buffer{-2 -1 0} Stack{} Columns{LEMMA} LSTM{1 1 0 1} In{64} Out{64}
+  Focused : Column{ID} NbElem{1} Buffer{-1 0 1 1} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+  Focused : Column{prefix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+  Focused : Column{suffix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+  Focused : Column{EOS} NbElem{1} Buffer{-1 0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+  UppercaseRate : Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} Out{32}
+  InputDropout : 0.5
+  MLP : {4096 0.3}
+  End
+  Optimizer : Adam {0.0003 0.9 0.999 0.00000001 0.00002 true}
+}
+Predictions : LEMMA
+Strategy
+{
+  Block : End{cannotMove}
+  lemmatizer_rules lemmatizer_case * 0
+  lemmatizer_case lemmatizer_rules * 1
+}
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index 08662415c7475dbd28fd3be611f93cc73aafadc1..7d1f1ffce10e6023057267ab5649b55806e706df 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -106,6 +106,11 @@ import math
 col2index = {}
 index2col = {}
 
+metric2colname = {
+    "UPOS" : "UPOS",
+    "Lemmas" : "LEMMA",
+}
+
 # Content and functional relations
 CONTENT_DEPRELS = {
     "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
@@ -556,8 +561,7 @@ class Error :
         self.pred = system_word
         self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
         self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
-        # TODO : do it for other than UPOS
-        self.type = self.gold.columns[col2index["UPOS"]]+"->"+self.pred.columns[col2index["UPOS"]]
+        self.type = self.gold.columns[col2index[metric2colname[metric]]]+"->"+self.pred.columns[col2index[metric2colname[metric]]]
     def __str__(self) :
         result = []
         gold_lines = []
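
The new block in getTransitionSets.py derives the lemmatizer's transition set by comparing each FORM with its LEMMA: whatever remains after their longest common prefix becomes a "replace this form suffix with that lemma suffix" rule, and rule frequencies are counted. The following is a minimal standalone sketch of that idea, not code from the repository; the helper names extract_rule and apply_rule are illustrative only.

```python
def extract_rule(form, lemma):
    """Return (form_suffix, lemma_suffix) left over after the longest common prefix."""
    form, lemma = form.lower(), lemma.lower()
    i = 0
    while i < min(len(form), len(lemma)) and form[i] == lemma[i]:
        i += 1
    return form[i:], lemma[i:]

def apply_rule(form, rule):
    """Rewrite a form with a rule; return None if the rule's form suffix does not match."""
    form_suffix, lemma_suffix = rule
    if form_suffix and not form.lower().endswith(form_suffix):
        return None
    base = form[: len(form) - len(form_suffix)] if form_suffix else form
    return base + lemma_suffix

rule = extract_rule("walked", "walk")   # ("ed", "")
print(apply_rule("jumped", rule))       # -> "jump"
```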