Skip to content
Snippets Groups Projects
Commit a301e461 authored by Franck Dary's avatar Franck Dary
Browse files

Added support for lemmatization

parent 4369a755
No related branches found
No related tags found
No related merge requests found
......@@ -12,7 +12,6 @@ THRESHOLD=10
FPLM_FILENAME=fplm
all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
rm -f col_*\.txt
rm -f all_no_test.conllu
all_no_test.conllu:
......
......@@ -73,6 +73,30 @@ if __name__ == "__main__" :
print("<morpho> NOTHING", file=output)
output.close()
if "FORM" in col2index and "LEMMA" in col2index :
  # Generate the transition sets of the lemmatizer, only when the corpus
  # provides both the FORM and the LEMMA columns.
  #
  # A rule rewrites the part of the (lowercased) form that follows its longest
  # common prefix with the (lowercased) lemma. `rules` maps each rule to the
  # number of tokens it was observed on; dict insertion order is the order in
  # which rules are written out.
  rules = {}
  for columns in fileContent :
    form = columns[col2index["FORM"]].lower()
    lemma = columns[col2index["LEMMA"]].lower()
    # Length of the longest common prefix of form and lemma. zip() stops at
    # the shorter string, so no explicit bounds check is needed.
    commonIndex = 0
    for formChar, lemmaChar in zip(form, lemma) :
      if formChar != lemmaChar :
        break
      commonIndex += 1
    rule = "\t%s\t%s\t"%(form[commonIndex:], lemma[commonIndex:])
    rules[rule] = rules.get(rule, 0) + 1
  # One TRANSFORMSUFFIX transition per distinct rule. The context manager
  # guarantees the file is closed even if a write fails.
  with open("lemmatizer_rules.ts", 'w', encoding='utf-8') as output :
    for rule in rules :
      print("<lemmatizer_rules> TRANSFORMSUFFIX FORM b.0 LEMMA b.0 %s"%rule, file=output)
  # Fixed set of case-restoring transitions applied to the LEMMA column.
  with open("lemmatizer_case.ts", 'w', encoding='utf-8') as output :
    print("<lemmatizer_case> UPPERCASEINDEX LEMMA b.0 0", file=output)
    print("<lemmatizer_case> UPPERCASE LEMMA b.0", file=output)
    print("<lemmatizer_case> LOWERCASEINDEX LEMMA b.0 0", file=output)
    print("<lemmatizer_case> LOWERCASE LEMMA b.0", file=output)
    print("<lemmatizer_case> NOTHING", file=output)
if "DEPREL" in col2index :
labelsList = []
for columns in fileContent :
......
Name : Lemmatizer Machine
Classifier : lemmatizer
{
Transitions : {lemmatizer_rules,data/lemmatizer_rules.ts, lemmatizer_case,data/lemmatizer_case.ts}
LossMultiplier : {}
Network type : Modular
Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{lower:FORM} LSTM{1 1 0 1} In{64} Out{64}
Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64}
Context : Buffer{-2 -1 0} Stack{} Columns{LEMMA} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{ID} NbElem{1} Buffer{-1 0 1 1} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{EOS} NbElem{1} Buffer{-1 0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
UppercaseRate : Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} Out{32}
InputDropout : 0.5
MLP : {4096 0.3}
End
Optimizer : Adam {0.0003 0.9 0.999 0.00000001 0.00002 true}
}
Predictions : LEMMA
Strategy
{
Block : End{cannotMove}
lemmatizer_rules lemmatizer_case * 0
lemmatizer_case lemmatizer_rules * 1
}
......@@ -106,6 +106,11 @@ import math
# Column name -> position of that column in a word's `columns` list.
# Empty here; presumably filled in while reading the input files (TODO confirm).
col2index = {}
# Presumably the reverse mapping of col2index (position -> column name);
# it is not used in the visible part of the file (TODO confirm).
index2col = {}

# Evaluation metric name -> name of the column that metric compares.
# Used to look up the column index through col2index.
metric2colname = {
  "UPOS": "UPOS",
  "Lemmas": "LEMMA",
}
# Content and functional relations
CONTENT_DEPRELS = {
"nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
......@@ -556,8 +561,7 @@ class Error :
self.pred = system_word
self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
# TODO : do it for other than UPOS
self.type = self.gold.columns[col2index["UPOS"]]+"->"+self.pred.columns[col2index["UPOS"]]
self.type = self.gold.columns[col2index[metric2colname[metric]]]+"->"+self.pred.columns[col2index[metric2colname[metric]]]
def __str__(self) :
result = []
gold_lines = []
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment