Skip to content
Snippets Groups Projects
Commit a301e461 authored by Franck Dary
Browse files

Added support for lemmatization

parent 4369a755
Branches
No related tags found
No related merge requests found
...@@ -12,7 +12,6 @@ THRESHOLD=10 ...@@ -12,7 +12,6 @@ THRESHOLD=10
FPLM_FILENAME=fplm FPLM_FILENAME=fplm
all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
rm -f col_*\.txt
rm -f all_no_test.conllu rm -f all_no_test.conllu
all_no_test.conllu: all_no_test.conllu:
......
...@@ -73,6 +73,30 @@ if __name__ == "__main__" : ...@@ -73,6 +73,30 @@ if __name__ == "__main__" :
print("<morpho> NOTHING", file=output) print("<morpho> NOTHING", file=output)
output.close() output.close()
# Generate the lemmatizer transition sets. This is only possible when the
# data provides both a FORM and a LEMMA column.
if "FORM" in col2index and "LEMMA" in col2index :
  # Maps each suffix-rewriting rule to the number of (form, lemma) pairs
  # that produced it. Only the keys are written out below; dict insertion
  # order gives the order of the rules in the output file.
  rules = {}
  for columns in fileContent :
    form = columns[col2index["FORM"]].lower()
    lemma = columns[col2index["LEMMA"]].lower()
    # Advance commonIndex to the length of the longest common prefix of
    # the lowercased form and lemma.
    commonIndex = 0
    maxCommon = min(len(form), len(lemma))
    while commonIndex < maxCommon and form[commonIndex] == lemma[commonIndex] :
      commonIndex += 1
    # A rule is the pair (form suffix, lemma suffix) that remains once the
    # common prefix is removed, tab-delimited.
    rule = "\t%s\t%s\t"%(form[commonIndex:], lemma[commonIndex:])
    rules[rule] = rules.get(rule, 0) + 1
  # Context managers guarantee the files are closed even if a write fails.
  with open("lemmatizer_rules.ts", 'w', encoding='utf-8') as output :
    for rule in rules :
      print("<lemmatizer_rules> TRANSFORMSUFFIX FORM b.0 LEMMA b.0 %s"%rule, file=output)
  # Fixed set of case-restoring transitions, written as-is.
  with open("lemmatizer_case.ts", 'w', encoding='utf-8') as output :
    print("<lemmatizer_case> UPPERCASEINDEX LEMMA b.0 0", file=output)
    print("<lemmatizer_case> UPPERCASE LEMMA b.0", file=output)
    print("<lemmatizer_case> LOWERCASEINDEX LEMMA b.0 0", file=output)
    print("<lemmatizer_case> LOWERCASE LEMMA b.0", file=output)
    print("<lemmatizer_case> NOTHING", file=output)
if "DEPREL" in col2index : if "DEPREL" in col2index :
labelsList = [] labelsList = []
for columns in fileContent : for columns in fileContent :
......
Name : Lemmatizer Machine
Classifier : lemmatizer
{
Transitions : {lemmatizer_rules,data/lemmatizer_rules.ts, lemmatizer_case,data/lemmatizer_case.ts}
LossMultiplier : {}
Network type : Modular
Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{lower:FORM} LSTM{1 1 0 1} In{64} Out{64}
Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64}
Context : Buffer{-2 -1 0} Stack{} Columns{LEMMA} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{ID} NbElem{1} Buffer{-1 0 1 1} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{prefix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{suffix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
Focused : Column{EOS} NbElem{1} Buffer{-1 0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
UppercaseRate : Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} Out{32}
InputDropout : 0.5
MLP : {4096 0.3}
End
Optimizer : Adam {0.0003 0.9 0.999 0.00000001 0.00002 true}
}
Predictions : LEMMA
Strategy
{
Block : End{cannotMove}
lemmatizer_rules lemmatizer_case * 0
lemmatizer_case lemmatizer_rules * 1
}
...@@ -106,6 +106,11 @@ import math ...@@ -106,6 +106,11 @@ import math
col2index = {} col2index = {}
index2col = {} index2col = {}
# Maps a metric name to the name of the column (a key of col2index) holding
# the values compared for that metric; Error uses it to build its type string
# from the gold and predicted values of that column.
metric2colname = {
"UPOS" : "UPOS",
"Lemmas" : "LEMMA",
}
# Content and functional relations # Content and functional relations
CONTENT_DEPRELS = { CONTENT_DEPRELS = {
"nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative", "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
...@@ -556,8 +561,7 @@ class Error : ...@@ -556,8 +561,7 @@ class Error :
self.pred = system_word self.pred = system_word
self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end] self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end] self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
# TODO : do it for other than UPOS self.type = self.gold.columns[col2index[metric2colname[metric]]]+"->"+self.pred.columns[col2index[metric2colname[metric]]]
self.type = self.gold.columns[col2index["UPOS"]]+"->"+self.pred.columns[col2index["UPOS"]]
def __str__(self) : def __str__(self) :
result = [] result = []
gold_lines = [] gold_lines = []
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment