From a301e461ed26ed8ee3ee12eda5b5ef0c5d09dc1b Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Tue, 9 Jun 2020 14:07:52 +0200
Subject: [PATCH] Added support for lemmatization

---
 UD_any/data/Makefile                   |  1 -
 UD_any/data/getTransitionSets.py       | 24 ++++++++++++++++++++++++
 UD_any/templates/lemmatizer/machine.rm | 26 ++++++++++++++++++++++++++
 scripts/conll18_ud_eval.py             |  8 ++++++--
 4 files changed, 56 insertions(+), 3 deletions(-)
 create mode 100644 UD_any/templates/lemmatizer/machine.rm

diff --git a/UD_any/data/Makefile b/UD_any/data/Makefile
index 86d3e5c..37640dd 100644
--- a/UD_any/data/Makefile
+++ b/UD_any/data/Makefile
@@ -12,7 +12,6 @@ THRESHOLD=10
 FPLM_FILENAME=fplm
 
 all: tokenizer.ts segmenter.ts texts all_no_test.conllu transitions pretrain
-	rm -f col_*\.txt
 	rm -f all_no_test.conllu
 
 all_no_test.conllu:
diff --git a/UD_any/data/getTransitionSets.py b/UD_any/data/getTransitionSets.py
index 39bf020..421949b 100755
--- a/UD_any/data/getTransitionSets.py
+++ b/UD_any/data/getTransitionSets.py
@@ -73,6 +73,30 @@ if __name__ == "__main__" :
     print("<morpho> NOTHING", file=output)
     output.close()
 
+  if "FORM" in col2index and "LEMMA" in col2index :
+    # One suffix-rewrite rule per distinct (form, lemma) pair: drop their longest common
+    # prefix and keep both lowercased remainders. A dict keeps first-seen order (Python 3.7+).
+    rules = {}
+    for columns in fileContent :
+      form = columns[col2index["FORM"]].lower()
+      lemma = columns[col2index["LEMMA"]].lower()
+      commonIndex = 0
+      while commonIndex < min(len(form), len(lemma)) and form[commonIndex] == lemma[commonIndex] :
+        commonIndex += 1
+      rule = "\t%s\t%s\t"%(form[commonIndex:], lemma[commonIndex:])
+      # Occurrence counts are recorded, but only the keys are written out below.
+      rules[rule] = rules.get(rule, 0) + 1
+    # 'with' closes each transition file even if a write fails.
+    with open("lemmatizer_rules.ts", 'w', encoding='utf-8') as output :
+      for rule in rules :
+        print("<lemmatizer_rules> TRANSFORMSUFFIX FORM b.0 LEMMA b.0 %s"%rule, file=output)
+    with open("lemmatizer_case.ts", 'w', encoding='utf-8') as output :
+      print("<lemmatizer_case> UPPERCASEINDEX LEMMA b.0 0", file=output)
+      print("<lemmatizer_case> UPPERCASE LEMMA b.0", file=output)
+      print("<lemmatizer_case> LOWERCASEINDEX LEMMA b.0 0", file=output)
+      print("<lemmatizer_case> LOWERCASE LEMMA b.0", file=output)
+      print("<lemmatizer_case> NOTHING", file=output)
+
   if "DEPREL" in col2index :
     labelsList = []
     for columns in fileContent :
diff --git a/UD_any/templates/lemmatizer/machine.rm b/UD_any/templates/lemmatizer/machine.rm
new file mode 100644
index 0000000..2eca050
--- /dev/null
+++ b/UD_any/templates/lemmatizer/machine.rm
@@ -0,0 +1,26 @@
+Name : Lemmatizer Machine
+Classifier : lemmatizer
+{
+	Transitions : {lemmatizer_rules,data/lemmatizer_rules.ts, lemmatizer_case,data/lemmatizer_case.ts}
+	LossMultiplier : {}
+	Network type : Modular
+	Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{lower:FORM} LSTM{1 1 0 1} In{64} Out{64}
+	Context : Buffer{-3 -2 -1 0 1 2} Stack{} Columns{UPOS} LSTM{1 1 0 1} In{64} Out{64}
+	Context : Buffer{-2 -1 0} Stack{} Columns{LEMMA} LSTM{1 1 0 1} In{64} Out{64}
+	Focused : Column{ID} NbElem{1} Buffer{-1 0 1 1} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+	Focused : Column{prefix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+	Focused : Column{suffix3:FORM} NbElem{3} Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+	Focused : Column{EOS} NbElem{1} Buffer{-1 0} Stack{} LSTM{1 1 0 1} In{64} Out{64}
+	UppercaseRate : Buffer{-1 0 1 2} Stack{} LSTM{1 1 0 1} Out{32}
+	InputDropout : 0.5
+	MLP : {4096 0.3}
+	End
+  Optimizer : Adam {0.0003 0.9 0.999 0.00000001 0.00002 true}
+}
+Predictions : LEMMA
+Strategy
+{
+	Block : End{cannotMove}
+	lemmatizer_rules lemmatizer_case * 0
+	lemmatizer_case lemmatizer_rules * 1
+}
diff --git a/scripts/conll18_ud_eval.py b/scripts/conll18_ud_eval.py
index 0866241..7d1f1ff 100755
--- a/scripts/conll18_ud_eval.py
+++ b/scripts/conll18_ud_eval.py
@@ -106,6 +106,11 @@ import math
 col2index = {}
 index2col = {}
 
+metric2colname = {  # evaluation metric name -> column name looked up in col2index to label an Error's type
+  "UPOS" : "UPOS",
+  "Lemmas" : "LEMMA",
+}  # NOTE(review): only these two metrics are mapped; subscripting with any other metric raises KeyError
+
 # Content and functional relations
 CONTENT_DEPRELS = {
   "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative",
@@ -556,8 +561,7 @@ class Error :
     self.pred = system_word
     self.gold_sentence = gold_file.words[gold_file.sentences_words[self.gold.sentence].start:gold_file.sentences_words[self.gold.sentence].end]
     self.pred_sentence = system_file.words[system_file.sentences_words[self.pred.sentence].start:system_file.sentences_words[self.pred.sentence].end]
-    # TODO : do it for other than UPOS
-    self.type = self.gold.columns[col2index["UPOS"]]+"->"+self.pred.columns[col2index["UPOS"]]
+    self.type = self.gold.columns[col2index[metric2colname[metric]]]+"->"+self.pred.columns[col2index[metric2colname[metric]]]
   def __str__(self) :
     result = []
     gold_lines = []
-- 
GitLab