From 406e60a755d37fc88af6369f68f602e9ba9fba26 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 23 Oct 2019 12:01:29 +0200
Subject: [PATCH] Improved generation of lemmatizer rules

---
 maca_common/src/macaon_compute_l_rules.cpp | 91 ++++++++++++++++++++--
 1 file changed, 84 insertions(+), 7 deletions(-)

diff --git a/maca_common/src/macaon_compute_l_rules.cpp b/maca_common/src/macaon_compute_l_rules.cpp
index 475f20d..d62c9c7 100644
--- a/maca_common/src/macaon_compute_l_rules.cpp
+++ b/maca_common/src/macaon_compute_l_rules.cpp
@@ -83,6 +83,50 @@ po::variables_map checkOptions(po::options_description & od, int argc, char ** a
   return vm;
 }
 
+/// @brief One entry of a fplm file.
+///
+/// Stores the four tab-separated columns of a line : form, pos, lemma and
+/// morpho. A column that is empty or equal to "_" is counted as unknown.
+struct FPLM
+{
+  std::string form;
+  std::string pos;
+  std::string lemma;
+  std::string morpho;
+
+  /// @brief Build an entry from its four columns.
+  /// Arguments are taken by const reference and copied once into the members.
+  FPLM(const std::string & form, const std::string & pos,
+       const std::string & lemma, const std::string & morpho)
+    : form(form), pos(pos), lemma(lemma), morpho(morpho)
+  {
+  }
+
+  /// @return The four columns joined by tabulations.
+  std::string toString() const
+  {
+    return form + "\t" + pos + "\t" + lemma + "\t" + morpho;
+  }
+
+  /// @return The form and the pos joined by a tabulation.
+  std::string getFp() const
+  {
+    return form + "\t" + pos;
+  }
+
+  /// @return The number of columns (0 to 4) that are neither empty nor "_".
+  int getNbKnown() const
+  {
+    auto isKnown = [](const std::string & column)
+    {
+      return !column.empty() && column != "_";
+    };
+
+    // Each bool converts to 0 or 1, so the sum is the count of known columns.
+    return isKnown(form) + isKnown(pos) + isKnown(lemma) + isKnown(morpho);
+  }
+};
+
 /// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions.
 ///
 /// @param argc The number of arguments given to this program.
@@ -105,7 +149,7 @@ int main(int argc, char * argv[])
   File fplm(fplmFilename, "r");
   char buffer[100000];
 
-  std::map<std::string, std::vector<std::string> > rules;
+  std::map<std::string, std::vector<FPLM> > rules;
   while (fscanf(fplm.getDescriptor(), "%[^\n]\n", buffer) == 1)
   {
     auto splited = util::split(buffer, '\t');
@@ -116,32 +160,65 @@ int main(int argc, char * argv[])
       exit(1);
     }
 
-    auto form = splited[0];
-    auto lemma = splited[2];
-    auto rule = util::getRule(form, lemma);
+    std::string form = splited[0];
+    std::string pos = splited[1];
+    std::string lemma = splited[2];
+    std::string morpho = splited[3];
+    std::string rule = util::getRule(form, lemma);
 
-    rules[rule].emplace_back(buffer);
+    rules[rule].emplace_back(form, pos, lemma, morpho);
   }
 
   File rulesFile(rulesFilename, "w");
   File exceptionsFile(exceptionsFilename, "w");
 
+  std::vector<std::string> exceptionsToPrint;
+  std::map< std::string, std::vector<FPLM> > exceptions;
+  std::map<std::string, bool> fpInRules;
+
   for (auto & it : rules)
   {
     if ((int)it.second.size() >= threshold)
+    {
       fprintf(rulesFile.getDescriptor(), "%s\n", it.first.c_str());
+      for (auto & fplm : it.second)
+        fpInRules[fplm.getFp()] = true;
+    }
     else
       for (auto & line : it.second)
-        fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str());
+        exceptions[line.getFp()].emplace_back(line);
   }
 
+  for (auto it : exceptions)
+  {
+    if (fpInRules.count(it.first))
+      continue;
+
+    int indexMax = 0;
+    int knownMax = 0;
+
+    for (unsigned int i = 0; i < it.second.size(); i++)
+      if (it.second[i].getNbKnown() > knownMax)
+      {
+        indexMax = i;
+        knownMax = it.second[i].getNbKnown();
+      }
+
+    exceptionsToPrint.emplace_back(it.second[indexMax].toString());
+  }
+
+  std::sort(exceptionsToPrint.begin(), exceptionsToPrint.end());
+
+  for (auto & line : exceptionsToPrint)
+    fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str());
+
   if (debug)
   {
     for (auto & it : rules)
     {
       fprintf(stderr, "<%s> : %lu\n", it.first.c_str(), it.second.size());
       for (auto & example : it.second)
-        fprintf(stderr, "\t<%s>\n", example.c_str());
+        fprintf(stderr, "\t<%s>\n", example.toString().c_str());
     }
   }
 
-- 
GitLab