From ca56eef3eaaf69d87c557da7872c51836fb6d49e Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 10 Apr 2019 14:58:32 +0200
Subject: [PATCH] Added program macaon_compute_l_rules

---
 maca_common/CMakeLists.txt                 |   5 +
 maca_common/src/macaon_compute_l_rules.cpp | 131 +++++++++++++++++++++
 maca_common/src/util.cpp                   |  38 +++---
 3 files changed, 154 insertions(+), 20 deletions(-)
 create mode 100644 maca_common/src/macaon_compute_l_rules.cpp

diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt
index e7a36d6..11a9756 100644
--- a/maca_common/CMakeLists.txt
+++ b/maca_common/CMakeLists.txt
@@ -1,5 +1,10 @@
 FILE(GLOB SOURCES src/*.cpp)
 
+list(REMOVE_ITEM SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/macaon_compute_l_rules.cpp) # keep this program's main() out of the maca_common library
+add_executable(macaon_compute_l_rules src/macaon_compute_l_rules.cpp)
+target_link_libraries(macaon_compute_l_rules ${Boost_PROGRAM_OPTIONS_LIBRARY} maca_common)
+install(TARGETS macaon_compute_l_rules DESTINATION bin)
+
 #compiling library
 add_library(maca_common STATIC ${SOURCES})
 target_link_libraries(maca_common fasttext)
diff --git a/maca_common/src/macaon_compute_l_rules.cpp b/maca_common/src/macaon_compute_l_rules.cpp
new file mode 100644
index 0000000..12a1b94
--- /dev/null
+++ b/maca_common/src/macaon_compute_l_rules.cpp
@@ -0,0 +1,131 @@
+/// @file macaon_compute_l_rules.cpp
+/// @author Franck Dary
+/// @version 1.0
+/// @date 2019-04-10
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include "File.hpp"
+#include "util.hpp"
+#include <boost/program_options.hpp>
+
+namespace po = boost::program_options;
+
+/// @brief Build the description of every command-line option of this program.
+///
+/// @return A description grouping the required options and the optional ones.
+po::options_description getOptionsDescription()
+{
+  po::options_description mandatory("Required");
+  auto addMandatory = mandatory.add_options();
+  addMandatory("fplm,f", po::value<std::string>()->required(),
+    "fplm file that contains words and their lemmas");
+  addMandatory("exceptions,e", po::value<std::string>()->required(),
+    "Output filename for exceptions");
+  addMandatory("rules,r", po::value<std::string>()->required(),
+    "Output filename for rules");
+  addMandatory("threshold,t", po::value<int>()->required(),
+    "Number of times a rule must be used in the fplm before it is outputted");
+
+  po::options_description facultative("Optional");
+  auto addOptional = facultative.add_options();
+  addOptional("help,h", "Produce this help message");
+  addOptional("strict,s", "TODO : find what it does");
+  addOptional("debug,d", "Print infos on stderr");
+
+  po::options_description everything("Command-Line Arguments ");
+  everything.add(mandatory);
+  everything.add(facultative);
+  return everything;
+}
+
+/// @brief Parse the command line and validate it against the known options.
+///
+/// Prints the usage and exits the program if the arguments are invalid
+/// (exit code 1) or if the help was requested (exit code 0).
+///
+/// @param od The description of all the possible options.
+/// @param argc The number of arguments given to this program.
+/// @param argv The values of arguments given to this program.
+///
+/// @return The variables map holding the parsed option values.
+po::variables_map checkOptions(po::options_description & od, int argc, char ** argv)
+{
+  // Shared failure path : report the problem, show the usage, then stop.
+  auto abortWithUsage = [&od](const std::exception & error)
+  {
+    std::cerr << "Error: " << error.what() << "\n";
+    od.print(std::cerr);
+    exit(1);
+  };
+  po::variables_map parsed;
+
+  try {po::store(po::parse_command_line(argc, argv, od), parsed);}
+  catch(const std::exception & error) {abortWithUsage(error);}
+
+  // Help is tested before notify() so that it works without the required options.
+  if (parsed.count("help") != 0)
+  {
+    std::cout << od << "\n";
+    exit(0);
+  }
+
+  try {po::notify(parsed);}
+  catch(const std::exception & error) {abortWithUsage(error);}
+  return parsed;
+}
+
+/// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions.
+///
+/// A rule found at least 'threshold' times is written to the rules file, any other rule is written to the exceptions file.
+///
+/// @param argc The number of arguments given to this program.
+/// @param argv[] Array of arguments given to this program.
+///
+/// @return 0 if there was no crash.
+int main(int argc, char * argv[])
+{
+  auto od = getOptionsDescription();
+  po::variables_map vm = checkOptions(od, argc, argv);
+
+  // NOTE : options 'strict' and 'debug' are accepted but not used yet.
+  std::string fplmFilename = vm["fplm"].as<std::string>();
+  std::string exceptionsFilename = vm["exceptions"].as<std::string>();
+  std::string rulesFilename = vm["rules"].as<std::string>();
+  int threshold = vm["threshold"].as<int>();
+
+  File fplm(fplmFilename, "r");
+  char buffer[100000];
+
+  // Number of occurrences of each rule in the fplm file.
+  std::map<std::string, int> rules;
+  // The field width (sizeof buffer - 1) prevents a long line from overflowing buffer.
+  while (fscanf(fplm.getDescriptor(), "%99999[^\n]\n", buffer) == 1)
+  {
+    auto splited = split(buffer, '\t');
+
+    if (splited.size() != 4)
+    {
+      fprintf(stderr, "ERROR (%s) : fplm line \'%s\' wrong format. Aborting.\n", ERRINFO, buffer);
+      exit(1);
+    }
+
+    // Column 0 is the form and column 2 is its lemma.
+    const auto & form = splited[0];
+    const auto & lemma = splited[2];
+    rules[getRule(form, lemma)]++;
+  }
+
+  File rulesFile(rulesFilename, "w");
+  File exceptionsFile(exceptionsFilename, "w");
+
+  for (const auto & it : rules)
+  {
+    FILE * output = it.second >= threshold ? rulesFile.getDescriptor() : exceptionsFile.getDescriptor();
+    fprintf(output, "%s\n", it.first.c_str());
+  }
+
+  return 0;
+}
+
diff --git a/maca_common/src/util.cpp b/maca_common/src/util.cpp
index 965da20..7e921f1 100644
--- a/maca_common/src/util.cpp
+++ b/maca_common/src/util.cpp
@@ -206,32 +206,30 @@ std::string getRule(const std::string & Ufrom, const std::string & Uto)
   std::string from = toLowerCase(Ufrom);
   std::string to = toLowerCase(Uto);
 
-  unsigned int prefixFrom = 0;
-  unsigned int prefixTo = 0;
+  int fromL = getNbSymbols(from);
+  int toL = getNbSymbols(to);
+  int minL = std::min(fromL, toL);
 
-  for(; prefixFrom < Ufrom.size() && prefixTo < Uto.size();)
+  int longestCommonPrefix = 0;
+
+  for (int i = 0; i < minL; i++)
   {
-    if(from[prefixFrom] == to[prefixTo])
-    {
-      prefixFrom++;
-      prefixTo++;
-      continue;
-    }
+    int limitFrom = getEndIndexOfNthSymbol(from, i);
+    int limitTo = getEndIndexOfNthSymbol(to, i);
+
+    if (limitFrom == limitTo && !memcmp(from.c_str(), to.c_str(), limitFrom+1))
+      longestCommonPrefix++;
+    else
       break;
   }
 
-  std::string rule;
-  rule.push_back('@');
-  for(unsigned int i = prefixFrom; i < from.size(); i++)
-    rule.push_back(from[i]);
-  rule.push_back('@');
-  for(unsigned int i = prefixTo; i < to.size(); i++)
-    rule.push_back(to[i]);
+  int prefixEndIndex = getEndIndexOfNthSymbol(from, longestCommonPrefix-1);
+  int suffixStartIndex = prefixEndIndex + 1;
 
-  if(rule.size() >= 20)
-    rule = "@@";
+  std::string toDelete(from.begin()+suffixStartIndex, from.end());
+  std::string toAdd(to.begin()+suffixStartIndex, to.end());
 
-  return rule;
+  return "@" + toDelete + "@" + toAdd;
 }
 
 bool ruleIsAppliable(const std::string & Ufrom, const std::string & rule)
@@ -458,7 +456,7 @@ int getEndIndexOfNthSymbol(const std::string & s, int n)
   auto it = s.begin();
   for (int i = 0; i < n+1; i++)
     try {utf8::next(it, s.end());}
-    catch (utf8::not_enough_room &) {return i == n ? s.end() - s.begin() : -1;}
+    catch (utf8::not_enough_room &) {return i == n ? s.end() - s.begin() - 1 : -1;}
 
   return (it-1) - s.begin();
 }
-- 
GitLab