diff --git a/maca_common/CMakeLists.txt b/maca_common/CMakeLists.txt index e7a36d6be487550c3bded4fa06699dcd4245957b..11a9756e73ea4b2bbd7e5672d719998f3ed861a7 100644 --- a/maca_common/CMakeLists.txt +++ b/maca_common/CMakeLists.txt @@ -1,5 +1,10 @@ FILE(GLOB SOURCES src/*.cpp) +add_executable(macaon_compute_l_rules src/macaon_compute_l_rules.cpp) +target_link_libraries(macaon_compute_l_rules ${Boost_PROGRAM_OPTIONS_LIBRARY}) +target_link_libraries(macaon_compute_l_rules maca_common) +install(TARGETS macaon_compute_l_rules DESTINATION bin) + #compiling library add_library(maca_common STATIC ${SOURCES}) target_link_libraries(maca_common fasttext) diff --git a/maca_common/src/macaon_compute_l_rules.cpp b/maca_common/src/macaon_compute_l_rules.cpp new file mode 100644 index 0000000000000000000000000000000000000000..12a1b94f761571cf166843d022d30f3c290361f8 --- /dev/null +++ b/maca_common/src/macaon_compute_l_rules.cpp @@ -0,0 +1,131 @@ +/// \file macaon_compute_l_rules.cpp +/// \author Franck Dary +/// @version 1.0 +/// @date 2019-04-10 + +#include <cstdio> +#include <cstdlib> +#include <iostream> +#include "File.hpp" +#include "util.hpp" +#include <boost/program_options.hpp> + +namespace po = boost::program_options; + +/// @brief Get the list of mandatory and optional program arguments. +/// +/// @return The lists. 
+po::options_description getOptionsDescription()
+{
+  // Arguments that must be present on every invocation.
+  po::options_description mandatory("Required");
+  mandatory.add_options()
+    ("fplm,f", po::value<std::string>()->required(),
+      "fplm file that contains words and their lemmas")
+    ("exceptions,e", po::value<std::string>()->required(),
+      "Output filename for exceptions")
+    ("rules,r", po::value<std::string>()->required(),
+      "Output filename for rules")
+    ("threshold,t", po::value<int>()->required(),
+      "Number of times a rule must be used in the fplm before it is outputted");
+
+  po::options_description extras("Optional");
+  extras.add_options()
+    ("help,h", "Produce this help message")
+    ("strict,s", "TODO : find what it does")
+    ("debug,d", "Print infos on stderr");
+
+  po::options_description everything("Command-Line Arguments ");
+  everything.add(mandatory).add(extras);
+  return everything;
+}
+
+/// @brief Parse the command line into a variables_map; exits on error or when help is requested.
+///
+/// @param od The description of all the possible options.
+/// @param argc The number of arguments given to this program.
+/// @param argv The values of arguments given to this program.
+///
+/// @return The variables map holding the parsed arguments.
+po::variables_map checkOptions(po::options_description & od, int argc, char ** argv)
+{
+  // Shared failure path : report the problem, print the usage, then quit with status 1.
+  auto abortWithUsage = [&od](const std::exception & error)
+  {
+    std::cerr << "Error: " << error.what() << "\n";
+    od.print(std::cerr);
+    exit(1);
+  };
+
+  po::variables_map parsed;
+
+  try {po::store(po::parse_command_line(argc, argv, od), parsed);}
+  catch (std::exception & error) {abortWithUsage(error);}
+
+  // The help test comes before notify() so that asking for help works without the required options.
+  if (parsed.count("help") != 0)
+  {
+    std::cout << od << "\n";
+    exit(0);
+  }
+
+  // notify() is the step that throws when a required option is missing.
+  try {po::notify(parsed);}
+  catch (std::exception & error) {abortWithUsage(error);}
+
+  return parsed;
+}
+
+/// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions.
+///
+/// @param argc The number of arguments given to this program.
+/// @param argv[] Array of arguments given to this program.
+///
+/// @return 0 if there was no crash.
+int main(int argc, char * argv[])
+{
+  auto od = getOptionsDescription();
+  po::variables_map vm = checkOptions(od, argc, argv);
+
+  // NOTE : --strict and --debug are accepted by the parser but nothing reads them here yet.
+  std::string fplmFilename = vm["fplm"].as<std::string>();
+  std::string exceptionsFilename = vm["exceptions"].as<std::string>();
+  std::string rulesFilename = vm["rules"].as<std::string>();
+  int threshold = vm["threshold"].as<int>();
+
+  File fplm(fplmFilename, "r");
+  char buffer[100000];
+
+  std::map<std::string, int> rules;
+  // The field width (size of buffer minus one) keeps an overlong line from overflowing buffer.
+  while (fscanf(fplm.getDescriptor(), "%99999[^\n]\n", buffer) == 1)
+  {
+    auto splited = split(buffer, '\t');
+
+    if (splited.size() != 4)
+    {
+      fprintf(stderr, "ERROR (%s) : fplm line \'%s\' wrong format. Aborting.\n", ERRINFO, buffer);
+      exit(1);
+    }
+
+    auto form = splited[0];
+    auto lemma = splited[2];
+    auto rule = getRule(form, lemma);
+    rules[rule]++;
+  }
+
+  File rulesFile(rulesFilename, "w");
+  File exceptionsFile(exceptionsFilename, "w");
+
+  // Rules counted at least threshold times go to the rules file, the rarer ones to the exceptions file.
+  for (auto & it : rules)
+  {
+    if (it.second >= threshold)
+      fprintf(rulesFile.getDescriptor(), "%s\n", it.first.c_str());
+    else
+      fprintf(exceptionsFile.getDescriptor(), "%s\n", it.first.c_str());
+  }
+
+  return 0;
+}
+
diff --git a/maca_common/src/util.cpp b/maca_common/src/util.cpp
index 965da20d4bf60bb147a0d72b7a83551e9b98264f..7e921f1c85361fc5ba259ec0c678856262fd2d06 100644
--- a/maca_common/src/util.cpp
+++ b/maca_common/src/util.cpp
@@ -206,32 +206,30 @@ std::string getRule(const std::string & Ufrom, const std::string & Uto)
   std::string from = toLowerCase(Ufrom);
   std::string to = toLowerCase(Uto);
 
-  unsigned int prefixFrom = 0;
-  unsigned int prefixTo = 0;
+  int fromL = getNbSymbols(from);
+  int toL = getNbSymbols(to);
+  int minL = std::min(fromL, toL);
 
-  for(; prefixFrom < Ufrom.size() && prefixTo < Uto.size();)
+  int longestCommonPrefix = 0;
+
+  for (int i = 0; i < minL; i++)
   {
-    if(from[prefixFrom] == to[prefixTo])
-    {
-      prefixFrom++;
-      prefixTo++;
-      continue;
-    }
+    int limitFrom = getEndIndexOfNthSymbol(from, i);
+    int limitTo = getEndIndexOfNthSymbol(to, i);
+
+    if (limitFrom == limitTo && !memcmp(from.c_str(), to.c_str(), limitFrom+1))
+      longestCommonPrefix++;
+    else
       break;
   }
 
-  std::string rule;
-  rule.push_back('@');
-  for(unsigned int i = prefixFrom; i < from.size(); i++)
-    rule.push_back(from[i]);
-  rule.push_back('@');
-  for(unsigned int i = prefixTo; i < to.size(); i++)
-    rule.push_back(to[i]);
+  int prefixEndIndex = getEndIndexOfNthSymbol(from, longestCommonPrefix-1);
+  int suffixStartIndex = prefixEndIndex + 1;
 
-  if(rule.size() >= 20)
-    rule = "@@";
+  std::string toDelete(from.begin()+suffixStartIndex, from.end());
+  std::string toAdd(to.begin()+suffixStartIndex, to.end());
 
-  return rule;
+  return "@" + toDelete + "@" + toAdd;
 }
 
 bool ruleIsAppliable(const std::string & Ufrom, const std::string & rule)
@@ -458,7 +456,7 @@ int getEndIndexOfNthSymbol(const std::string & s, int n)
   auto it = s.begin();
   for (int i = 0; i < n+1; i++)
     try {utf8::next(it, s.end());}
-    catch (utf8::not_enough_room &) {return i == n ? s.end() - s.begin() : -1;}
+    catch (utf8::not_enough_room &) {return i == n ? s.end() - s.begin() - 1 : -1;}
 
   return (it-1) - s.begin();
 }