Skip to content
Snippets Groups Projects
Commit ca56eef3 authored by Franck Dary's avatar Franck Dary
Browse files

Added program macaon_compute_l_rules

parent 0307121c
Branches
No related tags found
No related merge requests found
FILE(GLOB SOURCES src/*.cpp) FILE(GLOB SOURCES src/*.cpp)
add_executable(macaon_compute_l_rules src/macaon_compute_l_rules.cpp)
target_link_libraries(macaon_compute_l_rules ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries(macaon_compute_l_rules maca_common)
install(TARGETS macaon_compute_l_rules DESTINATION bin)
#compiling library #compiling library
add_library(maca_common STATIC ${SOURCES}) add_library(maca_common STATIC ${SOURCES})
target_link_libraries(maca_common fasttext) target_link_libraries(maca_common fasttext)
/// \file macaon_compute_l_rules.cpp
/// \author Franck Dary
/// @version 1.0
/// @date 2019-04-10
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include "File.hpp"
#include "util.hpp"
#include <boost/program_options.hpp>
namespace po = boost::program_options;
/// @brief Build the description of every command-line option this program accepts.
///
/// The options are split in two groups: a "Required" group (fplm, exceptions,
/// rules, threshold) and an "Optional" group (help, strict, debug).
///
/// @return A single description containing both groups, required group first.
po::options_description getOptionsDescription()
{
  po::options_description allOptions("Command-Line Arguments ");

  // Options the user must always provide.
  po::options_description required("Required");
  auto addRequired = required.add_options();
  addRequired("fplm,f", po::value<std::string>()->required(),
    "fplm file that contains words and their lemmas");
  addRequired("exceptions,e", po::value<std::string>()->required(),
    "Output filename for exceptions");
  addRequired("rules,r", po::value<std::string>()->required(),
    "Output filename for rules");
  addRequired("threshold,t", po::value<int>()->required(),
    "Number of times a rule must be used in the fplm before it is outputted");

  // Flags that may be omitted.
  po::options_description optional("Optional");
  auto addOptional = optional.add_options();
  addOptional("help,h", "Produce this help message");
  addOptional("strict,s", "TODO : find what it does");
  addOptional("debug,d", "Print infos on stderr");

  allOptions.add(required);
  allOptions.add(optional);

  return allOptions;
}
/// @brief Parse the command line and store the result inside a variables_map.
///
/// If parsing or validation throws, the error message and the usage are
/// printed on stderr and the program exits with status 1. If the help option
/// is present, the usage is printed on stdout and the program exits with
/// status 0; this check happens before required options are validated.
///
/// @param od The description of all the possible options.
/// @param argc The number of arguments given to this program.
/// @param argv The values of arguments given to this program.
///
/// @return The variables map holding the parsed options.
po::variables_map checkOptions(po::options_description & od, int argc, char ** argv)
{
  // Shared failure path: report the problem followed by the usage, then quit.
  auto abortWithUsage = [&od](const std::exception & error)
  {
    std::cerr << "Error: " << error.what() << "\n";
    od.print(std::cerr);
    exit(1);
  };

  po::variables_map vm;

  try
  {
    po::store(po::parse_command_line(argc, argv, od), vm);
  }
  catch (std::exception & error)
  {
    abortWithUsage(error);
  }

  // Asking for help takes precedence over the validation done by notify().
  if (vm.count("help") != 0)
  {
    std::cout << od << "\n";
    exit(0);
  }

  try
  {
    po::notify(vm);
  }
  catch (std::exception & error)
  {
    abortWithUsage(error);
  }

  return vm;
}
/// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions.
///
/// @param argc The number of arguments given to this program.
/// @param argv[] Array of arguments given to this program.
///
/// @return 0 if there was no crash.
int main(int argc, char * argv[])
{
auto od = getOptionsDescription();
po::variables_map vm = checkOptions(od, argc, argv);
std::string fplmFilename = vm["fplm"].as<std::string>();
std::string exceptionsFilename = vm["exceptions"].as<std::string>();
std::string rulesFilename = vm["rules"].as<std::string>();
int threshold = vm["threshold"].as<int>();
bool strict = vm.count("strict") == 0 ? false : true;
File fplm(fplmFilename, "r");
char buffer[100000];
std::map<std::string, int> rules;
while (fscanf(fplm.getDescriptor(), "%[^\n]\n", buffer) == 1)
{
auto splited = split(buffer, '\t');
if (splited.size() != 4)
{
fprintf(stderr, "ERROR (%s) : fplm line \'%s\' wrong format. Aborting.\n", ERRINFO, buffer);
exit(1);
}
auto form = splited[0];
auto lemma = splited[2];
auto rule = getRule(form, lemma);
rules[rule]++;
}
File rulesFile(rulesFilename, "w");
File exceptionsFile(exceptionsFilename, "w");
for (auto & it : rules)
{
if (it.second >= threshold)
fprintf(rulesFile.getDescriptor(), "%s\n", it.first.c_str());
else
fprintf(exceptionsFile.getDescriptor(), "%s\n", it.first.c_str());
}
return 0;
}
...@@ -206,32 +206,30 @@ std::string getRule(const std::string & Ufrom, const std::string & Uto) ...@@ -206,32 +206,30 @@ std::string getRule(const std::string & Ufrom, const std::string & Uto)
std::string from = toLowerCase(Ufrom); std::string from = toLowerCase(Ufrom);
std::string to = toLowerCase(Uto); std::string to = toLowerCase(Uto);
unsigned int prefixFrom = 0; int fromL = getNbSymbols(from);
unsigned int prefixTo = 0; int toL = getNbSymbols(to);
int minL = std::min(fromL, toL);
for(; prefixFrom < Ufrom.size() && prefixTo < Uto.size();) int longestCommonPrefix = 0;
{
if(from[prefixFrom] == to[prefixTo]) for (int i = 0; i < minL; i++)
{ {
prefixFrom++; int limitFrom = getEndIndexOfNthSymbol(from, i);
prefixTo++; int limitTo = getEndIndexOfNthSymbol(to, i);
continue;
} if (limitFrom == limitTo && !memcmp(from.c_str(), to.c_str(), limitFrom+1))
longestCommonPrefix++;
else
break; break;
} }
std::string rule; int prefixEndIndex = getEndIndexOfNthSymbol(from, longestCommonPrefix-1);
rule.push_back('@'); int suffixStartIndex = prefixEndIndex + 1;
for(unsigned int i = prefixFrom; i < from.size(); i++)
rule.push_back(from[i]);
rule.push_back('@');
for(unsigned int i = prefixTo; i < to.size(); i++)
rule.push_back(to[i]);
if(rule.size() >= 20) std::string toDelete(from.begin()+suffixStartIndex, from.end());
rule = "@@"; std::string toAdd(to.begin()+suffixStartIndex, to.end());
return rule; return "@" + toDelete + "@" + toAdd;
} }
bool ruleIsAppliable(const std::string & Ufrom, const std::string & rule) bool ruleIsAppliable(const std::string & Ufrom, const std::string & rule)
...@@ -458,7 +456,7 @@ int getEndIndexOfNthSymbol(const std::string & s, int n) ...@@ -458,7 +456,7 @@ int getEndIndexOfNthSymbol(const std::string & s, int n)
auto it = s.begin(); auto it = s.begin();
for (int i = 0; i < n+1; i++) for (int i = 0; i < n+1; i++)
try {utf8::next(it, s.end());} try {utf8::next(it, s.end());}
catch (utf8::not_enough_room &) {return i == n ? s.end() - s.begin() : -1;} catch (utf8::not_enough_room &) {return i == n ? s.end() - s.begin() - 1 : -1;}
return (it-1) - s.begin(); return (it-1) - s.begin();
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment