From 406e60a755d37fc88af6369f68f602e9ba9fba26 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Wed, 23 Oct 2019 12:01:29 +0200
Subject: [PATCH] Improved generation of lemmatizer rules

---
Review notes (commentary only; text between the three-dash line and the
diff is not part of the commit):

 * The line structure of this patch was restored from a copy whose newlines
   had been collapsed. Blank lines were placed so that every hunk matches the
   line counts in its @@ header; two-space indentation and the exact position
   of the blank context line in the second hunk are assumptions -- confirm
   against the target file before applying.
 * FPLM::toString(), FPLM::getFp() and FPLM::getNbKnown() only read members
   but are not declared const, so they cannot be called through a const
   reference.
 * The FPLM constructor takes its four strings by value and then
   copy-assigns them; a member initializer list with std::move would avoid
   the second copy.
 * `for (auto it : exceptions)` copies each map entry (key plus its
   std::vector<FPLM>) on every iteration; `const auto & it` reads the same
   data without the copy.
 * `%[^\n]` carries no field width, so an input line longer than
   buffer[100000] writes past the end of the buffer. This line is unchanged
   context, i.e. it predates this patch.
 * splited[1] and splited[3] are newly read. The size check that precedes
   this hunk is not visible here -- confirm it rejects lines with fewer than
   four fields.
 * std::sort is newly used -- confirm <algorithm> is included by this file.

 maca_common/src/macaon_compute_l_rules.cpp | 91 ++++++++++++++++++++--
 1 file changed, 84 insertions(+), 7 deletions(-)

diff --git a/maca_common/src/macaon_compute_l_rules.cpp b/maca_common/src/macaon_compute_l_rules.cpp
index 475f20d..d62c9c7 100644
--- a/maca_common/src/macaon_compute_l_rules.cpp
+++ b/maca_common/src/macaon_compute_l_rules.cpp
@@ -83,6 +83,50 @@ po::variables_map checkOptions(po::options_description & od, int argc, char ** a
   return vm;
 }
 
+struct FPLM
+{
+  std::string form;
+  std::string pos;
+  std::string lemma;
+  std::string morpho;
+
+  FPLM(std::string form, std::string pos, std::string lemma, std::string morpho)
+  {
+    this->form = form;
+    this->pos = pos;
+    this->lemma = lemma;
+    this->morpho = morpho;
+  }
+
+  std::string toString()
+  {
+    return form + "\t" + pos + "\t" + lemma + "\t" + morpho;
+  }
+
+  std::string getFp()
+  {
+    return form + "\t" + pos;
+  }
+
+  int getNbKnown()
+  {
+    int nbKnown = 0;
+
+    if (!form.empty() && form != "_")
+      nbKnown++;
+    if (!pos.empty() && pos != "_")
+      nbKnown++;
+    if (!lemma.empty() && lemma != "_")
+      nbKnown++;
+    if (!morpho.empty() && morpho != "_")
+      nbKnown++;
+
+    return nbKnown;
+  }
+
+
+};
+
 /// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions.
 ///
 /// @param argc The number of arguments given to this program.
@@ -105,7 +149,7 @@ int main(int argc, char * argv[])
   File fplm(fplmFilename, "r");
   char buffer[100000];
 
-  std::map<std::string, std::vector<std::string> > rules;
+  std::map<std::string, std::vector<FPLM> > rules;
   while (fscanf(fplm.getDescriptor(), "%[^\n]\n", buffer) == 1)
   {
     auto splited = util::split(buffer, '\t');
@@ -116,32 +160,65 @@ int main(int argc, char * argv[])
       exit(1);
     }
 
-    auto form = splited[0];
-    auto lemma = splited[2];
-    auto rule = util::getRule(form, lemma);
+    std::string form = splited[0];
+    std::string pos = splited[1];
+    std::string lemma = splited[2];
+    std::string morpho = splited[3];
+    std::string rule = util::getRule(form, lemma);
 
-    rules[rule].emplace_back(buffer);
+    rules[rule].emplace_back(form, pos, lemma, morpho);
   }
 
   File rulesFile(rulesFilename, "w");
   File exceptionsFile(exceptionsFilename, "w");
 
+  std::vector<std::string> exceptionsToPrint;
+  std::map< std::string, std::vector<FPLM> > exceptions;
+  std::map<std::string, bool> fpInRules;
+
   for (auto & it : rules)
   {
     if ((int)it.second.size() >= threshold)
+    {
       fprintf(rulesFile.getDescriptor(), "%s\n", it.first.c_str());
+      for (auto & fplm : it.second)
+        fpInRules[fplm.getFp()] = true;
+    }
     else
       for (auto & line : it.second)
-        fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str());
+        exceptions[line.getFp()].emplace_back(line);
   }
 
+  for (auto it : exceptions)
+  {
+    if (fpInRules.count(it.first))
+      continue;
+
+    int indexMax = 0;
+    int knownMax = 0;
+
+    for (unsigned int i = 0; i < it.second.size(); i++)
+      if (it.second[i].getNbKnown() > knownMax)
+      {
+        indexMax = i;
+        knownMax = it.second[i].getNbKnown();
+      }
+
+    exceptionsToPrint.emplace_back(it.second[indexMax].toString());
+  }
+
+  std::sort(exceptionsToPrint.begin(), exceptionsToPrint.end());
+
+  for (auto & line : exceptionsToPrint)
+    fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str());
+
   if (debug)
   {
     for (auto & it : rules)
     {
       fprintf(stderr, "<%s> : %lu\n", it.first.c_str(), it.second.size());
       for (auto & example : it.second)
-        fprintf(stderr, "\t<%s>\n", example.c_str());
+        fprintf(stderr, "\t<%s>\n", example.toString().c_str());
     }
   }
 
-- 
GitLab