Skip to content
Snippets Groups Projects
Commit 406e60a7 authored by Franck Dary's avatar Franck Dary
Browse files

Improved generation of lemmatizer rules

parent 9c9135a9
No related branches found
No related tags found
No related merge requests found
......@@ -83,6 +83,50 @@ po::variables_map checkOptions(po::options_description & od, int argc, char ** a
return vm;
}
/// @brief Record holding the four fields of one fplm entry: form, pos, lemma and morpho.
///
/// A field that is empty or equal to "_" is considered unknown (see getNbKnown()).
struct FPLM
{
  std::string form;
  std::string pos;
  std::string lemma;
  std::string morpho;

  /// @brief Build an entry from its four fields.
  ///
  /// @param form The form.
  /// @param pos The pos associated with the form.
  /// @param lemma The lemma of the form.
  /// @param morpho The morpho field of the entry.
  FPLM(const std::string & form, const std::string & pos, const std::string & lemma, const std::string & morpho)
  : form(form), pos(pos), lemma(lemma), morpho(morpho)
  {
  }

  /// @brief Serialize the entry as its four fields separated by tabulations.
  ///
  /// @return "form\tpos\tlemma\tmorpho".
  std::string toString() const
  {
    return form + "\t" + pos + "\t" + lemma + "\t" + morpho;
  }

  /// @brief Get the (form, pos) part of the entry, usable as a lookup key.
  ///
  /// @return "form\tpos".
  std::string getFp() const
  {
    return form + "\t" + pos;
  }

  /// @brief Count how many of the four fields carry actual information.
  ///
  /// @return The number of fields that are neither empty nor "_" (between 0 and 4).
  int getNbKnown() const
  {
    // Same predicate for every field : non empty and different from the "_" placeholder.
    const auto isKnown = [](const std::string & field)
    {
      return !field.empty() && field != "_";
    };

    int nbKnown = 0;
    if (isKnown(form))
      nbKnown++;
    if (isKnown(pos))
      nbKnown++;
    if (isKnown(lemma))
      nbKnown++;
    if (isKnown(morpho))
      nbKnown++;

    return nbKnown;
  }
};
/// @brief Given an fplm file (one tab-separated form / pos / lemma / morpho entry per line), compute rules that will transform the forms into their lemmas, as well as exceptions.
///
/// @param argc The number of arguments given to this program.
......@@ -105,7 +149,7 @@ int main(int argc, char * argv[])
File fplm(fplmFilename, "r");
char buffer[100000];
std::map<std::string, std::vector<std::string> > rules;
std::map<std::string, std::vector<FPLM> > rules;
while (fscanf(fplm.getDescriptor(), "%[^\n]\n", buffer) == 1)
{
auto splited = util::split(buffer, '\t');
......@@ -116,32 +160,65 @@ int main(int argc, char * argv[])
exit(1);
}
auto form = splited[0];
auto lemma = splited[2];
auto rule = util::getRule(form, lemma);
std::string form = splited[0];
std::string pos = splited[1];
std::string lemma = splited[2];
std::string morpho = splited[3];
std::string rule = util::getRule(form, lemma);
rules[rule].emplace_back(buffer);
rules[rule].emplace_back(form, pos, lemma, morpho);
}
File rulesFile(rulesFilename, "w");
File exceptionsFile(exceptionsFilename, "w");
std::vector<std::string> exceptionsToPrint;
std::map< std::string, std::vector<FPLM> > exceptions;
std::map<std::string, bool> fpInRules;
for (auto & it : rules)
{
if ((int)it.second.size() >= threshold)
{
fprintf(rulesFile.getDescriptor(), "%s\n", it.first.c_str());
for (auto & fplm : it.second)
fpInRules[fplm.getFp()] = true;
}
else
for (auto & line : it.second)
fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str());
exceptions[line.getFp()].emplace_back(line);
}
for (auto it : exceptions)
{
if (fpInRules.count(it.first))
continue;
int indexMax = 0;
int knownMax = 0;
for (unsigned int i = 0; i < it.second.size(); i++)
if (it.second[i].getNbKnown() > knownMax)
{
indexMax = i;
knownMax = it.second[i].getNbKnown();
}
exceptionsToPrint.emplace_back(it.second[indexMax].toString());
}
std::sort(exceptionsToPrint.begin(), exceptionsToPrint.end());
for (auto & line : exceptionsToPrint)
fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str());
if (debug)
{
for (auto & it : rules)
{
fprintf(stderr, "<%s> : %lu\n", it.first.c_str(), it.second.size());
for (auto & example : it.second)
fprintf(stderr, "\t<%s>\n", example.c_str());
fprintf(stderr, "\t<%s>\n", example.toString().c_str());
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment