Skip to content
Snippets Groups Projects
Commit 406e60a7 authored by Franck Dary's avatar Franck Dary
Browse files

Improved generation of lemmatizer rules

parent 9c9135a9
No related branches found
No related tags found
No related merge requests found
...@@ -83,6 +83,50 @@ po::variables_map checkOptions(po::options_description & od, int argc, char ** a ...@@ -83,6 +83,50 @@ po::variables_map checkOptions(po::options_description & od, int argc, char ** a
return vm; return vm;
} }
/// @brief One entry of a fplm file, holding its four tab-separated fields.
///
/// The fields are stored verbatim. A field is considered unknown when it is
/// empty or equal to the placeholder "_".
/// NOTE(review): pos / morpho presumably stand for part-of-speech tag and
/// morphological features -- confirm against the fplm file format.
struct FPLM
{
  std::string form;
  std::string pos;
  std::string lemma;
  std::string morpho;

  /// @brief Build an entry from its four fields.
  ///
  /// @param form The form field.
  /// @param pos The pos field.
  /// @param lemma The lemma field.
  /// @param morpho The morpho field.
  FPLM(const std::string & form, const std::string & pos,
       const std::string & lemma, const std::string & morpho)
    : form(form), pos(pos), lemma(lemma), morpho(morpho)
  {
  }

  /// @brief Serialize the entry as a single tab-separated line (no trailing newline).
  ///
  /// @return form, pos, lemma and morpho joined by tabs, in that order.
  std::string toString() const
  {
    return form + "\t" + pos + "\t" + lemma + "\t" + morpho;
  }

  /// @brief Get the form and pos fields joined by a tab.
  ///
  /// @return form and pos joined by a tab.
  std::string getFp() const
  {
    return form + "\t" + pos;
  }

  /// @brief Count how many of the four fields are known.
  ///
  /// @return A value between 0 and 4: the number of fields that are neither empty nor "_".
  int getNbKnown() const
  {
    int nbKnown = 0;

    for (const std::string * field : {&form, &pos, &lemma, &morpho})
      if (isKnown(*field))
        nbKnown++;

    return nbKnown;
  }

  private :

  /// @brief Tell whether a field carries information.
  ///
  /// @param field The field to test.
  ///
  /// @return false if the field is empty or is the placeholder "_", true otherwise.
  static bool isKnown(const std::string & field)
  {
    return !field.empty() && field != "_";
  }
};
/// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions. /// @brief Given a fplm file (pairs of word / lemma), compute rules that will transform these words into lemmas, as well as exceptions.
/// ///
/// @param argc The number of arguments given to this program. /// @param argc The number of arguments given to this program.
...@@ -105,7 +149,7 @@ int main(int argc, char * argv[]) ...@@ -105,7 +149,7 @@ int main(int argc, char * argv[])
File fplm(fplmFilename, "r"); File fplm(fplmFilename, "r");
char buffer[100000]; char buffer[100000];
std::map<std::string, std::vector<std::string> > rules; std::map<std::string, std::vector<FPLM> > rules;
while (fscanf(fplm.getDescriptor(), "%[^\n]\n", buffer) == 1) while (fscanf(fplm.getDescriptor(), "%[^\n]\n", buffer) == 1)
{ {
auto splited = util::split(buffer, '\t'); auto splited = util::split(buffer, '\t');
...@@ -116,32 +160,65 @@ int main(int argc, char * argv[]) ...@@ -116,32 +160,65 @@ int main(int argc, char * argv[])
exit(1); exit(1);
} }
auto form = splited[0]; std::string form = splited[0];
auto lemma = splited[2]; std::string pos = splited[1];
auto rule = util::getRule(form, lemma); std::string lemma = splited[2];
std::string morpho = splited[3];
std::string rule = util::getRule(form, lemma);
rules[rule].emplace_back(buffer); rules[rule].emplace_back(form, pos, lemma, morpho);
} }
File rulesFile(rulesFilename, "w"); File rulesFile(rulesFilename, "w");
File exceptionsFile(exceptionsFilename, "w"); File exceptionsFile(exceptionsFilename, "w");
std::vector<std::string> exceptionsToPrint;
std::map< std::string, std::vector<FPLM> > exceptions;
std::map<std::string, bool> fpInRules;
for (auto & it : rules) for (auto & it : rules)
{ {
if ((int)it.second.size() >= threshold) if ((int)it.second.size() >= threshold)
{
fprintf(rulesFile.getDescriptor(), "%s\n", it.first.c_str()); fprintf(rulesFile.getDescriptor(), "%s\n", it.first.c_str());
for (auto & fplm : it.second)
fpInRules[fplm.getFp()] = true;
}
else else
for (auto & line : it.second) for (auto & line : it.second)
fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str()); exceptions[line.getFp()].emplace_back(line);
} }
for (auto it : exceptions)
{
if (fpInRules.count(it.first))
continue;
int indexMax = 0;
int knownMax = 0;
for (unsigned int i = 0; i < it.second.size(); i++)
if (it.second[i].getNbKnown() > knownMax)
{
indexMax = i;
knownMax = it.second[i].getNbKnown();
}
exceptionsToPrint.emplace_back(it.second[indexMax].toString());
}
std::sort(exceptionsToPrint.begin(), exceptionsToPrint.end());
for (auto & line : exceptionsToPrint)
fprintf(exceptionsFile.getDescriptor(), "%s\n", line.c_str());
if (debug) if (debug)
{ {
for (auto & it : rules) for (auto & it : rules)
{ {
fprintf(stderr, "<%s> : %lu\n", it.first.c_str(), it.second.size()); fprintf(stderr, "<%s> : %lu\n", it.first.c_str(), it.second.size());
for (auto & example : it.second) for (auto & example : it.second)
fprintf(stderr, "\t<%s>\n", example.c_str()); fprintf(stderr, "\t<%s>\n", example.toString().c_str());
} }
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment