From 69624ae5b4f6637827e9fc0354cadbf1bf90239f Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Thu, 21 Feb 2019 15:38:50 +0100 Subject: [PATCH] Added dictCapacity as a program argument --- decoder/src/macaon_decode.cpp | 3 +++ maca_common/include/Dict.hpp | 4 ---- maca_common/include/ProgramParameters.hpp | 1 + maca_common/src/Dict.cpp | 12 +++++++++--- maca_common/src/ProgramParameters.cpp | 1 + trainer/src/Trainer.cpp | 19 +++++++++++++++++++ trainer/src/macaon_train.cpp | 3 +++ transition_machine/src/ActionBank.cpp | 1 - 8 files changed, 36 insertions(+), 8 deletions(-) diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp index 3288568..1be61fd 100644 --- a/decoder/src/macaon_decode.cpp +++ b/decoder/src/macaon_decode.cpp @@ -51,6 +51,8 @@ po::options_description getOptionsDescription() "For each state of the Config, show its feature representation") ("readSize", po::value<int>()->default_value(0), "The number of lines of input that will be read and stored in memory at once.") + ("dictCapacity", po::value<int>()->default_value(30000), + "The maximal size of each Dict (number of differents embeddings).") ("interactive", po::value<bool>()->default_value(true), "Is the shell interactive ? Display advancement informations") ("lang", po::value<std::string>()->default_value("fr"), @@ -149,6 +151,7 @@ int main(int argc, char * argv[]) ProgramParameters::readSize = vm["readSize"].as<int>(); if (ProgramParameters::readSize == 0) ProgramParameters::readSize = ProgramParameters::tapeSize; + ProgramParameters::dictCapacity = vm["dictCapacity"].as<int>(); ProgramParameters::beamSize = vm["beamSize"].as<int>(); ProgramParameters::nbChilds = vm["nbChilds"].as<int>(); ProgramParameters::optimizer = "none"; diff --git a/maca_common/include/Dict.hpp b/maca_common/include/Dict.hpp index db95ee2..f783873 100644 --- a/maca_common/include/Dict.hpp +++ b/maca_common/include/Dict.hpp @@ -95,10 +95,6 @@ class Dict private : - /// @brief The maximum number of entry a Dict can hold. - /// - /// This limit exists because the dynet LookupParameter associed with the Dict is not dynamic and must come with a fixed finite size. - static constexpr unsigned int MAX_CAPACITY = 200000; /// @brief The dimension of each vector of this Dict (in number of float). int dimension; /// @brief A storage that map every string entry of this Dict to its index as a lookup parameter. diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp index 9f30b7e..675843d 100644 --- a/maca_common/include/ProgramParameters.hpp +++ b/maca_common/include/ProgramParameters.hpp @@ -66,6 +66,7 @@ struct ProgramParameters static int tapeSize; static int devTapeSize; static int readSize; + static int dictCapacity; static bool printOutputEntropy; private : diff --git a/maca_common/src/Dict.cpp b/maca_common/src/Dict.cpp index d974ca9..37aee0d 100644 --- a/maca_common/src/Dict.cpp +++ b/maca_common/src/Dict.cpp @@ -77,7 +77,7 @@ void Dict::init(dynet::ParameterCollection & pc) } isInit = true; - this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension}); + this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension}); addEntry(nullValueStr); addEntry(unknownValueStr); } @@ -125,7 +125,7 @@ void Dict::initFromFile(dynet::ParameterCollection & pc) } ftVector.reset(new fasttext::Vector(dimension)); - this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension}); + this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension}); } // If policy is FromZero, we don't need to read the current entries @@ -140,7 +140,7 @@ void Dict::initFromFile(dynet::ParameterCollection & pc) if (readIndex == -1) // No parameters to read { - this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension}); + this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension}); addEntry(nullValueStr); addEntry(unknownValueStr); return; @@ -361,6 +361,12 @@ unsigned int Dict::addEntry(const std::string & s) auto index = str2index.size(); str2index.emplace(s, index); + if ((int)str2index.size() >= ProgramParameters::dictCapacity) + { + fprintf(stderr, "ERROR (%s) : Dict %s of maximal capacity %d is full. Aborting.\n", ERRINFO, name.c_str(), ProgramParameters::dictCapacity); + exit(1); + } + if(mode == Mode::OneHot) { if(oneHotIndex >= dimension) diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp index 6e5b999..593e22c 100644 --- a/maca_common/src/ProgramParameters.cpp +++ b/maca_common/src/ProgramParameters.cpp @@ -61,4 +61,5 @@ int ProgramParameters::tapeSize; int ProgramParameters::devTapeSize; int ProgramParameters::readSize; bool ProgramParameters::printOutputEntropy; +int ProgramParameters::dictCapacity; diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp index dc00a0e..be1de41 100644 --- a/trainer/src/Trainer.cpp +++ b/trainer/src/Trainer.cpp @@ -140,7 +140,13 @@ void Trainer::computeScoreOnDev() } } + if (ProgramParameters::debug) + fprintf(stderr, "Dev Config is final\n"); + TI.computeDevScores(); + + if (ProgramParameters::debug) + fprintf(stderr, "End of %s\n", __func__); } void Trainer::train() @@ -320,6 +326,9 @@ void Trainer::train() } } + if (ProgramParameters::debug) + fprintf(stderr, "Config is final\n"); + if (ProgramParameters::iterationSize == -1) { printScoresAndSave(stderr); @@ -329,6 +338,9 @@ void Trainer::train() if (TI.getEpoch() > ProgramParameters::nbIter) break; } + + if (ProgramParameters::debug) + fprintf(stderr, "End of epoch\n"); } } @@ -342,10 +354,17 @@ void Trainer::printScoresAndSave(FILE * output) for (auto * cla : classifiers) if (TI.mustSave(cla->name)) { + if (ProgramParameters::debug) + fprintf(stderr, "Saving %s...", cla->name.c_str()); cla->save(ProgramParameters::expPath + cla->name + ".model"); Dict::saveDicts(ProgramParameters::expPath, cla->name); + if (ProgramParameters::debug) + fprintf(stderr, "Done !\n"); } TI.printScores(output); + + if (ProgramParameters::debug) + fprintf(stderr, "End of %s\n", __func__); } diff --git a/trainer/src/macaon_train.cpp b/trainer/src/macaon_train.cpp index dd77570..325ef72 100644 --- a/trainer/src/macaon_train.cpp +++ b/trainer/src/macaon_train.cpp @@ -79,6 +79,8 @@ po::options_description getOptionsDescription() "The value of the token that act as a delimiter for sequences") ("batchSize", po::value<int>()->default_value(50), "The size of each minibatch (in number of taining examples)") + ("dictCapacity", po::value<int>()->default_value(30000), + "The maximal size of each Dict (number of differents embeddings).") ("printTime", "Print time on stderr") ("shuffle", po::value<bool>()->default_value(true), "Shuffle examples after each iteration"); @@ -268,6 +270,7 @@ int main(int argc, char * argv[]) ProgramParameters::nbIter = vm["nbiter"].as<int>(); ProgramParameters::seed = vm["seed"].as<int>(); ProgramParameters::batchSize = vm["batchSize"].as<int>(); + ProgramParameters::dictCapacity = vm["dictCapacity"].as<int>(); ProgramParameters::nbTrain = vm["nbTrain"].as<int>(); ProgramParameters::removeDuplicates = vm["duplicates"].as<bool>(); ProgramParameters::interactive = vm["interactive"].as<bool>(); diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp index a423f8f..4737d48 100644 --- a/transition_machine/src/ActionBank.cpp +++ b/transition_machine/src/ActionBank.cpp @@ -370,7 +370,6 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na ba.data += "+"+std::to_string(s-b0); } } - } if (rootIndex == -1) -- GitLab