diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp index 3288568a9a88c63df1fa5ad221b4fe6c548a6d43..1be61fdb9e3f497c33c551be1107038241cb39e0 100644 --- a/decoder/src/macaon_decode.cpp +++ b/decoder/src/macaon_decode.cpp @@ -51,6 +51,8 @@ po::options_description getOptionsDescription() "For each state of the Config, show its feature representation") ("readSize", po::value<int>()->default_value(0), "The number of lines of input that will be read and stored in memory at once.") + ("dictCapacity", po::value<int>()->default_value(30000), + "The maximal size of each Dict (number of differents embeddings).") ("interactive", po::value<bool>()->default_value(true), "Is the shell interactive ? Display advancement informations") ("lang", po::value<std::string>()->default_value("fr"), @@ -149,6 +151,7 @@ int main(int argc, char * argv[]) ProgramParameters::readSize = vm["readSize"].as<int>(); if (ProgramParameters::readSize == 0) ProgramParameters::readSize = ProgramParameters::tapeSize; + ProgramParameters::dictCapacity = vm["dictCapacity"].as<int>(); ProgramParameters::beamSize = vm["beamSize"].as<int>(); ProgramParameters::nbChilds = vm["nbChilds"].as<int>(); ProgramParameters::optimizer = "none"; diff --git a/maca_common/include/Dict.hpp b/maca_common/include/Dict.hpp index db95ee27784ba513566fa0bc90e1a7f4b3a9d49a..f78387385ac1e8c688540dff2251c099dfeae734 100644 --- a/maca_common/include/Dict.hpp +++ b/maca_common/include/Dict.hpp @@ -95,10 +95,6 @@ class Dict private : - /// @brief The maximum number of entry a Dict can hold. - /// - /// This limit exists because the dynet LookupParameter associed with the Dict is not dynamic and must come with a fixed finite size. - static constexpr unsigned int MAX_CAPACITY = 200000; /// @brief The dimension of each vector of this Dict (in number of float). int dimension; /// @brief A storage that map every string entry of this Dict to its index as a lookup parameter. diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp index 9f30b7e74c15a684fccbaa4446c0394dc1c48f08..675843d7044069c64d69d9a1e6044abd72f37a6b 100644 --- a/maca_common/include/ProgramParameters.hpp +++ b/maca_common/include/ProgramParameters.hpp @@ -66,6 +66,7 @@ struct ProgramParameters static int tapeSize; static int devTapeSize; static int readSize; + static int dictCapacity; static bool printOutputEntropy; private : diff --git a/maca_common/src/Dict.cpp b/maca_common/src/Dict.cpp index d974ca927cb5579cce10ecb3714db8ad60967113..37aee0d198dc95b4bd1a65afaf1c73d95d141223 100644 --- a/maca_common/src/Dict.cpp +++ b/maca_common/src/Dict.cpp @@ -77,7 +77,7 @@ void Dict::init(dynet::ParameterCollection & pc) } isInit = true; - this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension}); + this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension}); addEntry(nullValueStr); addEntry(unknownValueStr); } @@ -125,7 +125,7 @@ void Dict::initFromFile(dynet::ParameterCollection & pc) } ftVector.reset(new fasttext::Vector(dimension)); - this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension}); + this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension}); } // If policy is FromZero, we don't need to read the current entries @@ -140,7 +140,7 @@ void Dict::initFromFile(dynet::ParameterCollection & pc) if (readIndex == -1) // No parameters to read { - this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension}); + this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension}); addEntry(nullValueStr); addEntry(unknownValueStr); return; @@ -361,6 +361,12 @@ unsigned int Dict::addEntry(const std::string & s) auto index = str2index.size(); str2index.emplace(s, index); + if ((int)str2index.size() >= ProgramParameters::dictCapacity) + { + fprintf(stderr, "ERROR (%s) : Dict %s of maximal capacity %d is full. Aborting.\n", ERRINFO, name.c_str(), ProgramParameters::dictCapacity); + exit(1); + } + if(mode == Mode::OneHot) { if(oneHotIndex >= dimension) diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp index 6e5b9991d99278cb12cf290fd49a86ff568e85e6..593e22cf21002aa0ed882f84bc51e715c0c18968 100644 --- a/maca_common/src/ProgramParameters.cpp +++ b/maca_common/src/ProgramParameters.cpp @@ -61,4 +61,5 @@ int ProgramParameters::tapeSize; int ProgramParameters::devTapeSize; int ProgramParameters::readSize; bool ProgramParameters::printOutputEntropy; +int ProgramParameters::dictCapacity; diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp index dc00a0ef10c3fe1b90d30f189f948a73371c8cab..be1de414248741e87c52b184e82e485b6eb034e0 100644 --- a/trainer/src/Trainer.cpp +++ b/trainer/src/Trainer.cpp @@ -140,7 +140,13 @@ void Trainer::computeScoreOnDev() } } + if (ProgramParameters::debug) + fprintf(stderr, "Dev Config is final\n"); + TI.computeDevScores(); + + if (ProgramParameters::debug) + fprintf(stderr, "End of %s\n", __func__); } void Trainer::train() @@ -320,6 +326,9 @@ void Trainer::train() } } + if (ProgramParameters::debug) + fprintf(stderr, "Config is final\n"); + if (ProgramParameters::iterationSize == -1) { printScoresAndSave(stderr); @@ -329,6 +338,9 @@ void Trainer::train() if (TI.getEpoch() > ProgramParameters::nbIter) break; } + + if (ProgramParameters::debug) + fprintf(stderr, "End of epoch\n"); } } @@ -342,10 +354,17 @@ void Trainer::printScoresAndSave(FILE * output) for (auto * cla : classifiers) if (TI.mustSave(cla->name)) { + if (ProgramParameters::debug) + fprintf(stderr, "Saving %s...", cla->name.c_str()); cla->save(ProgramParameters::expPath + cla->name + ".model"); Dict::saveDicts(ProgramParameters::expPath, cla->name); + if (ProgramParameters::debug) + fprintf(stderr, "Done !\n"); } TI.printScores(output); + + if (ProgramParameters::debug) + fprintf(stderr, "End of %s\n", __func__); } diff --git a/trainer/src/macaon_train.cpp b/trainer/src/macaon_train.cpp index dd77570651e2ee4519ab41ca7e8704a4d6f5edd1..325ef72717b56fa545af7990756f7874391907eb 100644 --- a/trainer/src/macaon_train.cpp +++ b/trainer/src/macaon_train.cpp @@ -79,6 +79,8 @@ po::options_description getOptionsDescription() "The value of the token that act as a delimiter for sequences") ("batchSize", po::value<int>()->default_value(50), "The size of each minibatch (in number of taining examples)") + ("dictCapacity", po::value<int>()->default_value(30000), + "The maximal size of each Dict (number of differents embeddings).") ("printTime", "Print time on stderr") ("shuffle", po::value<bool>()->default_value(true), "Shuffle examples after each iteration"); @@ -268,6 +270,7 @@ int main(int argc, char * argv[]) ProgramParameters::nbIter = vm["nbiter"].as<int>(); ProgramParameters::seed = vm["seed"].as<int>(); ProgramParameters::batchSize = vm["batchSize"].as<int>(); + ProgramParameters::dictCapacity = vm["dictCapacity"].as<int>(); ProgramParameters::nbTrain = vm["nbTrain"].as<int>(); ProgramParameters::removeDuplicates = vm["duplicates"].as<bool>(); ProgramParameters::interactive = vm["interactive"].as<bool>(); diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp index a423f8fbf13cd3e8c6648d85615b7eaeb83a8911..4737d48da91aaaf467973b06f89334fc75d5cf9c 100644 --- a/transition_machine/src/ActionBank.cpp +++ b/transition_machine/src/ActionBank.cpp @@ -370,7 +370,6 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na ba.data += "+"+std::to_string(s-b0); } } - } if (rootIndex == -1)