From 69624ae5b4f6637827e9fc0354cadbf1bf90239f Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 21 Feb 2019 15:38:50 +0100
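Subject: [PATCH] Added dictCapacity as a program argument

The capacity of each Dict is no longer the compile-time constant
Dict::MAX_CAPACITY (200000). It is now read from the new dictCapacity
option declared in macaon_train.cpp and macaon_decode.cpp (default
30000), stored in ProgramParameters::dictCapacity, and used to size the
dynet lookup parameters created by Dict::init and Dict::initFromFile.
Dict::addEntry now prints an error and exits when a Dict reaches this
capacity.

Also add traces guarded by ProgramParameters::debug to the Trainer
(dev scoring, end of epoch, model saving) and drop a stray blank line
in ActionBank.cpp.
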
---
 decoder/src/macaon_decode.cpp             |  3 +++
 maca_common/include/Dict.hpp              |  4 ----
 maca_common/include/ProgramParameters.hpp |  1 +
 maca_common/src/Dict.cpp                  | 12 +++++++++---
 maca_common/src/ProgramParameters.cpp     |  1 +
 trainer/src/Trainer.cpp                   | 19 +++++++++++++++++++
 trainer/src/macaon_train.cpp              |  3 +++
 transition_machine/src/ActionBank.cpp     |  1 -
 8 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp
index 3288568..1be61fd 100644
--- a/decoder/src/macaon_decode.cpp
+++ b/decoder/src/macaon_decode.cpp
@@ -51,6 +51,8 @@ po::options_description getOptionsDescription()
       "For each state of the Config, show its feature representation")
     ("readSize", po::value<int>()->default_value(0),
       "The number of lines of input that will be read and stored in memory at once.")
+    ("dictCapacity", po::value<int>()->default_value(30000),
+      "The maximal size of each Dict (number of differents embeddings).")
     ("interactive", po::value<bool>()->default_value(true),
       "Is the shell interactive ? Display advancement informations")
     ("lang", po::value<std::string>()->default_value("fr"),
@@ -149,6 +151,7 @@ int main(int argc, char * argv[])
   ProgramParameters::readSize = vm["readSize"].as<int>();
   if (ProgramParameters::readSize == 0)
     ProgramParameters::readSize = ProgramParameters::tapeSize;
+  ProgramParameters::dictCapacity = vm["dictCapacity"].as<int>();
   ProgramParameters::beamSize = vm["beamSize"].as<int>();
   ProgramParameters::nbChilds = vm["nbChilds"].as<int>();
   ProgramParameters::optimizer = "none";
diff --git a/maca_common/include/Dict.hpp b/maca_common/include/Dict.hpp
index db95ee2..f783873 100644
--- a/maca_common/include/Dict.hpp
+++ b/maca_common/include/Dict.hpp
@@ -95,10 +95,6 @@ class Dict
 
   private :
 
-  /// @brief The maximum number of entry a Dict can hold.
-  ///
-  /// This limit exists because the dynet LookupParameter associed with the Dict is not dynamic and must come with a fixed finite size.
-  static constexpr unsigned int MAX_CAPACITY = 200000;
   /// @brief The dimension of each vector of this Dict (in number of float).
   int dimension;
   /// @brief A storage that map every string entry of this Dict to its index as a lookup parameter.
diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp
index 9f30b7e..675843d 100644
--- a/maca_common/include/ProgramParameters.hpp
+++ b/maca_common/include/ProgramParameters.hpp
@@ -66,6 +66,7 @@ struct ProgramParameters
   static int tapeSize;
   static int devTapeSize;
   static int readSize;
+  static int dictCapacity;
   static bool printOutputEntropy;
 
   private :
diff --git a/maca_common/src/Dict.cpp b/maca_common/src/Dict.cpp
index d974ca9..37aee0d 100644
--- a/maca_common/src/Dict.cpp
+++ b/maca_common/src/Dict.cpp
@@ -77,7 +77,7 @@ void Dict::init(dynet::ParameterCollection & pc)
   }
 
   isInit = true;
-  this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension});
+  this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension});
   addEntry(nullValueStr);
   addEntry(unknownValueStr);
 }
@@ -125,7 +125,7 @@ void Dict::initFromFile(dynet::ParameterCollection & pc)
     }
 
     ftVector.reset(new fasttext::Vector(dimension));
-    this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension});
+    this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension});
   }
 
   // If policy is FromZero, we don't need to read the current entries
@@ -140,7 +140,7 @@ void Dict::initFromFile(dynet::ParameterCollection & pc)
 
   if (readIndex == -1) // No parameters to read
   {
-    this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension});
+    this->lookupParameter = pc.add_lookup_parameters(ProgramParameters::dictCapacity, {(unsigned int)dimension});
     addEntry(nullValueStr);
     addEntry(unknownValueStr);
     return;
@@ -361,6 +361,12 @@ unsigned int Dict::addEntry(const std::string & s)
   auto index = str2index.size();
   str2index.emplace(s, index);
 
+  if ((int)str2index.size() >= ProgramParameters::dictCapacity)
+  {
+    fprintf(stderr, "ERROR (%s) : Dict %s of maximal capacity %d is full. Aborting.\n", ERRINFO, name.c_str(), ProgramParameters::dictCapacity);
+    exit(1);
+  }
+
   if(mode == Mode::OneHot)
   {
     if(oneHotIndex >= dimension)
diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp
index 6e5b999..593e22c 100644
--- a/maca_common/src/ProgramParameters.cpp
+++ b/maca_common/src/ProgramParameters.cpp
@@ -61,4 +61,5 @@ int ProgramParameters::tapeSize;
 int ProgramParameters::devTapeSize;
 int ProgramParameters::readSize;
 bool ProgramParameters::printOutputEntropy;
+int ProgramParameters::dictCapacity;
 
diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp
index dc00a0e..be1de41 100644
--- a/trainer/src/Trainer.cpp
+++ b/trainer/src/Trainer.cpp
@@ -140,7 +140,13 @@ void Trainer::computeScoreOnDev()
     }
   }
 
+  if (ProgramParameters::debug)
+    fprintf(stderr, "Dev Config is final\n");
+
   TI.computeDevScores();
+
+  if (ProgramParameters::debug)
+    fprintf(stderr, "End of %s\n", __func__);
 }
 
 void Trainer::train()
@@ -320,6 +326,9 @@ void Trainer::train()
       }
     }
 
+    if (ProgramParameters::debug)
+      fprintf(stderr, "Config is final\n");
+
     if (ProgramParameters::iterationSize == -1)
     {
       printScoresAndSave(stderr);
@@ -329,6 +338,9 @@ void Trainer::train()
       if (TI.getEpoch() > ProgramParameters::nbIter)
         break;
     }
+
+    if (ProgramParameters::debug)
+      fprintf(stderr, "End of epoch\n");
   }
 }
 
@@ -342,10 +354,17 @@ void Trainer::printScoresAndSave(FILE * output)
   for (auto * cla : classifiers)
     if (TI.mustSave(cla->name))
     {
+      if (ProgramParameters::debug)
+        fprintf(stderr, "Saving %s...", cla->name.c_str());
       cla->save(ProgramParameters::expPath + cla->name + ".model");
       Dict::saveDicts(ProgramParameters::expPath, cla->name);
+      if (ProgramParameters::debug)
+        fprintf(stderr, "Done !\n");
     }
 
   TI.printScores(output);
+
+   if (ProgramParameters::debug)
+    fprintf(stderr, "End of %s\n", __func__); 
 }
 
diff --git a/trainer/src/macaon_train.cpp b/trainer/src/macaon_train.cpp
index dd77570..325ef72 100644
--- a/trainer/src/macaon_train.cpp
+++ b/trainer/src/macaon_train.cpp
@@ -79,6 +79,8 @@ po::options_description getOptionsDescription()
       "The value of the token that act as a delimiter for sequences")
     ("batchSize", po::value<int>()->default_value(50),
       "The size of each minibatch (in number of taining examples)")
+    ("dictCapacity", po::value<int>()->default_value(30000),
+      "The maximal size of each Dict (number of differents embeddings).")
     ("printTime", "Print time on stderr")
     ("shuffle", po::value<bool>()->default_value(true),
       "Shuffle examples after each iteration");
@@ -268,6 +270,7 @@ int main(int argc, char * argv[])
   ProgramParameters::nbIter = vm["nbiter"].as<int>();
   ProgramParameters::seed = vm["seed"].as<int>();
   ProgramParameters::batchSize = vm["batchSize"].as<int>();
+  ProgramParameters::dictCapacity = vm["dictCapacity"].as<int>();
   ProgramParameters::nbTrain = vm["nbTrain"].as<int>();
   ProgramParameters::removeDuplicates = vm["duplicates"].as<bool>();
   ProgramParameters::interactive = vm["interactive"].as<bool>();
diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp
index a423f8f..4737d48 100644
--- a/transition_machine/src/ActionBank.cpp
+++ b/transition_machine/src/ActionBank.cpp
@@ -370,7 +370,6 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
               ba.data += "+"+std::to_string(s-b0);
             }
           }
-
         }
 
         if (rootIndex == -1)
-- 
GitLab