From 0daac76c2ca564d507a8e32b8bf6f9169fe8dd22 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Tue, 24 Sep 2019 16:26:05 +0200
Subject: [PATCH] UD output for tokenizer and tagger

---
 decoder/src/Decoder.cpp               |  15 +-
 trainer/src/TrainInfos.cpp            |  10 +-
 trainer/src/Trainer.cpp               |  21 ++-
 transition_machine/include/Config.hpp |  16 +--
 transition_machine/src/ActionBank.cpp |   2 +
 transition_machine/src/BD.cpp         |  11 --
 transition_machine/src/Classifier.cpp |   6 +-
 transition_machine/src/Config.cpp     | 190 ++++++++++++--------------
 8 files changed, 131 insertions(+), 140 deletions(-)

diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp
index fec7ae7..bfca7e9 100644
--- a/decoder/src/Decoder.cpp
+++ b/decoder/src/Decoder.cpp
@@ -64,13 +64,13 @@ void printAdvancement(Config & config, float currentSpeed, int nbActionsCutoff)
   {
     int totalSize = ProgramParameters::tapeSize;
     int steps = config.getHead();
-    if (steps && (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff))
+    if (ProgramParameters::rawInput)
     {
-      if (ProgramParameters::rawInput)
-        fprintf(stderr, "Decode : %.2f%%  speed : %s actions/s\r", 100.0*config.rawInputHeadIndex/config.rawInput.size(), int2humanStr((int)currentSpeed).c_str());
-      else
-        fprintf(stderr, "Decode : %.2f%%  speed : %s actions/s\r", 100.0*steps/totalSize, int2humanStr((int)currentSpeed).c_str());
+      totalSize = config.rawInput.size();
+      steps = config.rawInputHeadIndex;
     }
+    if (steps && (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff)) // steps/totalSize are already in raw-input units when rawInput is set
+      fprintf(stderr, "Decode : %.2f%%  speed : %s actions/s\r", 100.0*steps/totalSize, int2humanStr((int)currentSpeed).c_str()); // use the selected progress pair, not rawInput unconditionally
   }
 }
 
@@ -183,12 +183,13 @@ void computeAndRecordEntropy(Config & config, Classifier::WeightedActions & weig
 
 void applyActionAndTakeTransition(TransitionMachine & tm, const std::string & actionName, Config & config)
 {
-    if (ProgramParameters::debug)
-      fprintf(stderr, "Applying action=<%s>\n", actionName.c_str());
+
     Action * action = tm.getCurrentClassifier()->getAction(actionName);
     TransitionMachine::Transition * transition = tm.getTransition(actionName);
     action->setInfos(tm.getCurrentClassifier()->name);
     config.addToActionsHistory(tm.getCurrentClassifier()->name, actionName, 0);
+    if (ProgramParameters::debug) // debug trace, emitted right before the action is applied
+      fprintf(stderr, "Applying action=<%s>\n", action->name.c_str()); // prints the looked-up Action's own name rather than the requested actionName
     action->apply(config);
     tm.takeTransition(transition);
 }
diff --git a/trainer/src/TrainInfos.cpp b/trainer/src/TrainInfos.cpp
index 8677127..c866df7 100644
--- a/trainer/src/TrainInfos.cpp
+++ b/trainer/src/TrainInfos.cpp
@@ -158,15 +158,15 @@ void TrainInfos::computeTrainScores(Config & c)
   for (auto & it : topologyPrinted)
   {
     if (it.first == "Parser")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"GOV", "LABEL"}, 0, c.getHead()));
+      addTrainScore(it.first, computeScoreOnTapes(c, {"GOV", "LABEL"}, 0, c.getHead()-1));
     else if (it.first == "Tagger")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"POS"}, 0, c.getHead()));
+      addTrainScore(it.first, computeScoreOnTapes(c, {"POS"}, 0, c.getHead()-1));
     else if (it.first == "Tokenizer")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"FORM"}, 0, c.getHead()));
+      addTrainScore(it.first, computeScoreOnTapes(c, {"FORM"}, 0, c.getHead()-1));
     else if (it.first == "Morpho")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"MORPHO"}, 0, c.getHead()));
+      addTrainScore(it.first, computeScoreOnTapes(c, {"MORPHO"}, 0, c.getHead()-1));
     else if (it.first == "Lemmatizer_Rules")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"LEMMA"}, 0, c.getHead()));
+      addTrainScore(it.first, computeScoreOnTapes(c, {"LEMMA"}, 0, c.getHead()-1));
     else if (split(it.first, '_')[0] == "Error")
       addTrainScore(it.first, 100.0);
     else
diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp
index dd9fb08..b917de5 100644
--- a/trainer/src/Trainer.cpp
+++ b/trainer/src/Trainer.cpp
@@ -87,6 +87,11 @@ void Trainer::computeScoreOnDev()
       {
         int totalSize = ProgramParameters::devTapeSize;
         int steps = devConfig->getHead();
+        if (devConfig->rawInputHeadIndex > 0)
+        {
+          totalSize = devConfig->rawInput.size();
+          steps = devConfig->rawInputHeadIndex;
+        }
         if (steps && (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff))
         {
           fprintf(stderr, "                                                      \r");
@@ -197,7 +202,7 @@ void Trainer::resetAndShuffle()
   trainConfig.reset();
 
   if(ProgramParameters::shuffleExamples)
-    trainConfig.shuffle(ProgramParameters::sequenceDelimiterTape, ProgramParameters::sequenceDelimiter);
+    trainConfig.shuffle();
 }
 
 void Trainer::doStepNoTrain()
@@ -233,6 +238,11 @@ void Trainer::doStepTrain()
   {
     int totalSize = ProgramParameters::iterationSize == -1 ? ProgramParameters::tapeSize : ProgramParameters::iterationSize;
     int steps = ProgramParameters::iterationSize == -1 ? trainConfig.getHead() : nbSteps;
+    if (trainConfig.rawInputHeadIndex > 0)
+    {
+      totalSize = trainConfig.rawInput.size();
+      steps = trainConfig.rawInputHeadIndex;
+    }
     if (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff)
     {
       fprintf(stderr, "                                                      \r");
@@ -270,7 +280,14 @@ void Trainer::doStepTrain()
     }
   
     if (oAction.empty())
+    {
       oAction = tm.getCurrentClassifier()->getDefaultAction();
+      if(!oAction.empty() && !tm.getCurrentClassifier()->getAction(oAction)->appliable(trainConfig)) // getDefaultAction() yields "" without a default action: never look that name up
+        oAction.clear(); // a non-appliable default is discarded so the fallback below can take over
+    }
+
+    if (oAction.empty())
+      oAction = pAction;
   
     if (oAction.empty())
     {
@@ -544,6 +561,8 @@ void Trainer::train()
 
 void Trainer::printScoresAndSave(FILE * output)
 {
+  trainConfig.transformSymbol("", "_");
+  devConfig->transformSymbol("", "_");
   TI.computeTrainScores(trainConfig);
   computeScoreOnDev();
   TI.computeMustSaves();
diff --git a/transition_machine/include/Config.hpp b/transition_machine/include/Config.hpp
index f988f2b..9ba2cb8 100644
--- a/transition_machine/include/Config.hpp
+++ b/transition_machine/include/Config.hpp
@@ -73,6 +73,7 @@ class Config
     /// @param relativeIndex The index of the cell relatively to the head.
     /// @param elem The new content of the cell.
     void setHyp(int relativeIndex, const std::string & elem);
+    int getHead();
     /// @brief Return true if the head of this tape is on the last cell.
     ///
     /// @return True if the head of this tape is on the last cell.
@@ -189,6 +190,8 @@ class Config
   int rawInputHeadIndex;
   /// @brief Index of current word in the sentence, as in conll format.
   int currentWordIndex;
+  /// @brief The conll input as it was read.
+  std::vector< std::vector<std::string> > inputContent;
 
   public :
 
@@ -221,6 +224,7 @@ class Config
   Tape & getTapeByInputCol(int col);
   /// @brief Read a part of a formated input file (mcf) and use it to fill the tapes.
   void readInput();
+  void fillTapesWithInput();
   /// @brief Print the Config for debug purposes.
   ///
   /// @param output Where to print.
@@ -274,13 +278,8 @@ class Config
   ///
   /// @return The history of entropies of the current state in the TransitionMachine.
   LimitedStack<float> & getCurrentStateEntropyHistory();
-  /// @brief Shuffle the segments of the Config.
-  ///
-  /// For instance if you call shuffle("EOS", "1");\n 
-  /// Sentences will be preserved, but their order will be shuffled.
-  /// @param delimiterTape The tape containing the delimiters of segments.
-  /// @param delimiter The delimiters of segments.
-  void shuffle(const std::string & delimiterTape, const std::string & delimiter);
+  /// @brief Shuffle the Config per sequences.
+  void shuffle();
   /// @brief Get element from the stack at depth index.
   ///
   /// @param index The depth of the requested element.
@@ -352,8 +351,6 @@ class Config
   ///
   /// @return True if the head is at the end of the tapes.
   bool endOfTapes() const;
-  /// @brief Update rawInput according to the tape TEXT.
-  void updateRawInput();
   /// @brief Set the output file.
   void setOutputFile(FILE * outputFile);
   /// @brief Print the cells that have not been printed.
@@ -367,6 +364,7 @@ class Config
   void printColumnInfos(unsigned int index);
   void addToActionsHistory(std::string & state, const std::string & action, int cost);
   std::vector< std::pair<std::string, int> > & getActionsHistory(std::string & state);
+  void transformSymbol(const std::string & from, const std::string & to);
 };
 
 #endif
diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp
index 26d167e..7bb9819 100644
--- a/transition_machine/src/ActionBank.cpp
+++ b/transition_machine/src/ActionBank.cpp
@@ -321,6 +321,8 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
   }
   else if(std::string(b1) == "ADDCHARTOWORD")
   {
+    sequence.emplace_back(increaseTapesIfNeeded(0));
+
     auto apply = [](Config & c, Action::BasicAction &)
       {addCharToBuffer(c, "FORM", 0);};
     auto undo = [](Config & c, Action::BasicAction &)
diff --git a/transition_machine/src/BD.cpp b/transition_machine/src/BD.cpp
index 957c01a..b9eb9c5 100644
--- a/transition_machine/src/BD.cpp
+++ b/transition_machine/src/BD.cpp
@@ -40,17 +40,6 @@ BD::BD(const std::string & BDfilename, const std::string & MCDfilename)
       exit(1);
     }
 
-    if(mcdCol2Str.find(col) != mcdCol2Str.end())
-    {
-      fprintf(stderr, "ERROR (%s) : MCD column \'%d\' already exists. Aborting.\n", ERRINFO, col);
-      exit(1);
-    }
-    if(mcdStr2Col.find(name) != mcdStr2Col.end())
-    {
-      fprintf(stderr, "ERROR (%s) : MCD column \'%s\' already exists. Aborting.\n", ERRINFO, name);
-      exit(1);
-    }
-
     mcdCol2Str[col] = name;
     mcdStr2Col[name] = col;
   }
diff --git a/transition_machine/src/Classifier.cpp b/transition_machine/src/Classifier.cpp
index f68f1cc..25b16a6 100644
--- a/transition_machine/src/Classifier.cpp
+++ b/transition_machine/src/Classifier.cpp
@@ -279,14 +279,16 @@ std::vector<std::string> Classifier::getZeroCostActions(Config & config)
       result.emplace_back(a.name);
 
   if (result.empty() && as->hasDefaultAction)
-    result.emplace_back(as->getDefaultAction()->name);
+    if (as->getDefaultAction()->appliable(config))
+      result.emplace_back(as->getDefaultAction()->name);
 
   return result;
 }
 
 std::string Classifier::getDefaultAction() const
 {
-  return as->getDefaultAction()->name;
+  if (as->hasDefaultAction) // only dereference the default action when the action set declares one
+    return as->getDefaultAction()->name; // otherwise fall through to the empty name returned below
 
   return std::string();
 }
diff --git a/transition_machine/src/Config.cpp b/transition_machine/src/Config.cpp
index 2871956..cd85b31 100644
--- a/transition_machine/src/Config.cpp
+++ b/transition_machine/src/Config.cpp
@@ -20,6 +20,7 @@ Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(H
   for(int i = 0; i < bd.getNbLines(); i++)
     tapes.emplace_back(bd.getNameOfLine(i), bd.lineIsKnown(i));
   this->totalEntropy = 0;
+  readInput();
 }
 
 Config::Config(const Config & other) : bd(other.bd), hashHistory(other.hashHistory), pastActions(other.pastActions)
@@ -94,46 +95,88 @@ void Config::readInput()
   FILE * fd = file->getDescriptor();
 
   char buffer[100000];
-  std::vector<std::string> cols;
-  unsigned int usualColsSize = 0;
 
-  int toRead = ProgramParameters::readSize;
-  int haveRead = 0;
+  int lineIndex = 0;
 
-  while(haveRead < toRead && fscanf(fd, "%[^\n]\n", buffer) == 1)
+  while (fscanf(fd, "%[^\n]\n", buffer) == 1)
   {
+    lineIndex++;
+
     if (!utf8::is_valid(buffer, buffer+std::strlen(buffer)))
     {
-      fprintf(stderr, "ERROR (%s) : input (%s) line %d is not toally utf-8 formated. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size());
+      fprintf(stderr, "ERROR (%s) : input (%s) line %d is not toally utf-8 formated. Aborting.\n", ERRINFO, inputFilename.c_str(), lineIndex);
       exit(1);
     }
 
-    cols = split(buffer, '\t');
-    if (!usualColsSize)
-      usualColsSize = cols.size();
+    if (std::strlen(buffer) <= 3)
+      continue;
 
-    if (cols.size() != usualColsSize)
-    {
-      fprintf(stderr, "ERROR (%s) : input (%s) line %d has %lu columns instead of %u. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), cols.size(), usualColsSize);
-      exit(1);
-    }
+    if (split(buffer, '=')[0] == "# sent_id ")
+      inputContent.emplace_back();
+    else if (buffer[0] == '#' && split(buffer, '=')[0] != "# text ")
+      continue;
+
+    inputContent.back().emplace_back(buffer);
+  }
 
-    printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex());
+  inputAllRead = true;
+  fillTapesWithInput();
+}
+
+void Config::fillTapesWithInput()
+{
+  rawInput = "";
+  std::vector<std::string> cols;
+  unsigned int usualColsSize = 0;
 
-    for(unsigned int i = 0; i < cols.size(); i++)
-      if(bd.hasLineOfInputCol(i))
+  for (auto & sentence : inputContent)
+  {
+    for (unsigned int wordIndex = 0; wordIndex < sentence.size(); wordIndex++)
+    {
+      auto & word = sentence[wordIndex];
+      if (split(word, '=')[0] == "# text ")
       {
-        auto & tape = getTapeByInputCol(i);
+        std::string prefix = rawInput.empty() ? "" : " "; // default separator between two sentence texts
+        if (choiceWithProbability(0.3)) // randomly replace the separator by a newline...
+          prefix = "\n";
+        else if (choiceWithProbability(0.3)) // ...or by nothing at all
+          prefix = "";
+        rawInput += prefix + (word.size() > 9 ? word.substr(9) : std::string()); // skip the 9-char "# text = " header; a bare "# text =" contributes no text instead of reading past the end
+        continue;
+      }
+      else if (word[0] == '#')
+        continue;
 
-        tape.addToRef(cols[i]);
-        tape.addToHyp("");
+      cols = split(word, '\t');
+      if (!usualColsSize)
+        usualColsSize = cols.size();
 
-        if (tape.getName() == ProgramParameters::tapeToMask)
-          if (choiceWithProbability(ProgramParameters::maskRate))
-            tape.maskIndex(tape.refSize()-1);
+      if (cols.size() != usualColsSize)
+      {
+        fprintf(stderr, "ERROR (%s) : input (%s) line %d has %lu columns instead of %u. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), cols.size(), usualColsSize);
+        exit(1);
       }
 
-    haveRead++;
+      for(unsigned int i = 0; i < cols.size(); i++)
+        if(bd.hasLineOfInputCol(i))
+        {
+          auto & tape = getTapeByInputCol(i);
+  
+          tape.addToRef(cols[i]);
+          tape.addToHyp("");
+  
+          if (tape.getName() == ProgramParameters::tapeToMask)
+            if (choiceWithProbability(ProgramParameters::maskRate))
+              tape.maskIndex(tape.refSize()-1);
+          if (tape.getName() == ProgramParameters::sequenceDelimiterTape)
+          {
+            fprintf(stderr, "ERROR (%s) : Tape \'%s\' must not be given as a column in the input since it's the sequence delimiter. Aborting.\n", ERRINFO, tape.getName().c_str());
+            exit(1);
+          }
+        }
+      getTape(ProgramParameters::sequenceDelimiterTape).addToRef(wordIndex == sentence.size()-1 ? ProgramParameters::sequenceDelimiter : "_");
+      getTape(ProgramParameters::sequenceDelimiterTape).addToHyp("");
+    }
   }
 
   // Making all tapes the same size
@@ -141,12 +184,6 @@ void Config::readInput()
   for(auto & tape : tapes)
     maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.refSize());
 
-  if (haveRead < toRead || tapes[0].size() == ProgramParameters::tapeSize)
-  {
-    printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex());
-    inputAllRead = true;
-  }
-
   for(auto & tape : tapes)
   {
     while(tape.refSize() < maxTapeSize)
@@ -155,15 +192,9 @@ void Config::readInput()
     while(tape.hypSize() < maxTapeSize)
       tape.addToHyp("");
 
-    if (inputAllRead)
-    {
-      tape.addToRef("0");
-      tape.addToHyp("");
-    }
+    tape.addToRef("0");
+    tape.addToHyp("");
   }
-
-  if (hasTape("TEXT"))
-    updateRawInput();
 }
 
 void Config::printForDebug(FILE * output)
@@ -252,7 +283,7 @@ void Config::printAsOutput(FILE * output, int dataIndex, int realIndex)
 
 void Config::moveHead(int mvt)
 {
-  if (head + mvt < tapes[0].size())
+  if (head + mvt <= tapes[0].size())
   {
     head += mvt;
 
@@ -316,15 +347,10 @@ void Config::reset()
   stack.clear();
   stackHistory = -1;
 
-  inputAllRead = false;
   head = 0;
   rawInputHead = 0;
   rawInputHeadIndex = 0;
   currentWordIndex = 1;
-
-  file.reset();
-  while (tapes[0].size() < ProgramParameters::readSize*4 && !inputAllRead)
-    readInput();
 }
 
 const std::string & Config::Tape::operator[](int relativeIndex)
@@ -398,59 +424,11 @@ LimitedStack<float> & Config::getCurrentStateEntropyHistory()
   return entropyHistory.find(getCurrentStateName())->second;
 }
 
-void Config::shuffle(const std::string & delimiterTape, const std::string & delimiter)
+void Config::shuffle()
 {
-  struct Trio{unsigned int a; unsigned int b; unsigned int c; Trio(unsigned int a, unsigned int b, unsigned int c): a(a), b(b), c(c){}};
-  std::vector<Trio> delimiters;
-
-  if (delimiterTape == "0")
-  {
-    unsigned int previousIndex = 0;
-    for (int i = 0; i < tapes[0].refSize(); i++)
-    {
-      delimiters.emplace_back(previousIndex, i, delimiters.size());
-      previousIndex = i+1;
-    }
-  }
-  else
-  {
-    auto & tape = getTape(delimiterTape);
-    unsigned int previousIndex = 0;
-    for (int i = 0; i < tape.refSize(); i++)
-      if (tape.getRef(i-head) == delimiter)
-      {
-        delimiters.emplace_back(previousIndex, i, delimiters.size());
-        previousIndex = i+1;
-      }
-  }
-
-  if (delimiters.empty())
-  {
-    fprintf(stderr, "WARNING (%s) : Requested to shuffle based on tape \'%s\' with \'%s\' as a delimiter, but none has been found. Aborting.\n", ERRINFO, delimiterTape.c_str(), delimiter.c_str());
-    return;
-  }
-
-  std::pair<unsigned int, unsigned int> suffix = {delimiters.back().b+1, tapes[0].refSize()-1};
-
-  std::random_shuffle(delimiters.begin(), delimiters.end());
-
-  auto newTapes = tapes;
-
-  for (unsigned int tape = 0; tape < tapes.size(); tape++)
-  {
-    newTapes[tape].clearDataForCopy();
-
-    for (auto & delimiter : delimiters)
-      newTapes[tape].copyPart(tapes[tape], delimiter.a, delimiter.b+1);
-
-    if (suffix.first <= suffix.second)
-      newTapes[tape].copyPart(tapes[tape], suffix.first, suffix.second+1);
-  }
-
-  tapes = newTapes;
-
-  if (!rawInput.empty())
-    updateRawInput();
+  reset(); // rewinds the head, the raw-input cursors and the stack before rebuilding
+  std::random_shuffle(inputContent.begin(), inputContent.end()); // permutes whole sentences; NOTE(review): std::random_shuffle was removed in C++17
+  fillTapesWithInput(); // rebuilds rawInput and refills the tapes; NOTE(review): assumes reset() emptied the tapes - confirm
 }
 
 int Config::stackGetElem(int index) const
@@ -568,7 +546,7 @@ void Config::Tape::moveHead(int mvt)
 
 bool Config::endOfTapes() const
 {
-  return inputAllRead && tapes[0].headIsAtEnd();
+  return inputAllRead && (tapes[0].headIsAtEnd() || (!rawInput.empty() && rawInputHeadIndex >= (int)rawInput.size())); // an empty rawInput (no "# text" lines) must not report the end immediately (0 >= 0)
 }
 
 bool Config::Tape::headIsAtEnd() const
@@ -735,14 +713,16 @@ float Config::Tape::getScore(int from, int to)
   return 100.0*res / (1+to-from);
 }
 
-void Config::updateRawInput()
+int Config::Tape::getHead()
 {
-  rawInput = "";
-  auto & textTape = getTape("TEXT");
-  for (int i = 0; i < textTape.size(); i++)
-  {
-    if (textTape[i] != "_")
-      rawInput += (rawInput.empty() ? std::string("") : (choiceWithProbability(0.5) ? std::string(" ") : std::string("\n"))) + textTape[i];
-  }
+  return head;
+}
+
+void Config::transformSymbol(const std::string & from, const std::string & to) // Replace by `to` every hypothesis cell equal to `from`, on every tape.
+{
+  for (auto & tape : tapes)
+    for (int i = 0; i < tape.size(); i++) // i walks the cells of the tape from 0 to size()-1
+      if (tape.getHyp(i-tape.getHead()) == from) // the index is passed relative to the tape head, hence the offset
+        tape.setHyp(i-tape.getHead(), to); // setHyp takes a head-relative index as well
+}
 
-- 
GitLab