From 20aef337ae74a264d6fd10bc841e3c624f72f4af Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@etu.univ-amu.fr>
Date: Mon, 17 Dec 2018 14:18:50 +0100
Subject: [PATCH] Add more information to the error analysis and move it to macaon_decode

---
 decoder/CMakeLists.txt                    |   1 +
 decoder/src/Decoder.cpp                   |  44 ++++++---
 decoder/src/macaon_decode.cpp             |  11 ++-
 error_correction/src/Error.cpp            | 110 ++++++++++++++++++++--
 maca_common/include/ProgramParameters.hpp |   2 +
 maca_common/src/ProgramParameters.cpp     |   2 +
 6 files changed, 148 insertions(+), 22 deletions(-)

diff --git a/decoder/CMakeLists.txt b/decoder/CMakeLists.txt
index 4542dfb..e93aa50 100644
--- a/decoder/CMakeLists.txt
+++ b/decoder/CMakeLists.txt
@@ -4,6 +4,7 @@ add_executable(macaon_decode src/macaon_decode.cpp)
 target_link_libraries(macaon_decode transition_machine)
 target_link_libraries(macaon_decode decoder)
 target_link_libraries(macaon_decode ${Boost_PROGRAM_OPTIONS_LIBRARY})
+target_link_libraries(macaon_decode errors)
 install(TARGETS macaon_decode DESTINATION bin)
 
 #compiling library
diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp
index 52ca3dc..a24ed9c 100644
--- a/decoder/src/Decoder.cpp
+++ b/decoder/src/Decoder.cpp
@@ -1,5 +1,6 @@
 #include "Decoder.hpp"
 #include "util.hpp"
+#include "Error.hpp"
 
 Decoder::Decoder(TransitionMachine & tm, Config & config)
 : tm(tm), config(config)
@@ -11,6 +12,8 @@ void Decoder::decode()
   float entropyAccumulator = 0.0;
   int nbActionsInSequence = 0;
   bool justFlipped = false;
+  Errors errors;
+  errors.newSequence();
   while (!config.isFinal())
   {
     TransitionMachine::State * currentState = tm.getCurrentState();
@@ -61,6 +64,16 @@ void Decoder::decode()
       }
     }
 
+    if (classifier->needsTrain() && ProgramParameters::errorAnalysis && (classifier->name == ProgramParameters::classifierName || ProgramParameters::classifierName.empty()))
+    {
+      auto zeroCostActions = classifier->getZeroCostActions(config);
+      std::string oAction = zeroCostActions[0];
+      for (auto & s : zeroCostActions)
+        if (action->name == s)
+          oAction = s;
+      errors.add({action->name, oAction, weightedActions});
+    }
+
     action->apply(config);
 
     TransitionMachine::Transition * transition = tm.getTransition(predictedAction);
@@ -71,27 +84,28 @@ void Decoder::decode()
     float entropy = Classifier::computeEntropy(weightedActions);
     config.addToEntropyHistory(entropy);
 
-    if (ProgramParameters::printEntropy)
-    {
-      nbActionsInSequence++;
-
-      entropyAccumulator += entropy;
+    nbActionsInSequence++;
+    entropyAccumulator += entropy;
 
-      if (config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] != ProgramParameters::sequenceDelimiter)
-        justFlipped = false;
+    if (config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] != ProgramParameters::sequenceDelimiter)
+      justFlipped = false;
 
-      if ((config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] == ProgramParameters::sequenceDelimiter && !justFlipped))
-      {
-        justFlipped = true;
-        entropyAccumulator /= nbActionsInSequence;
-        nbActionsInSequence = 0;
+    if ((config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] == ProgramParameters::sequenceDelimiter && !justFlipped))
+    {
+      justFlipped = true;
+      errors.newSequence();
+      entropyAccumulator /= nbActionsInSequence;
+      nbActionsInSequence = 0;
+      if (ProgramParameters::printEntropy)
         fprintf(stderr, "Entropy : %.2f\n", entropyAccumulator);
-        entropyAccumulator = 0.0;
-      }
+      entropyAccumulator = 0.0;
     }
 
   }
 
-  config.printAsOutput(stdout);
+  if (ProgramParameters::errorAnalysis)
+    errors.printStats();
+  else
+    config.printAsOutput(stdout);
 }
 
diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp
index 3a4c296..d0e1e36 100644
--- a/decoder/src/macaon_decode.cpp
+++ b/decoder/src/macaon_decode.cpp
@@ -52,7 +52,14 @@ po::options_description getOptionsDescription()
     ("lang", po::value<std::string>()->default_value("fr"),
       "Language you are working with");
 
-  desc.add(req).add(opt);
+  po::options_description analysis("Error analysis related options");
+  analysis.add_options()
+    ("errorAnalysis", "Print an analysis of errors")
+    ("meanEntropy", "Print the mean entropy for error types")
+    ("classifier", po::value<std::string>()->default_value(""),
+      "Name of the monitored classifier, if not specified monitor everyone");
+
+  desc.add(req).add(opt).add(analysis);
 
   return desc;
 }
@@ -111,6 +118,8 @@ int main(int argc, char * argv[])
   ProgramParameters::input = vm["input"].as<std::string>();
   ProgramParameters::mcdName = vm["mcd"].as<std::string>();
   ProgramParameters::debug = vm.count("debug") == 0 ? false : true;
+  ProgramParameters::errorAnalysis = vm.count("errorAnalysis") == 0 ? false : true;
+  ProgramParameters::meanEntropy = vm.count("meanEntropy") == 0 ? false : true;
   ProgramParameters::dicts = vm["dicts"].as<std::string>();
   ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? false : true;
   ProgramParameters::lang = vm["lang"].as<std::string>();
diff --git a/error_correction/src/Error.cpp b/error_correction/src/Error.cpp
index a99c5c5..c2f94cc 100644
--- a/error_correction/src/Error.cpp
+++ b/error_correction/src/Error.cpp
@@ -82,25 +82,55 @@ void Errors::printStats()
 {
   unsigned int minDistanceToCheck = 1;
   unsigned int maxDistanceToCheck = 5;
+  int window = 10;
   int nbErrorsToKeep = 10;
   std::map<std::string, int> nbErrorOccurencesByType;
+  std::map<std::string, int> nbFirstErrorOccurencesByType;
+  std::map<std::string, float> nbFirstErrorIntroduced;
   std::map<std::string, int> nbOccurencesByType;
   std::map<std::string, float> meanEntropyByType;
   std::map< std::string, std::vector<int> > distanceOfGoldByType;
   std::map< std::string, std::vector<float> > meanEntropyByDistanceByType;
+  std::map< std::string, std::vector<int> > distanceOfGoldByFirstType;
+  std::map< std::string, std::vector<float> > meanEntropyByDistanceByFirstType;
   int nbErrorsTotal = 0;
+  int nbFirstErrorsTotal = 0;
+  int nbActionsTotal = 0;
+
+  auto printLine = []()
+  {
+    for (int i = 0; i < 80; i++)
+      fprintf(stderr, "-");
+    fprintf(stderr, "\n");
+  };
 
   for (auto & sequence : sequences)
-    for (auto & error : sequence.getSequence())
+  {
+    bool firstErrorMet = false;
+    for (unsigned index = 0; index < sequence.getSequence().size(); index++)
     {
+      auto & error = sequence.getSequence()[index];
       nbOccurencesByType[error.getType()]++;
       meanEntropyByType[error.getType()] += error.getEntropy();
+      nbActionsTotal++;
       if (!error.isError())
       {
       }
       else
       {
         nbErrorOccurencesByType[error.getType()]++;
+        if (!firstErrorMet)
+        {
+          nbFirstErrorOccurencesByType[error.getType()]++;
+          nbFirstErrorsTotal++;
+          for (unsigned int i = index+1; i < sequence.getSequence().size(); i++)
+            if (sequence.getSequence()[i].isError())
+            {
+              if ((int)(i - index) > window && window)
+                break;
+              nbFirstErrorIntroduced[error.getType()] += 1;
+            }
+        }
         for (unsigned int i = minDistanceToCheck; i <= maxDistanceToCheck; i++)
         {
           while (distanceOfGoldByType[error.getType()].size() < (unsigned)(i+1))
@@ -109,18 +139,37 @@ void Errors::printStats()
             meanEntropyByDistanceByType[error.getType()].emplace_back(0.0);
           distanceOfGoldByType[error.getType()][i] += error.goldWasAtDistance(i) ? 1 : 0;
           meanEntropyByDistanceByType[error.getType()][i] += error.goldWasAtDistance(i) ? error.getEntropy() : 0;
+
+          if (!firstErrorMet)
+          {
+            while (distanceOfGoldByFirstType[error.getType()].size() < (unsigned)(i+1))
+              distanceOfGoldByFirstType[error.getType()].emplace_back(0);
+            while (meanEntropyByDistanceByFirstType[error.getType()].size() < (unsigned)(i+1))
+              meanEntropyByDistanceByFirstType[error.getType()].emplace_back(0.0);
+            distanceOfGoldByFirstType[error.getType()][i] += error.goldWasAtDistance(i) ? 1 : 0;
+            meanEntropyByDistanceByFirstType[error.getType()][i] += error.goldWasAtDistance(i) ? error.getEntropy() : 0;
+          }
         }
         nbErrorsTotal++;
+        firstErrorMet = true;
       }
     }
+  }
 
   for (auto & it : meanEntropyByDistanceByType)
     for (unsigned int i = 0; i < it.second.size(); i++)
       it.second[i] /= distanceOfGoldByType[it.first][i];
 
+  for (auto & it : meanEntropyByDistanceByFirstType)
+    for (unsigned int i = 0; i < it.second.size(); i++)
+      it.second[i] /= distanceOfGoldByFirstType[it.first][i];
+
   for (auto & it : meanEntropyByType)
     it.second /= nbOccurencesByType[it.first];
 
+  for (auto & it : nbFirstErrorOccurencesByType)
+    nbFirstErrorIntroduced[it.first] /= it.second;
+
   std::vector< std::pair<std::string,int> > typesOccurences;
   for (auto & it : nbErrorOccurencesByType)
     typesOccurences.emplace_back(std::pair<std::string,int>(it.first,it.second));
@@ -131,19 +180,23 @@ void Errors::printStats()
     return a.second > b.second;
   });
 
-  typesOccurences.resize(nbErrorsToKeep);
+  typesOccurences.resize(std::min(nbErrorsToKeep, (int)typesOccurences.size()));
 
+  fprintf(stderr, "%.2f%% of predicted actions where correct (%d / %d)\n",
+      100.0*(nbActionsTotal-nbErrorsTotal)/nbActionsTotal, nbActionsTotal-nbErrorsTotal,nbActionsTotal);
+  fprintf(stderr, "Format : Predicted->Gold\n");
+ 
   std::vector< std::vector<std::string> > columns;
   columns.clear();
-  columns.resize(5);
- 
+  columns.resize(ProgramParameters::meanEntropy ? 5 : 4);
   for (auto & it : typesOccurences)
   {
     columns[0].emplace_back(it.first);
     columns[1].emplace_back("= " + float2str(it.second*100.0/nbErrorsTotal,"%.2f%%"));
     columns[2].emplace_back("of errors");
     columns[3].emplace_back("("+std::to_string(it.second) + " / " + std::to_string(nbErrorsTotal) + ")");
-    columns[4].emplace_back("mean entropy : " + float2str(meanEntropyByType[it.first], "%.2f"));
+    if (ProgramParameters::meanEntropy)
+      columns[4].emplace_back("mean entropy : " + float2str(meanEntropyByType[it.first], "%.2f"));
 
     for (unsigned int dist = minDistanceToCheck; dist <= maxDistanceToCheck; dist++)
     {
@@ -151,7 +204,51 @@ void Errors::printStats()
       columns[1].emplace_back(std::to_string(dist));
       columns[2].emplace_back(float2str(distanceOfGoldByType[it.first][dist]*100.0/nbErrorOccurencesByType[it.first],"%.2f%%"));
       columns[3].emplace_back("of the time");
-    columns[4].emplace_back("with mean entropy : " + float2str(meanEntropyByDistanceByType[it.first][dist], "%.2f"));
+      if (ProgramParameters::meanEntropy)
+        columns[4].emplace_back("mean entropy : " + float2str(meanEntropyByDistanceByType[it.first][dist], "%.2f"));
+    }
+
+    for (auto & col : columns)
+      col.emplace_back("");
+  }
+
+  printLine();
+  printColumns(stderr, columns, 1);
+  printLine();
+
+  std::vector< std::pair<std::string,int> > typesFirstOccurences;
+  for (auto & it : nbFirstErrorOccurencesByType)
+    typesFirstOccurences.emplace_back(std::pair<std::string,int>(it.first,it.second));
+
+  std::sort(typesFirstOccurences.begin(), typesFirstOccurences.end(),
+  [](const std::pair<std::string,int> & a, const std::pair<std::string,int> & b)
+  {
+    return a.second > b.second;
+  });
+
+  typesFirstOccurences.resize(std::min(nbErrorsToKeep, (int)typesFirstOccurences.size()));
+
+  columns.clear();
+  columns.resize(ProgramParameters::meanEntropy ? 6 : 5);
+  for (auto & it : typesFirstOccurences)
+  {
+    columns[0].emplace_back(it.first);
+    columns[1].emplace_back("= " + float2str(it.second*100.0/nbFirstErrorsTotal,"%.2f%%"));
+    columns[2].emplace_back("of first errors");
+    columns[3].emplace_back("("+std::to_string(it.second) + " / " + std::to_string(nbFirstErrorsTotal) + ")");
+    columns[4].emplace_back("introduces " + float2str(nbFirstErrorIntroduced[it.first],"%.2f errors"));
+    if (ProgramParameters::meanEntropy)
+      columns[5].emplace_back("mean entropy : " + float2str(meanEntropyByType[it.first], "%.2f"));
+
+    for (unsigned int dist = minDistanceToCheck; dist <= maxDistanceToCheck; dist++)
+    {
+      columns[0].emplace_back("    Gold at distance");
+      columns[1].emplace_back(std::to_string(dist));
+      columns[2].emplace_back(float2str(distanceOfGoldByFirstType[it.first][dist]*100.0/nbFirstErrorOccurencesByType[it.first],"%.2f%%"));
+      columns[3].emplace_back("of the time");
+      columns[4].emplace_back("");
+      if (ProgramParameters::meanEntropy)
+        columns[5].emplace_back("mean entropy : " + float2str(meanEntropyByDistanceByFirstType[it.first][dist], "%.2f"));
     }
 
     for (auto & col : columns)
@@ -159,5 +256,6 @@ void Errors::printStats()
   }
 
   printColumns(stderr, columns, 1);
+  printLine();
 }
 
diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp
index a749f9d..ddd9dfa 100644
--- a/maca_common/include/ProgramParameters.hpp
+++ b/maca_common/include/ProgramParameters.hpp
@@ -55,6 +55,8 @@ struct ProgramParameters
   static int batchSize;
   static std::string loss;
   static std::string dicts;
+  static bool errorAnalysis;
+  static bool meanEntropy;
   static std::map<std::string,std::string> featureModelByClassifier;
 
   private :
diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp
index 41930ad..eb32e68 100644
--- a/maca_common/src/ProgramParameters.cpp
+++ b/maca_common/src/ProgramParameters.cpp
@@ -42,6 +42,8 @@ bool ProgramParameters::randomEmbeddings;
 bool ProgramParameters::randomParameters;
 bool ProgramParameters::printEntropy;
 bool ProgramParameters::printTime;
+bool ProgramParameters::errorAnalysis;
+bool ProgramParameters::meanEntropy;
 int ProgramParameters::iterationSize;
 int ProgramParameters::nbTrain;
 std::string ProgramParameters::sequenceDelimiterTape;
-- 
GitLab