From 20aef337ae74a264d6fd10bc841e3c624f72f4af Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@etu.univ-amu.fr> Date: Mon, 17 Dec 2018 14:18:50 +0100 Subject: [PATCH] Added infos to error analysis, and moved it to macaon_decode --- decoder/CMakeLists.txt | 1 + decoder/src/Decoder.cpp | 44 ++++++--- decoder/src/macaon_decode.cpp | 11 ++- error_correction/src/Error.cpp | 110 ++++++++++++++++++++-- maca_common/include/ProgramParameters.hpp | 2 + maca_common/src/ProgramParameters.cpp | 2 + 6 files changed, 148 insertions(+), 22 deletions(-) diff --git a/decoder/CMakeLists.txt b/decoder/CMakeLists.txt index 4542dfb..e93aa50 100644 --- a/decoder/CMakeLists.txt +++ b/decoder/CMakeLists.txt @@ -4,6 +4,7 @@ add_executable(macaon_decode src/macaon_decode.cpp) target_link_libraries(macaon_decode transition_machine) target_link_libraries(macaon_decode decoder) target_link_libraries(macaon_decode ${Boost_PROGRAM_OPTIONS_LIBRARY}) +target_link_libraries(macaon_decode errors) install(TARGETS macaon_decode DESTINATION bin) #compiling library diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp index 52ca3dc..a24ed9c 100644 --- a/decoder/src/Decoder.cpp +++ b/decoder/src/Decoder.cpp @@ -1,5 +1,6 @@ #include "Decoder.hpp" #include "util.hpp" +#include "Error.hpp" Decoder::Decoder(TransitionMachine & tm, Config & config) : tm(tm), config(config) @@ -11,6 +12,8 @@ void Decoder::decode() float entropyAccumulator = 0.0; int nbActionsInSequence = 0; bool justFlipped = false; + Errors errors; + errors.newSequence(); while (!config.isFinal()) { TransitionMachine::State * currentState = tm.getCurrentState(); @@ -61,6 +64,16 @@ void Decoder::decode() } } + if (classifier->needsTrain() && ProgramParameters::errorAnalysis && (classifier->name == ProgramParameters::classifierName || ProgramParameters::classifierName.empty())) + { + auto zeroCostActions = classifier->getZeroCostActions(config); + std::string oAction = zeroCostActions[0]; + for (auto & s : zeroCostActions) + if (action->name == s) + oAction = s; + errors.add({action->name, oAction, weightedActions}); + } + action->apply(config); TransitionMachine::Transition * transition = tm.getTransition(predictedAction); @@ -71,27 +84,28 @@ void Decoder::decode() float entropy = Classifier::computeEntropy(weightedActions); config.addToEntropyHistory(entropy); - if (ProgramParameters::printEntropy) - { - nbActionsInSequence++; - - entropyAccumulator += entropy; + nbActionsInSequence++; + entropyAccumulator += entropy; - if (config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] != ProgramParameters::sequenceDelimiter) - justFlipped = false; + if (config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] != ProgramParameters::sequenceDelimiter) + justFlipped = false; - if ((config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] == ProgramParameters::sequenceDelimiter && !justFlipped)) - { - justFlipped = true; - entropyAccumulator /= nbActionsInSequence; - nbActionsInSequence = 0; + if ((config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] == ProgramParameters::sequenceDelimiter && !justFlipped)) + { + justFlipped = true; + errors.newSequence(); + entropyAccumulator /= nbActionsInSequence; + nbActionsInSequence = 0; + if (ProgramParameters::printEntropy) fprintf(stderr, "Entropy : %.2f\n", entropyAccumulator); - entropyAccumulator = 0.0; - } + entropyAccumulator = 0.0; } } - config.printAsOutput(stdout); + if (ProgramParameters::errorAnalysis) + errors.printStats(); + else + config.printAsOutput(stdout); } diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp index 3a4c296..d0e1e36 100644 --- a/decoder/src/macaon_decode.cpp +++ b/decoder/src/macaon_decode.cpp @@ -52,7 +52,14 @@ po::options_description getOptionsDescription() ("lang", po::value<std::string>()->default_value("fr"), "Language you are working with"); - desc.add(req).add(opt); + po::options_description analysis("Error analysis related options"); + analysis.add_options() + ("errorAnalysis", "Print an analysis of errors") + ("meanEntropy", "Print the mean entropy for error types") + ("classifier", po::value<std::string>()->default_value(""), + "Name of the monitored classifier, if not specified monitor everyone"); + + desc.add(req).add(opt).add(analysis); return desc; } @@ -111,6 +118,8 @@ int main(int argc, char * argv[]) ProgramParameters::input = vm["input"].as<std::string>(); ProgramParameters::mcdName = vm["mcd"].as<std::string>(); ProgramParameters::debug = vm.count("debug") == 0 ? false : true; + ProgramParameters::errorAnalysis = vm.count("errorAnalysis") == 0 ? false : true; + ProgramParameters::meanEntropy = vm.count("meanEntropy") == 0 ? false : true; ProgramParameters::dicts = vm["dicts"].as<std::string>(); ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? false : true; ProgramParameters::lang = vm["lang"].as<std::string>(); diff --git a/error_correction/src/Error.cpp b/error_correction/src/Error.cpp index a99c5c5..c2f94cc 100644 --- a/error_correction/src/Error.cpp +++ b/error_correction/src/Error.cpp @@ -82,25 +82,55 @@ void Errors::printStats() { unsigned int minDistanceToCheck = 1; unsigned int maxDistanceToCheck = 5; + int window = 10; int nbErrorsToKeep = 10; std::map<std::string, int> nbErrorOccurencesByType; + std::map<std::string, int> nbFirstErrorOccurencesByType; + std::map<std::string, float> nbFirstErrorIntroduced; std::map<std::string, int> nbOccurencesByType; std::map<std::string, float> meanEntropyByType; std::map< std::string, std::vector<int> > distanceOfGoldByType; std::map< std::string, std::vector<float> > meanEntropyByDistanceByType; + std::map< std::string, std::vector<int> > distanceOfGoldByFirstType; + std::map< std::string, std::vector<float> > meanEntropyByDistanceByFirstType; int nbErrorsTotal = 0; + int nbFirstErrorsTotal = 0; + int nbActionsTotal = 0; + + auto printLine = []() + { + for (int i = 0; i < 80; i++) + fprintf(stderr, "-"); + fprintf(stderr, "\n"); + }; for (auto & sequence : sequences) - for (auto & error : sequence.getSequence()) + { + bool firstErrorMet = false; + for (unsigned index = 0; index < sequence.getSequence().size(); index++) { + auto & error = sequence.getSequence()[index]; nbOccurencesByType[error.getType()]++; meanEntropyByType[error.getType()] += error.getEntropy(); + nbActionsTotal++; if (!error.isError()) { } else { nbErrorOccurencesByType[error.getType()]++; + if (!firstErrorMet) + { + nbFirstErrorOccurencesByType[error.getType()]++; + nbFirstErrorsTotal++; + for (unsigned int i = index+1; i < sequence.getSequence().size(); i++) + if (sequence.getSequence()[i].isError()) + { + if ((int)(i - index) > window && window) + break; + nbFirstErrorIntroduced[error.getType()] += 1; + } + } for (unsigned int i = minDistanceToCheck; i <= maxDistanceToCheck; i++) { while (distanceOfGoldByType[error.getType()].size() < (unsigned)(i+1)) @@ -109,18 +139,37 @@ void Errors::printStats() meanEntropyByDistanceByType[error.getType()].emplace_back(0.0); distanceOfGoldByType[error.getType()][i] += error.goldWasAtDistance(i) ? 1 : 0; meanEntropyByDistanceByType[error.getType()][i] += error.goldWasAtDistance(i) ? error.getEntropy() : 0; + + if (!firstErrorMet) + { + while (distanceOfGoldByFirstType[error.getType()].size() < (unsigned)(i+1)) + distanceOfGoldByFirstType[error.getType()].emplace_back(0); + while (meanEntropyByDistanceByFirstType[error.getType()].size() < (unsigned)(i+1)) + meanEntropyByDistanceByFirstType[error.getType()].emplace_back(0.0); + distanceOfGoldByFirstType[error.getType()][i] += error.goldWasAtDistance(i) ? 1 : 0; + meanEntropyByDistanceByFirstType[error.getType()][i] += error.goldWasAtDistance(i) ? error.getEntropy() : 0; + } } nbErrorsTotal++; + firstErrorMet = true; } } + } for (auto & it : meanEntropyByDistanceByType) for (unsigned int i = 0; i < it.second.size(); i++) it.second[i] /= distanceOfGoldByType[it.first][i]; + for (auto & it : meanEntropyByDistanceByFirstType) + for (unsigned int i = 0; i < it.second.size(); i++) + it.second[i] /= distanceOfGoldByFirstType[it.first][i]; + for (auto & it : meanEntropyByType) it.second /= nbOccurencesByType[it.first]; + for (auto & it : nbFirstErrorOccurencesByType) + nbFirstErrorIntroduced[it.first] /= it.second; + std::vector< std::pair<std::string,int> > typesOccurences; for (auto & it : nbErrorOccurencesByType) typesOccurences.emplace_back(std::pair<std::string,int>(it.first,it.second)); @@ -131,19 +180,23 @@ void Errors::printStats() return a.second > b.second; }); - typesOccurences.resize(nbErrorsToKeep); + typesOccurences.resize(std::min(nbErrorsToKeep, (int)typesOccurences.size())); + fprintf(stderr, "%.2f%% of predicted actions where correct (%d / %d)\n", + 100.0*(nbActionsTotal-nbErrorsTotal)/nbActionsTotal, nbActionsTotal-nbErrorsTotal,nbActionsTotal); + fprintf(stderr, "Format : Predicted->Gold\n"); + std::vector< std::vector<std::string> > columns; columns.clear(); - columns.resize(5); - + columns.resize(ProgramParameters::meanEntropy ? 5 : 4); for (auto & it : typesOccurences) { columns[0].emplace_back(it.first); columns[1].emplace_back("= " + float2str(it.second*100.0/nbErrorsTotal,"%.2f%%")); columns[2].emplace_back("of errors"); columns[3].emplace_back("("+std::to_string(it.second) + " / " + std::to_string(nbErrorsTotal) + ")"); - columns[4].emplace_back("mean entropy : " + float2str(meanEntropyByType[it.first], "%.2f")); + if (ProgramParameters::meanEntropy) + columns[4].emplace_back("mean entropy : " + float2str(meanEntropyByType[it.first], "%.2f")); for (unsigned int dist = minDistanceToCheck; dist <= maxDistanceToCheck; dist++) { @@ -151,7 +204,51 @@ void Errors::printStats() columns[1].emplace_back(std::to_string(dist)); columns[2].emplace_back(float2str(distanceOfGoldByType[it.first][dist]*100.0/nbErrorOccurencesByType[it.first],"%.2f%%")); columns[3].emplace_back("of the time"); - columns[4].emplace_back("with mean entropy : " + float2str(meanEntropyByDistanceByType[it.first][dist], "%.2f")); + if (ProgramParameters::meanEntropy) + columns[4].emplace_back("mean entropy : " + float2str(meanEntropyByDistanceByType[it.first][dist], "%.2f")); + } + + for (auto & col : columns) + col.emplace_back(""); + } + + printLine(); + printColumns(stderr, columns, 1); + printLine(); + + std::vector< std::pair<std::string,int> > typesFirstOccurences; + for (auto & it : nbFirstErrorOccurencesByType) + typesFirstOccurences.emplace_back(std::pair<std::string,int>(it.first,it.second)); + + std::sort(typesFirstOccurences.begin(), typesFirstOccurences.end(), + [](const std::pair<std::string,int> & a, const std::pair<std::string,int> & b) + { + return a.second > b.second; + }); + + typesFirstOccurences.resize(std::min(nbErrorsToKeep, (int)typesFirstOccurences.size())); + + columns.clear(); + columns.resize(ProgramParameters::meanEntropy ? 6 : 5); + for (auto & it : typesFirstOccurences) + { + columns[0].emplace_back(it.first); + columns[1].emplace_back("= " + float2str(it.second*100.0/nbFirstErrorsTotal,"%.2f%%")); + columns[2].emplace_back("of first errors"); + columns[3].emplace_back("("+std::to_string(it.second) + " / " + std::to_string(nbFirstErrorsTotal) + ")"); + columns[4].emplace_back("introduces " + float2str(nbFirstErrorIntroduced[it.first],"%.2f errors")); + if (ProgramParameters::meanEntropy) + columns[5].emplace_back("mean entropy : " + float2str(meanEntropyByType[it.first], "%.2f")); + + for (unsigned int dist = minDistanceToCheck; dist <= maxDistanceToCheck; dist++) + { + columns[0].emplace_back(" Gold at distance"); + columns[1].emplace_back(std::to_string(dist)); + columns[2].emplace_back(float2str(distanceOfGoldByFirstType[it.first][dist]*100.0/nbFirstErrorOccurencesByType[it.first],"%.2f%%")); + columns[3].emplace_back("of the time"); + columns[4].emplace_back(""); + if (ProgramParameters::meanEntropy) + columns[5].emplace_back("mean entropy : " + float2str(meanEntropyByDistanceByFirstType[it.first][dist], "%.2f")); } for (auto & col : columns) @@ -159,5 +256,6 @@ void Errors::printStats() } printColumns(stderr, columns, 1); + printLine(); } diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp index a749f9d..ddd9dfa 100644 --- a/maca_common/include/ProgramParameters.hpp +++ b/maca_common/include/ProgramParameters.hpp @@ -55,6 +55,8 @@ struct ProgramParameters static int batchSize; static std::string loss; static std::string dicts; + static bool errorAnalysis; + static bool meanEntropy; static std::map<std::string,std::string> featureModelByClassifier; private : diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp index 41930ad..eb32e68 100644 --- a/maca_common/src/ProgramParameters.cpp +++ b/maca_common/src/ProgramParameters.cpp @@ -42,6 +42,8 @@ bool ProgramParameters::randomEmbeddings; bool ProgramParameters::randomParameters; bool ProgramParameters::printEntropy; bool ProgramParameters::printTime; +bool ProgramParameters::errorAnalysis; +bool ProgramParameters::meanEntropy; int ProgramParameters::iterationSize; int ProgramParameters::nbTrain; std::string ProgramParameters::sequenceDelimiterTape; -- GitLab