diff --git a/error_correction/CMakeLists.txt b/error_correction/CMakeLists.txt index fe0b241d275cf50a628a2a59f89fb1eb8486062e..e5db2824035c45568bfb3904252027bd2ee614ef 100644 --- a/error_correction/CMakeLists.txt +++ b/error_correction/CMakeLists.txt @@ -5,3 +5,7 @@ target_link_libraries(macaon_error_correction transition_machine) target_link_libraries(macaon_error_correction ${Boost_PROGRAM_OPTIONS_LIBRARY}) install(TARGETS macaon_error_correction DESTINATION bin) +add_executable(macaon_train_error_detector src/macaon_train_error_detector.cpp) +target_link_libraries(macaon_train_error_detector transition_machine) +target_link_libraries(macaon_train_error_detector ${Boost_PROGRAM_OPTIONS_LIBRARY}) +install(TARGETS macaon_train_error_detector DESTINATION bin) diff --git a/error_correction/src/macaon_error_correction.cpp b/error_correction/src/macaon_error_correction.cpp index 341f3ca7add3032b689a15f6a55e7f1a13e0b965..5c07e76ca07ff628cad00868f37969b040a33047 100644 --- a/error_correction/src/macaon_error_correction.cpp +++ b/error_correction/src/macaon_error_correction.cpp @@ -39,7 +39,6 @@ po::options_description getOptionsDescription() opt.add_options() ("help,h", "Produce this help message") ("debug,d", "Print infos on stderr") - ("printEntropy", "Print entropy for each sequence") ("sequenceDelimiterTape", po::value<std::string>()->default_value("EOS"), "The name of the buffer's tape that contains the delimiter token for a sequence") ("sequenceDelimiter", po::value<std::string>()->default_value("1"), @@ -108,7 +107,6 @@ int main(int argc, char * argv[]) ProgramParameters::input = vm["input"].as<std::string>(); ProgramParameters::mcdName = vm["mcd"].as<std::string>(); ProgramParameters::debug = vm.count("debug") == 0 ? false : true; - ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? 
false : true; ProgramParameters::lang = vm["lang"].as<std::string>(); ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>(); ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>(); @@ -223,11 +221,13 @@ int main(int argc, char * argv[]) config.moveHead(transition->headMvt); - if (ProgramParameters::printEntropy) + if (true) { nbActionsInSequence++; - entropyAccumulator += Classifier::computeEntropy(weightedActions); + float entropy = Classifier::computeEntropy(weightedActions); + config.addToEntropyHistory(entropy); + entropyAccumulator += entropy; if (config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] != ProgramParameters::sequenceDelimiter) justFlipped = false; @@ -235,9 +235,6 @@ int main(int argc, char * argv[]) if ((config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] == ProgramParameters::sequenceDelimiter && !justFlipped)) { justFlipped = true; - entropyAccumulator /= nbActionsInSequence; - nbActionsInSequence = 0; - fprintf(stderr, "Entropy : %.2f\n", entropyAccumulator); entropyAccumulator = 0.0; } } diff --git a/error_correction/src/macaon_train_error_detector.cpp b/error_correction/src/macaon_train_error_detector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1a621b410e119acdfb6d864ec4949fe9393ac71f --- /dev/null +++ b/error_correction/src/macaon_train_error_detector.cpp @@ -0,0 +1,511 @@ +/// @file macaon_train.cpp +/// @author Franck Dary +/// @version 1.0 +/// @date 2018-08-07 + +#include <cstdio> +#include <cstdlib> +#include <boost/program_options.hpp> +#include "BD.hpp" +#include "Config.hpp" +#include "TransitionMachine.hpp" +#include "Trainer.hpp" +#include "ProgramParameters.hpp" + +namespace po = boost::program_options; + +/// @brief Get the list of mandatory and optional program arguments. +/// +/// @return The lists. 
+po::options_description getOptionsDescription()
+{
+  po::options_description desc("Command-Line Arguments ");
+
+  po::options_description req("Required");
+  req.add_options()
+    ("expName", po::value<std::string>()->required(),
+      "Name of this experiment")
+    ("templateName", po::value<std::string>()->required(),
+      "Name of the template folder")
+    ("tm", po::value<std::string>()->required(),
+      "File describing the Tape Machine we will train")
+    ("bd", po::value<std::string>()->required(),
+      "BD file that describes the multi-tapes buffer")
+    ("mcd", po::value<std::string>()->required(),
+      "MCD file that describes the input")
+    ("train,T", po::value<std::string>()->required(),
+      "Training corpus formated according to the MCD")
+    ("dev", po::value<std::string>()->default_value(""),
+      "Development corpus formated according to the MCD");
+
+  po::options_description opt("Optional");
+  opt.add_options()
+    ("help,h", "Produce this help message")
+    ("debug,d", "Print infos on stderr")
+    ("printEntropy", "Print mean entropy and standard deviation accross sequences")
+    ("optimizer", po::value<std::string>()->default_value("amsgrad"),
+      "The learning algorithm to use : amsgrad | adam | sgd")
+    ("lang", po::value<std::string>()->default_value("fr"),
+      "Language you are working with")
+    ("nbiter,n", po::value<int>()->default_value(5),
+      "Number of training epochs (iterations)")
+    ("iterationSize", po::value<int>()->default_value(-1),
+      "The number of examples for each iteration. -1 means the whole training set")
+    ("lr", po::value<float>()->default_value(0.001),
+      "Learning rate of the optimizer")
+    ("seed,s", po::value<int>()->default_value(100),
+      "The random seed that will initialize RNG")
+    ("nbTrain", po::value<int>()->default_value(0),
+      "The number of models that will be trained, with only the random seed changing")
+    ("duplicates", po::value<bool>()->default_value(true),
+      "Remove identical training examples")
+    ("showFeatureRepresentation", po::value<bool>()->default_value(false),
+      "For each state of the Config, show its feature representation")
+    ("interactive", po::value<bool>()->default_value(true),
+      "Is the shell interactive ? Display advancement informations")
+    ("randomEmbeddings", po::value<bool>()->default_value(false),
+      "When activated, the embeddings will be randomly initialized")
+    ("sequenceDelimiterTape", po::value<std::string>()->default_value("EOS"),
+      "The name of the buffer's tape that contains the delimiter token for a sequence")
+    ("sequenceDelimiter", po::value<std::string>()->default_value("1"),
+      "The value of the token that act as a delimiter for sequences")
+    ("printTime", "Print time on stderr")
+    ("shuffle", po::value<bool>()->default_value(true),
+      "Shuffle examples after each iteration");
+
+  po::options_description oracle("Oracle related options");
+  oracle.add_options()
+    ("epochd", po::value<int>()->default_value(3),
+      "Number of the first epoch where the oracle will be dynamic")
+    ("proba", po::value<float>()->default_value(0.9),
+      "The probability that the dynamic oracle will chose the predicted action");
+
+  po::options_description ams("Amsgrad family optimizers");
+  ams.add_options()
+    ("b1", po::value<float>()->default_value(0.9),
+      "beta1 parameter for the Amsgtad or Adam optimizer")
+    ("b2", po::value<float>()->default_value(0.999),
+      "beta2 parameter for the Amsgtad or Adam optimizer")
+    ("bias", po::value<float>()->default_value(1e-8),
+      "bias parameter for the Amsgtad or Adam or Adagrad optimizer");
+
+  desc.add(req).add(opt).add(oracle).add(ams);
+
+  return desc;
+}
+
+/// @brief Parse the command line into a variables_map. Prints the usage and exits with status 1 on a parsing or validation error, and with status 0 when --help is given.
+///
+/// @param od The description of all the possible options.
+/// @param argc The number of arguments given to this program.
+/// @param argv The values of arguments given to this program.
+///
+/// @return The variables map, once po::notify has validated the required options.
+po::variables_map checkOptions(po::options_description & od, int argc, char ** argv)
+{
+  po::variables_map vm;
+
+  try {po::store(po::parse_command_line(argc, argv, od), vm);}
+  catch(std::exception& e)
+  {
+    std::cerr << "Error: " << e.what() << "\n";
+    od.print(std::cerr);
+    exit(1);
+  }
+
+  if (vm.count("help"))
+  {
+    std::cout << od << "\n";
+    exit(0);
+  }
+
+  try {po::notify(vm);}
+  catch(std::exception& e)
+  {
+    std::cerr << "Error: " << e.what() << "\n";
+    od.print(std::cerr);
+    exit(1);
+  }
+
+  return vm;
+}
+
+/// @brief Set all the useful paths of ProgramParameters from the MACAON_DIR environment variable, lang, expName and templateName. Aborts when MACAON_DIR is not set.
+void updatePaths()
+{
+  const char * MACAON_DIR = std::getenv("MACAON_DIR"); if (!MACAON_DIR) {fprintf(stderr, "ERROR : environment variable MACAON_DIR is not set. Aborting.\n"); exit(1);} // building a std::string from a null pointer is undefined behaviour, and these paths are later given to 'rm -r'
+  std::string slash = "/";
+  ProgramParameters::langPath = MACAON_DIR + slash + ProgramParameters::lang + slash;
+  ProgramParameters::expPath = ProgramParameters::langPath + "bin/" + ProgramParameters::expName + slash;
+  ProgramParameters::templatePath = ProgramParameters::langPath + ProgramParameters::templateName + slash;
+  ProgramParameters::tmFilename = ProgramParameters::expPath + ProgramParameters::tmName;
+  ProgramParameters::bdFilename = ProgramParameters::expPath + ProgramParameters::bdName;
+  ProgramParameters::mcdFilename = ProgramParameters::expPath + ProgramParameters::mcdName;
+  ProgramParameters::trainFilename = ProgramParameters::expPath + ProgramParameters::trainName;
+  ProgramParameters::devFilename = ProgramParameters::expPath + ProgramParameters::devName;
+  ProgramParameters::newTemplatePath = ProgramParameters::langPath + "bin/" + ProgramParameters::baseExpName + slash;
+}
+
+/// @brief Create the folder containing the current
experiment from the template folder
+void createExpPath()
+{
+std::string decode = "\
+#! /bin/bash\n\
+\n\
+if [ \"$#\" -lt 2 ]; then\n\
+ echo \"Usage : $0 input mcd\"\n\
+ exit\n\
+fi\n\
+\n\
+INPUT=$1\n\
+MCD=$2\n\
+\n\
+shift\n\
+shift\n\
+ARGS=\"\"\n\
+for arg in \"$@\"\n\
+do\n\
+ ARGS=\"$ARGS $arg\"\n\
+done\n\
+\n\
+macaon_decode --lang " + ProgramParameters::lang + " --tm machine.tm --bd test.bd -I $INPUT --mcd $MCD --expName " + ProgramParameters::expName + "$ARGS";
+
+  if (system(("rm -r " + ProgramParameters::expPath + " 2> /dev/null").c_str())){}
+  if (system(("mkdir " + ProgramParameters::expPath).c_str())){}
+  if (system(("cp -r " + ProgramParameters::newTemplatePath + "* " + ProgramParameters::expPath + ".").c_str())){}
+  if (system(("echo \'" + decode + "\' > " + ProgramParameters::expPath + "decode.sh").c_str())){}
+  if (system(("chmod +x " + ProgramParameters::expPath + "decode.sh").c_str())){}
+  if (system(("ln -f -s " + ProgramParameters::expPath + "decode.sh " + ProgramParameters::langPath + "bin/maca_tm_" + ProgramParameters::expName).c_str())){}
+}
+/// @brief Score the classifier on the dev set : a CORRECT example counts when it is predicted CORRECT, a run of consecutive ERROR examples counts once when any of them is predicted ERROR. Returns the percentage per classifier name (only the .first member is filled).
+std::map<std::string, std::pair<float, std::pair<float, float> > > getScoreOnDev(TransitionMachine & tm, std::vector<Config> devConfigs, std::vector<int> & devIsErrors, std::vector<int> &)
+{
+  tm.reset();
+
+  std::map< std::string, std::pair<int, int> > counts;
+
+  if (ProgramParameters::debug)
+    fprintf(stderr, "Computing score on dev set\n");
+
+  std::vector<int> predictions;
+  std::string classifierName;
+
+  for (unsigned int i = 0; i < devConfigs.size(); i++)
+  {
+    auto & devConfig = devConfigs[i];
+    TransitionMachine::State * currentState = tm.getCurrentState();
+    Classifier * classifier = currentState->classifier;
+    devConfig.setCurrentStateName(&currentState->name);
+    Dict::currentClassifierName = classifier->name;
+    classifier->initClassifier(devConfig);
+
+    auto weightedActions = classifier->weightActions(devConfig);
+    std::string pAction = "";
+
+    for (auto & it : weightedActions)
+      if (it.first)
+      {
+        pAction = it.second.second;
+        break;
+      }
+
+    predictions.emplace_back(pAction == "ERROR" ? 1 : 0);
+    classifierName = classifier->name;
+  }
+
+  for (unsigned int i = 0; i < devIsErrors.size(); i++)
+  {
+    if (devIsErrors[i] == 0)
+    {
+      counts[classifierName].first++;
+      if (predictions[i] == 0)
+        counts[classifierName].second++;
+    }
+    else if (i > 0 && devIsErrors[i] == 1 && devIsErrors[i-1] == 0) // NOTE(review): a run of errors starting at index 0 is never counted - confirm this is intended
+    {
+      counts[classifierName].first++;
+      unsigned int j;
+      for (j = i; j < devIsErrors.size() && devIsErrors[j] == 1; j++) // bound is tested first so devIsErrors is never read past its end
+      {
+        if (predictions[j] == 1)
+        {
+          counts[classifierName].second++;
+          break;
+        }
+      }
+      i = (j < devIsErrors.size() && devIsErrors[j] == 1) ? j : j-1; // when the run ended undetected, j is the first example after it : step back so the i++ of the loop does not skip it
+    }
+  }
+
+  std::map<std::string, std::pair<float,std::pair<float,float> > > scores;
+
+  for (auto & it : counts)
+    scores[it.first].first = 100.0 * it.second.second / it.second.first;
+
+  return scores;
+}
+/// @brief Compute the train and dev accuracies, save every classifier whose dev score improved on bestScores, then print one report line per classifier on output.
+void printScoresAndSave(FILE * output, std::map< std::string, std::pair<int, int> > & trainCounter, std::map< std::string, float > & scores, TransitionMachine & tm, int curIter, std::map< std::string, float > & bestScores, std::vector<Config> & devConfigs, std::vector<int> & devIsErrors, std::vector<int> & devErrorIndexes)
+{
+  for (auto & it : trainCounter)
+    scores[it.first] = 100.0 * it.second.second / it.second.first;
+
+  std::vector<std::string> names;
+  std::vector<std::string> acc;
+  std::vector<std::string> train;
+  std::vector<std::string> dev;
+  std::vector<std::string> savedStr;
+
+  std::map<std::string, bool> saved;
+
+  auto devScores = getScoreOnDev(tm, devConfigs, devIsErrors, devErrorIndexes);
+
+  if (true)
+  {
+    for (auto & it : devScores)
+    {
+      if (bestScores.count(it.first) == 0 || bestScores[it.first] < it.second.first)
+      {
+        bestScores[it.first] = it.second.first;
+        saved[it.first] = true;
+      }
+      else
+        saved[it.first] = false;
+    }
+  }
+
+  auto classifiers = tm.getClassifiers();
+  for (auto * cla : classifiers)
+  {
+    if (!saved.count(cla->name))
+      continue;
+
+    if (saved[cla->name])
+    {
+      cla->save(ProgramParameters::expPath + cla->name + ".model");
+      Dict::saveDicts(ProgramParameters::expPath, cla->name);
+    }
+  }
+
+  for (auto & it : saved)
+  {
+    names.emplace_back(it.first);
+    acc.emplace_back("accuracy");
+    train.emplace_back(": train(" + float2str(scores[it.first], "%.2f") + "%)");
+    dev.emplace_back("dev(" +float2str(devScores[it.first].first, "%.2f") + "%)");
+    savedStr.emplace_back(saved[it.first] ? "SAVED" : "");
+    if (ProgramParameters::printEntropy)
+      savedStr.back() += " Entropy[" + float2str(devScores[it.first].second.first, "%.2f") + "\u00B1" + float2str(devScores[it.first].second.second, "%.2f") + "]";
+  }
+
+  if (ProgramParameters::interactive)
+    fprintf(stderr, " \r");
+  if (ProgramParameters::printTime)
+    fprintf(output, "[%s] ", getTime().c_str());
+  fprintf(output, "Iteration %d/%d :\n", curIter+1, ProgramParameters::nbIter);
+
+  printColumns(output, {names, acc, train, dev, savedStr});
+}
+
+/// @brief Train the error detector according to all the ProgramParameters : load the train and dev examples, then for each epoch train on every example and report / save through printScoresAndSave.
+void launchTraining()
+{
+  std::map< std::string, float > scores;
+  std::map< std::string, float > bestScores;
+
+  TransitionMachine tm(true);
+
+  BD trainBD(ProgramParameters::bdFilename, ProgramParameters::mcdFilename);
+
+  File train(ProgramParameters::expPath + ProgramParameters::trainName, "r");
+  FILE * trainPtr = train.getDescriptor();
+  File dev(ProgramParameters::expPath + ProgramParameters::devName, "r");
+  FILE * devPtr = dev.getDescriptor();
+
+  Dict::createFiles(ProgramParameters::expPath, "");
+
+  fprintf(stderr, "%sTraining of \'%s\' :\n",
+    ProgramParameters::printTime ? ("["+getTime()+"] ").c_str() : "",
+    tm.name.c_str());
+
+  std::map< std::string, bool > topologyPrinted;
+  std::map< std::string, std::pair<int, int> > trainCounter;
+  int curIter = 0;
+  std::vector<Config> configs;
+  std::vector<int> isErrors;
+  std::vector<int> errorIndexes;
+
+  std::vector<Config> devConfigs;
+  std::vector<int> devIsErrors;
+  std::vector<int> devErrorIndexes;
+
+  int isError;
+  int errorIndex;
+  fprintf(stderr, "Reading train corpus...");
+  while (fscanf(trainPtr, "%d\t%d\n", &isError, &errorIndex) == 2) // each example is a "isError<TAB>errorIndex" line followed by a Config dump
+  {
+    configs.emplace_back(trainBD);
+    isErrors.emplace_back(isError);
+    errorIndexes.emplace_back(errorIndex);
+    configs.back().loadFromFile(train);
+  }
+  fprintf(stderr, " done !\n");
+  fprintf(stderr, "Reading dev corpus...");
+  while (fscanf(devPtr, "%d\t%d\n", &isError, &errorIndex) == 2)
+  {
+    devConfigs.emplace_back(trainBD);
+    devIsErrors.emplace_back(isError);
+    devErrorIndexes.emplace_back(errorIndex);
+    devConfigs.back().loadFromFile(dev);
+  }
+  fprintf(stderr, " done !\n");
+
+  auto resetAndShuffle = [&configs,&trainCounter]()
+  {
+    //TODO shuffle (configs, isErrors and errorIndexes are indexed in parallel and must be permuted together)
+    /*
+    if(ProgramParameters::shuffleExamples)
+      std::random_shuffle(configs.begin(), configs.end());
+    */
+
+    for (auto & it : trainCounter)
+      it.second.first = it.second.second = 0;
+  };
+
+  while (curIter < ProgramParameters::nbIter)
+  {
+    resetAndShuffle();
+
+    for (unsigned int i = 0; i < configs.size(); i++)
+    {
+      auto & trainConfig = configs[i];
+      isError = isErrors[i];
+      errorIndex = errorIndexes[i];
+
+      TransitionMachine::State * currentState = tm.getCurrentState();
+      Classifier * classifier = currentState->classifier;
+      trainConfig.setCurrentStateName(&currentState->name);
+      Dict::currentClassifierName = classifier->name;
+      classifier->initClassifier(trainConfig);
+
+      if (!topologyPrinted.count(classifier->name))
+      {
+        topologyPrinted[classifier->name] = true;
+        classifier->printTopology(stderr);
+      }
+
+      // Print current iter advancement in percentage
+      if (ProgramParameters::interactive)
+      {
+        int totalSize = configs.size();
+        int steps = i;
+        if (steps % 200 == 0 || totalSize-steps < 200)
+          fprintf(stderr, "Current Iteration : %.2f%%\r", 100.0*steps/totalSize);
+      }
+
+      auto weightedActions = classifier->weightActions(trainConfig);
+      std::string pAction = "";
+
+      for (auto & it : weightedActions)
+        if (it.first)
+          if (pAction == "")
+          {
+            pAction = it.second.second;
+            break;
+          }
+
+      std::string oAction = isError ? "ERROR" : "CORRECT";
+
+      classifier->trainOnExample(trainConfig, classifier->getActionIndex(oAction));
+
+      trainCounter[classifier->name].first++;
+      trainCounter[classifier->name].second += pAction == oAction ? 1 : 0;
+    }
+
+    printScoresAndSave(stderr, trainCounter, scores, tm, curIter, bestScores, devConfigs, devIsErrors, devErrorIndexes);
+    curIter++;
+  }
+}
+/// @brief Copy the template folder into newTemplatePath, replacing any previous content (shell commands, failures are ignored).
+void createTemplatePath()
+{
+  if (system(("rm -r " + ProgramParameters::newTemplatePath + " 2> /dev/null").c_str())){}
+  if (system(("mkdir " + ProgramParameters::newTemplatePath).c_str())){}
+  if (system(("cp -r " + ProgramParameters::templatePath + "* " + ProgramParameters::newTemplatePath + ".").c_str())){}
+}
+/// @brief Delete the newTemplatePath folder.
+void removeTemplatePath()
+{
+  if (system(("rm -r " + ProgramParameters::newTemplatePath + " 2> /dev/null").c_str())){}
+}
+
+/// @brief Train a TransitionMachine to label configurations as ERROR or CORRECT, by using annotated examples.
+///
+/// @param argc The number of arguments given to this program.
+/// @param argv[] Array of arguments given to this program.
+///
+/// @return 0 if there was no crash.
+int main(int argc, char * argv[])
+{
+  auto od = getOptionsDescription();
+
+  po::variables_map vm = checkOptions(od, argc, argv);
+
+  ProgramParameters::expName = vm["expName"].as<std::string>();
+  ProgramParameters::baseExpName = ProgramParameters::expName; // kept unchanged : expName is rewritten as <baseExpName>_<i> for each training below
+  ProgramParameters::templateName = vm["templateName"].as<std::string>();
+  ProgramParameters::tmName = vm["tm"].as<std::string>();
+  ProgramParameters::bdName = vm["bd"].as<std::string>();
+  ProgramParameters::mcdName = vm["mcd"].as<std::string>();
+  ProgramParameters::debug = vm.count("debug") == 0 ? false : true;
+  ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? false : true;
+  ProgramParameters::printTime = vm.count("printTime") == 0 ? false : true;
+  ProgramParameters::trainName = vm["train"].as<std::string>();
+  ProgramParameters::devName = vm["dev"].as<std::string>();
+  ProgramParameters::lang = vm["lang"].as<std::string>();
+  ProgramParameters::nbIter = vm["nbiter"].as<int>();
+  ProgramParameters::seed = vm["seed"].as<int>();
+  ProgramParameters::nbTrain = vm["nbTrain"].as<int>();
+  ProgramParameters::removeDuplicates = vm["duplicates"].as<bool>();
+  ProgramParameters::interactive = vm["interactive"].as<bool>();
+  ProgramParameters::shuffleExamples = vm["shuffle"].as<bool>();
+  ProgramParameters::randomEmbeddings = vm["randomEmbeddings"].as<bool>();
+  ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>();
+  ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>();
+  ProgramParameters::learningRate = vm["lr"].as<float>();
+  ProgramParameters::beta1 = vm["b1"].as<float>();
+  ProgramParameters::beta2 = vm["b2"].as<float>();
+  ProgramParameters::bias = vm["bias"].as<float>();
+  ProgramParameters::optimizer = vm["optimizer"].as<std::string>();
+  ProgramParameters::dynamicEpoch = vm["epochd"].as<int>();
+  ProgramParameters::dynamicProbability = vm["proba"].as<float>();
+  ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<bool>();
+  ProgramParameters::iterationSize = vm["iterationSize"].as<int>();
+
+  if (ProgramParameters::nbTrain) // several trainings : the template is copied once, then each model gets its own experiment folder
+  {
+    updatePaths();
+    createTemplatePath();
+    for (int i = 0; i < ProgramParameters::nbTrain; i++)
+    {
+      fprintf(stderr, "Training number %d / %d :\n", i+1, ProgramParameters::nbTrain);
+      ProgramParameters::expName = ProgramParameters::baseExpName + "_" + std::to_string(i);
+      updatePaths(); // recompute every path from the new expName
+      createExpPath();
+      Dict::deleteDicts();
+      launchTraining();
+    }
+    removeTemplatePath();
+  }
+  else // single training : the experiment folder is created directly from the template folder
+  {
+    updatePaths();
+    ProgramParameters::newTemplatePath = ProgramParameters::templatePath;
+    createExpPath();
+    Dict::deleteDicts();
+    launchTraining();
+  }
+
+  return 0;
+}
+
diff --git a/maca_common/src/util.cpp b/maca_common/src/util.cpp
index 5bfa71d72ba3897bfee72f3396340279961d69ad..7f43046c38f1259d87b530420196fa1678ae647a 100644
--- a/maca_common/src/util.cpp
+++ b/maca_common/src/util.cpp
@@ -343,10 +343,26 @@ std::string float2str(float f, const char * format)
 bool isNum(const std::string & s)
 {
+  bool digitHapened = false; // becomes true once at least one digit has been read
+  bool dotHapened = false; // becomes true once the decimal dot has been read
+
   for (unsigned int i = 0; i < s.size(); i++)
+  {
+    if (s[i] == '.')
+    {
+      if (dotHapened || !digitHapened) // a second dot, or a dot that is not preceded by a digit, is rejected
+        return false;
+      dotHapened = true;
+      continue;
+    }
+
     if ((i == 0 && s[i] != '+' && s[i] != '-' && !isNum(s[i])) || (i != 0 && !isNum(s[i])))
       return false;
+    if (isNum(s[i]))
+      digitHapened = true;
+  }
+
   return true;
 }
diff --git a/transition_machine/include/Config.hpp b/transition_machine/include/Config.hpp
index 747304603d11d08cebbec23360a3b2b1f01fa180..f4ac482f270b52e6b0c768e69b1d6f0f407bca6e 100644
--- a/transition_machine/include/Config.hpp
+++ b/transition_machine/include/Config.hpp
@@ -8,6 +8,7 @@
 #include <vector>
 #include "BD.hpp"
+#include "File.hpp"
 /// @brief Configuration of a TransitionMachine.
 /// It consists of a multi-tapes buffer, a stack and a head.
@@ -44,6 +45,8 @@ class Config
   std::string * currentStateName;
   /// @brief For each state of the TransitionMachine, an history of the Action that have been applied to this Config.
   std::map< std::string, std::vector<std::string> > actionHistory;
+  /// @brief For each state of the TransitionMachine, a history of the entropies of the past decisions.
+  std::map< std::string, std::vector<float> > entropyHistory;
   /// @brief A stack that can contain indexes of the multi-tapes buffer.
   std::vector<int> stack;
   /// @brief The lastest popped element from the stack
@@ -156,6 +159,15 @@ class Config
   ///
   /// @return The number of elements in the stack.
   int stackSize();
+  /// @brief Load a Config to match the one that has been written to file,
+  /// formatted by printAsExample. Exits the program on a malformed input.
+  ///
+  /// @param file The File to read from.
+  void loadFromFile(File & file);
+  /// @brief Append the entropy to the entropyHistory of the current state.
+  ///
+  /// @param entropy The entropy value.
+  void addToEntropyHistory(float entropy);
 };
 #endif
diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp
index 2d071599eb0dca0129f8a4396e25ba50f3fb9b5b..5281f503b01008c4dde7dffd08773475793ef0f9 100644
--- a/transition_machine/src/ActionBank.cpp
+++ b/transition_machine/src/ActionBank.cpp
@@ -139,6 +139,32 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
   else if(std::string(b1) == "NOTHING")
   {
   }
+  else if(std::string(b1) == "ERROR")
+  {
+    auto apply = [](Config &, Action::BasicAction &)
+      {fprintf(stderr, "ERROR\n");};
+    auto undo = [](Config &, Action::BasicAction &)
+      {};
+    auto appliable = [](Config &, Action::BasicAction &)
+      {return true;};
+    Action::BasicAction basicAction =
+      {Action::BasicAction::Type::Push, "", apply, undo, appliable};
+
+    sequence.emplace_back(basicAction);
+  }
+  else if(std::string(b1) == "CORRECT")
+  {
+    auto apply = [](Config &, Action::BasicAction &)
+      {fprintf(stderr, "CORRECT\n");};
+    auto undo = [](Config &, Action::BasicAction &)
+      {};
+    auto appliable = [](Config &, Action::BasicAction &)
+      {return true;};
+    Action::BasicAction basicAction =
+      {Action::BasicAction::Type::Push, "", apply, undo, appliable};
+
+    sequence.emplace_back(basicAction);
+  }
   else if(std::string(b1) == "SHIFT")
   {
     auto apply = [](Config & c, Action::BasicAction &)
diff --git a/transition_machine/src/Config.cpp b/transition_machine/src/Config.cpp
index 985e39ef810673afb19e46960836f4e8704526fe..025b4f15a6f8acebf1b176cf17960d32b4cdfa81 100644
--- a/transition_machine/src/Config.cpp
+++ b/transition_machine/src/Config.cpp
@@ -143,13 +143,31 @@ void Config::printForDebug(FILE * output)
 void Config::printAsExample(FILE * output)
 {
-  int window = 5;
+  int window = 100;
+  int historyWindow = 10;
   fprintf(output, "head=%d\n", head);
   fprintf(output, "stack=");
   for (unsigned int i = 0; i < stack.size(); i++)
     fprintf(output, "%d%s", stack[i], i == stack.size()-1 ? "" : ",");
   fprintf(output, "\n");
   fprintf(output, "stackHistory=%d\n", stackHistory);
+  fprintf(output, "actionsHistory=\n");
+  for (auto & history : actionHistory)
+  {
+    fprintf(output, "%s=", history.first.c_str());
+    for (int i = history.second.size()-1-std::min(((int)history.second.size())-1, historyWindow); i < (int)history.second.size(); i++)
+      fprintf(output, "%s%s", history.second[i].c_str(), i == (int)history.second.size()-1 ? "" : ",");
+    fprintf(output, "\n");
+  }
+  fprintf(output, "entropyHistory=\n");
+  for (auto & history : entropyHistory)
+  {
+    fprintf(output, "%s=", history.first.c_str());
+    for (int i = history.second.size()-1-std::min(((int)history.second.size())-1, historyWindow); i < (int)history.second.size(); i++)
+      fprintf(output, "%f%s", history.second[i], i == (int)history.second.size()-1 ? "" : ",");
+    fprintf(output, "\n");
+  }
+  fprintf(output, "-----\n"); // separator read back by Config::loadFromFile : it must go to the same stream as the example
   for (auto & tape : tapes)
   {
     fprintf(output, "%s\t", tape.name.c_str());
@@ -157,6 +175,7 @@ void Config::printAsExample(FILE * output)
       fprintf(output, "%d=%s\t", i, tape[i].c_str());
     fprintf(output, "\n");
   }
+  fprintf(output, "-----\n"); // closing separator, also expected by Config::loadFromFile
 }
 void Config::printAsOutput(FILE * output)
@@ -333,3 +352,136 @@ int Config::stackSize()
   return stack.size();
 }
+void Config::loadFromFile(File & file)
+{
+  static auto errorAndExit = [](const char * errinfo, const char * context)
+  {
+    fprintf(stderr, "ERROR (%s) : expected \'%s\'. Aborting.\n", errinfo, context);
+    exit(1);
+  };
+
+  tapes.clear();
+  stackHistory = -1;
+  stack.clear();
+  actionHistory.clear();
+  entropyHistory.clear();
+  head = -1;
+
+  FILE * filePtr = file.getDescriptor();
+  char buffer[100000]; // every string conversion below is limited to 99999 characters to fit these buffers
+  char buffer2[100000];
+  int number;
+  int firstIndex = -1;
+
+  if (fscanf(filePtr, "head=%d\n", &number) != 1)
+    errorAndExit(ERRINFO, "head=X");
+
+  head = number;
+  if (fscanf(filePtr, "stack%99999[^\n]\n", buffer) != 1)
+    errorAndExit(ERRINFO, "stack=x,x,x,...");
+  auto splitted = split(buffer+1, ','); // buffer+1 skips the '=' that follows "stack"
+  for (auto & s : splitted)
+  {
+    if (!isNum(s))
+    {
+      fprintf(stderr, "<%s>\n", s.c_str());
+      errorAndExit(ERRINFO, "number instead of string");
+    }
+
+    stack.emplace_back(std::stoi(s));
+  }
+
+  if (fscanf(filePtr, "stackHistory=%d\n", &number) != 1)
+    errorAndExit(ERRINFO, "stackHistory=X");
+  stackHistory = number;
+
+  if (fscanf(filePtr, "actionsHistory%99999s\n", buffer) != 1)
+    errorAndExit(ERRINFO, "actionsHistory=");
+
+  while (fscanf(filePtr, "entropyHistory%99999[^\n]\n", buffer) != 1) // NOTE(review): a state name sharing a prefix with "entropyHistory" would have that prefix consumed by this attempt - confirm state names cannot start with 'e'
+  {
+    buffer2[0] = '\0';
+    if (fscanf(filePtr, "%99999[^=]%99999[^\n]\n", buffer, buffer2) < 1)
+      errorAndExit(ERRINFO, "stateName=h1,h2,...");
+
+    if (strlen(buffer2) == 0 || buffer2[0] != '=')
+      errorAndExit(ERRINFO, "stateName=h1,h2,...");
+
+    auto history = split(buffer2+1, ',');
+
+    for (auto & s : history)
+      actionHistory[buffer].emplace_back(s);
+  }
+  while (fscanf(filePtr, "----%99999s\n", buffer) != 1) // entropy histories, until the first "-----" separator
+  {
+    if (fscanf(filePtr, "%99999[^=]%99999[^\n]\n", buffer, buffer2) < 1)
+      errorAndExit(ERRINFO, "stateName=e1,e2,...");
+
+    if (strlen(buffer2) == 0 || buffer2[0] != '=')
+      errorAndExit(ERRINFO, "stateName=h1,h2,...");
+
+    auto history = split(buffer2+1, ',');
+
+    for (auto & s : history)
+    {
+      if (!isNum(s))
+        errorAndExit(ERRINFO, "number instead of string");
+
+      entropyHistory[buffer].emplace_back(std::stof(s));
+    }
+  }
+  while (fscanf(filePtr, "----%99999s\n", buffer) != 1) // one tape per line, until the closing "-----" separator
+  {
+    if (fscanf(filePtr, "%99999[^\n]\n", buffer) != 1)
+    {
+      fprintf(stderr, "<%s>\n", buffer);
+      errorAndExit(ERRINFO, "TAPENAME\txx=yy\trr=zz...");
+    }
+
+    auto splited = split(buffer, '\t');
+
+    if (splited.size() < 2)
+      errorAndExit(ERRINFO, "TAPENAME\txx=yy\trr=zz...");
+
+    tapes.emplace_back();
+    tapes.back().name = splited[0];
+    tapes.back().isKnown = false;
+    tapes.back().ref.resize(splited.size()-1);
+    for (unsigned int i = 1; i < splited.size(); i++)
+    {
+      auto parts = split(splited[i], '=');
+      std::string indexStr = parts[0];
+      std::string value;
+      for (unsigned int j = 1; j < parts.size(); j++)
+        value += parts[j]+(j == parts.size()-1 ? std::string("") : std::string("="));
+      if (!isNum(indexStr))
+        errorAndExit(ERRINFO, "number instead of string");
+
+      int index = std::stoi(indexStr);
+
+      if (firstIndex == -1)
+        firstIndex = index;
+
+      index -= firstIndex; // indexes are stored relative to the first index of the window
+      if (index < 0 || index >= (int)tapes.back().ref.size()) errorAndExit(ERRINFO, "tape indexes inside the window"); // never write outside of ref
+      tapes.back().ref[index] = value;
+    }
+    tapes.back().hyp = tapes.back().ref;
+  }
+
+  head -= firstIndex;
+  for (auto & s : stack)
+  {
+    s -= firstIndex;
+    if (s < 0)
+      fprintf(stderr, "WARNING (%s) : stack element \'%d\' is negative, window was too small when creating the error corpus\n", ERRINFO, s);
+  }
+  if (stackHistory > 0)
+    stackHistory -= firstIndex;
+}
+
+void Config::addToEntropyHistory(float entropy)
+{
+  entropyHistory[*currentStateName].emplace_back(entropy);
+}
+
diff --git a/transition_machine/src/Oracle.cpp b/transition_machine/src/Oracle.cpp
index 76dd82ffe8a8e4e15d67275bc68d2da7e4b38dbd..ac8bc7d3786d5a24c2537726467ff97ade4b94e5 100644
--- a/transition_machine/src/Oracle.cpp
+++ b/transition_machine/src/Oracle.cpp
@@ -76,6 +76,25 @@ void Oracle::createDatabase()
     return;
   isInit = true;
+  str2oracle.emplace("null", std::unique_ptr<Oracle>(new Oracle(
+  [](Oracle *)
+  {
+  },
+  [](Config &, Oracle *)
+  {
+    fprintf(stderr, "ERROR (%s) : getAction called on null Oracle. Aborting.\n", ERRINFO);
+    exit(1);
+
+    return std::string("");
+  },
+  [](Config &, Oracle *, const std::string &)
+  {
+    fprintf(stderr, "ERROR (%s) : getAction called on null Oracle. Aborting.\n", ERRINFO);
+    exit(1);
+
+    return false;
+  })));
+
   str2oracle.emplace("tagger", std::unique_ptr<Oracle>(new Oracle(
   [](Oracle *)
   {