diff --git a/error_correction/CMakeLists.txt b/error_correction/CMakeLists.txt index ef39694bf609150cd3aab8be3527e91b14826caf..28d408aa5a91fb44397a422650a5542aaf0bee4f 100644 --- a/error_correction/CMakeLists.txt +++ b/error_correction/CMakeLists.txt @@ -1,20 +1,4 @@ FILE(GLOB SOURCES src/*.cpp) -add_executable(macaon_error_correction src/macaon_error_correction.cpp) -target_link_libraries(macaon_error_correction errors) -target_link_libraries(macaon_error_correction transition_machine) -target_link_libraries(macaon_error_correction ${Boost_PROGRAM_OPTIONS_LIBRARY}) -install(TARGETS macaon_error_correction DESTINATION bin) - -add_executable(macaon_train_error_detector src/macaon_train_error_detector.cpp) -target_link_libraries(macaon_train_error_detector transition_machine) -target_link_libraries(macaon_train_error_detector ${Boost_PROGRAM_OPTIONS_LIBRARY}) -install(TARGETS macaon_train_error_detector DESTINATION bin) - -add_executable(macaon_decode_error_detector src/macaon_decode_error_detector.cpp) -target_link_libraries(macaon_decode_error_detector transition_machine) -target_link_libraries(macaon_decode_error_detector ${Boost_PROGRAM_OPTIONS_LIBRARY}) -install(TARGETS macaon_decode_error_detector DESTINATION bin) - #compiling library add_library(errors STATIC ${SOURCES}) diff --git a/error_correction/src/macaon_decode_error_detector.cpp b/error_correction/src/macaon_decode_error_detector.cpp deleted file mode 100644 index 8b8236858715b550ec73c9db8b13400b481f1e51..0000000000000000000000000000000000000000 --- a/error_correction/src/macaon_decode_error_detector.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/// @file macaon_decode_error_detector.cpp -/// @author Franck Dary -/// @version 1.0 -/// @date 2018-12-03 - -#include <cstdio> -#include <cstdlib> -#include <boost/program_options.hpp> -#include "BD.hpp" -#include "Config.hpp" -#include "TransitionMachine.hpp" - -namespace po = boost::program_options; - -/// @brief Get the list of mandatory and optional program arguments. -/// -/// @return The lists. -po::options_description getOptionsDescription() -{ - po::options_description desc("Command-Line Arguments "); - - po::options_description req("Required"); - req.add_options() - ("expName", po::value<std::string>()->required(), - "Name of this experiment") - ("tm", po::value<std::string>()->required(), - "File describing the Tape Machine to use") - ("bd", po::value<std::string>()->required(), - "BD file that describes the multi-tapes buffer") - ("mcd", po::value<std::string>()->required(), - "MCD file that describes the input") - ("input,I", po::value<std::string>()->required(), - "Input file formated according to the mcd"); - - po::options_description opt("Optional"); - opt.add_options() - ("help,h", "Produce this help message") - ("debug,d", "Print infos on stderr") - ("printEntropy", "Print entropy for each sequence") - ("sequenceDelimiterTape", po::value<std::string>()->default_value("EOS"), - "The name of the buffer's tape that contains the delimiter token for a sequence") - ("sequenceDelimiter", po::value<std::string>()->default_value("1"), - "The value of the token that act as a delimiter for sequences") - ("showFeatureRepresentation", po::value<int>()->default_value(0), - "For each state of the Config, show its feature representation") - ("lang", po::value<std::string>()->default_value("fr"), - "Language you are working with"); - - desc.add(req).add(opt); - - return desc; -} - -/// @brief Store the program arguments inside a variables_map -/// -/// @param od The description of all the possible options. -/// @param argc The number of arguments given to this program. -/// @param argv The values of arguments given to this program. -/// -/// @return The variables map -po::variables_map checkOptions(po::options_description & od, int argc, char ** argv) -{ - po::variables_map vm; - - try {po::store(po::parse_command_line(argc, argv, od), vm);} - catch(std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - od.print(std::cerr); - exit(1); - } - - if (vm.count("help")) - { - std::cout << od << "\n"; - exit(0); - } - - try {po::notify(vm);} - catch(std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - od.print(std::cerr); - exit(1); - } - - return vm; -} - -/// @brief Uses a pre-trained TransitionMachine to predict and add information to a structured input file. -/// -/// @param argc The number of arguments given to this program. -/// @param argv[] Array of arguments given to this program. -/// -/// @return 0 if there was no crash. -int main(int argc, char * argv[]) -{ - auto od = getOptionsDescription(); - - po::variables_map vm = checkOptions(od, argc, argv); - - ProgramParameters::expName = vm["expName"].as<std::string>(); - ProgramParameters::tmName = vm["tm"].as<std::string>(); - ProgramParameters::bdName = vm["bd"].as<std::string>(); - ProgramParameters::input = vm["input"].as<std::string>(); - ProgramParameters::mcdName = vm["mcd"].as<std::string>(); - ProgramParameters::debug = vm.count("debug") == 0 ? false : true; - ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? false : true; - ProgramParameters::lang = vm["lang"].as<std::string>(); - ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>(); - ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>(); - ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<int>(); - - const char * MACAON_DIR = std::getenv("MACAON_DIR"); - std::string slash = "/"; - ProgramParameters::expPath = MACAON_DIR + slash + ProgramParameters::lang + slash + "bin/" + ProgramParameters::expName + slash; - - ProgramParameters::tmFilename = ProgramParameters::expPath + ProgramParameters::tmName; - ProgramParameters::bdFilename = ProgramParameters::expPath + ProgramParameters::bdName; - ProgramParameters::mcdFilename = ProgramParameters::mcdName; - - TransitionMachine tm(false); - - BD bd(ProgramParameters::bdFilename, ProgramParameters::mcdFilename); - Config config(bd); - - File input(ProgramParameters::input, "r"); - FILE * inputPtr = input.getDescriptor(); - - int isError, errorIndex; - while (fscanf(inputPtr, "%d\t%d\n", &isError, &errorIndex) == 2) - { - config.loadFromFile(input); - - TransitionMachine::State * currentState = tm.getCurrentState(); - Classifier * classifier = currentState->classifier; - config.setCurrentStateName(¤tState->name); - Dict::currentClassifierName = classifier->name; - classifier->initClassifier(config); - - auto weightedActions = classifier->weightActions(config); - std::string pAction = ""; - - for (auto & it : weightedActions) - if (it.first) - if (pAction == "") - { - pAction = it.second.second; - break; - } - - Action * action = classifier->getAction(pAction); - - action->apply(config); - - } - - return 0; -} - diff --git a/error_correction/src/macaon_error_correction.cpp b/error_correction/src/macaon_error_correction.cpp deleted file mode 100644 index 9757e6788f881d7e6c798341e63405ae94cfba58..0000000000000000000000000000000000000000 --- a/error_correction/src/macaon_error_correction.cpp +++ /dev/null @@ -1,260 +0,0 @@ -/// @file macaon_error_correction.cpp -/// @author Franck Dary -/// @version 1.0 -/// @date 2018-11-27 - -#include <cstdio> -#include <cstdlib> -#include <boost/program_options.hpp> -#include "BD.hpp" -#include "Config.hpp" -#include "TransitionMachine.hpp" -#include "util.hpp" -#include "Error.hpp" -#include "ActionBank.hpp" - -namespace po = boost::program_options; - -/// @brief Get the list of mandatory and optional program arguments. -/// -/// @return The lists. -po::options_description getOptionsDescription() -{ - po::options_description desc("Command-Line Arguments "); - - po::options_description req("Required"); - req.add_options() - ("expName", po::value<std::string>()->required(), - "Name of this experiment") - ("classifier", po::value<std::string>()->required(), - "Name of the monitored classifier") - ("tm", po::value<std::string>()->required(), - "File describing the Tape Machine to use") - ("bd", po::value<std::string>()->required(), - "BD file that describes the multi-tapes buffer") - ("mcd", po::value<std::string>()->required(), - "MCD file that describes the input") - ("input,I", po::value<std::string>()->required(), - "Input file formated according to the mcd"); - - po::options_description opt("Optional"); - opt.add_options() - ("help,h", "Produce this help message") - ("debug,d", "Print infos on stderr") - ("sequenceDelimiterTape", po::value<std::string>()->default_value("EOS"), - "The name of the buffer's tape that contains the delimiter token for a sequence") - ("sequenceDelimiter", po::value<std::string>()->default_value("1"), - "The value of the token that act as a delimiter for sequences") - - ("lang", po::value<std::string>()->default_value("fr"), - "Language you are working with"); - - - desc.add(req).add(opt); - - return desc; -} - -/// @brief Store the program arguments inside a variables_map -/// -/// @param od The description of all the possible options. -/// @param argc The number of arguments given to this program. -/// @param argv The values of arguments given to this program. -/// -/// @return The variables map -po::variables_map checkOptions(po::options_description & od, int argc, char ** argv) -{ - po::variables_map vm; - - try {po::store(po::parse_command_line(argc, argv, od), vm);} - catch(std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - od.print(std::cerr); - exit(1); - } - - if (vm.count("help")) - { - std::cout << od << "\n"; - exit(0); - } - - try {po::notify(vm);} - catch(std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - od.print(std::cerr); - exit(1); - } - - return vm; -} - -/// @brief Uses a pre-trained TransitionMachine to output a pair of Config - labels. That can be used as a corpus for error detection. -/// -/// @param argc The number of arguments given to this program. -/// @param argv[] Array of arguments given to this program. -/// -/// @return 0 if there was no crash. -int main(int argc, char * argv[]) -{ - auto od = getOptionsDescription(); - - po::variables_map vm = checkOptions(od, argc, argv); - - ProgramParameters::expName = vm["expName"].as<std::string>(); - ProgramParameters::tmName = vm["tm"].as<std::string>(); - ProgramParameters::bdName = vm["bd"].as<std::string>(); - ProgramParameters::input = vm["input"].as<std::string>(); - ProgramParameters::mcdName = vm["mcd"].as<std::string>(); - ProgramParameters::debug = vm.count("debug") == 0 ? false : true; - ProgramParameters::lang = vm["lang"].as<std::string>(); - ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>(); - ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>(); - ProgramParameters::classifierName = vm["classifier"].as<std::string>(); - - const char * MACAON_DIR = std::getenv("MACAON_DIR"); - std::string slash = "/"; - ProgramParameters::expPath = MACAON_DIR + slash + ProgramParameters::lang + slash + "bin/" + ProgramParameters::expName + slash; - - ProgramParameters::tmFilename = ProgramParameters::expPath + ProgramParameters::tmName; - ProgramParameters::bdFilename = ProgramParameters::expPath + ProgramParameters::bdName; - ProgramParameters::mcdFilename = ProgramParameters::mcdName; - - TransitionMachine tm(false); - - BD bd(ProgramParameters::bdFilename, ProgramParameters::mcdFilename); - Config config(bd); - config.readInput(ProgramParameters::input); - - float entropyAccumulator = 0.0; - int nbActionsInSequence = 0; - bool justFlipped = false; - bool configIsError = false; - int actionIndex = 0; - int errorIndex = 0; - Errors errors; - errors.newSequence(); - while (!config.isFinal()) - { - TransitionMachine::State * currentState = tm.getCurrentState(); - Classifier * classifier = currentState->classifier; - config.setCurrentStateName(¤tState->name); - Dict::currentClassifierName = classifier->name; - - if (ProgramParameters::debug) - { - config.printForDebug(stderr); - fprintf(stderr, "State : \'%s\'\n", currentState->name.c_str()); - } - - auto weightedActions = classifier->weightActions(config); - - if (ProgramParameters::debug) - { - Classifier::printWeightedActions(stderr, weightedActions); - fprintf(stderr, "\n"); - } - - std::string & predictedAction = weightedActions[0].second.second; - Action * action = classifier->getAction(predictedAction); - - for(unsigned int i = 0; i < weightedActions.size(); i++) - { - predictedAction = weightedActions[i].second.second; - action = classifier->getAction(predictedAction); - - if(weightedActions[i].first) - break; - } - - if(!action->appliable(config)) - { - // First case the analysis is finished but without an empty stack - if (config.head == (int)config.tapes[0].ref.size()-1) - { - while (!config.stackEmpty()) - config.stackPop(); - continue; - } - else - { - fprintf(stderr, "ERROR (%s) : action \'%s\' is not appliable. Aborting\n", ERRINFO, predictedAction.c_str()); - exit(1); - } - } - - if (classifier->name == ProgramParameters::classifierName) - { - //fprintf(stderr, "%d\t%d\n", configIsError ? 1 : 0, errorIndex - actionIndex); - //config.printAsExample(stderr); - actionIndex++; - - auto zeroCostActions = classifier->getZeroCostActions(config); - bool pActionIsZeroCost = false; - for (auto & s : zeroCostActions) - if (s == action->name) - { - pActionIsZeroCost = true; - break; - } - - int windowSize = 5; - - if (!pActionIsZeroCost) - { - if (!configIsError || (actionIndex - errorIndex > windowSize)) - { - configIsError = true; - errorIndex = actionIndex-1; - } - } - else if (configIsError && (actionIndex - errorIndex > windowSize)) - { - configIsError = false; - errorIndex = 0; - } - - if (configIsError) - { - errors.add({action->name, zeroCostActions[0], weightedActions, classifier->getActionCost(config, action->name), ActionBank::getLinkLength(config, action->name), ActionBank::getLinkLength(config, zeroCostActions[0])}); - } - } - - - action->apply(config); - - TransitionMachine::Transition * transition = tm.getTransition(predictedAction); - tm.takeTransition(transition); - - config.moveHead(transition->headMvt); - - if (true) - { - nbActionsInSequence++; - - float entropy = Classifier::computeEntropy(weightedActions); - config.addToEntropyHistory(entropy); - entropyAccumulator += entropy; - - if (config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] != ProgramParameters::sequenceDelimiter) - justFlipped = false; - - if ((config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] == ProgramParameters::sequenceDelimiter && !justFlipped)) - { - justFlipped = true; - entropyAccumulator = 0.0; - errors.newSequence(); - configIsError = false; - errorIndex = 0; - } - } - - } - - errors.printStats(); - - return 0; -} - diff --git a/error_correction/src/macaon_train_error_detector.cpp b/error_correction/src/macaon_train_error_detector.cpp deleted file mode 100644 index 3bb665138ba11e60355dace2e15ecc29af6a231c..0000000000000000000000000000000000000000 --- a/error_correction/src/macaon_train_error_detector.cpp +++ /dev/null @@ -1,560 +0,0 @@ -/// @file macaon_train.cpp -/// @author Franck Dary -/// @version 1.0 -/// @date 2018-08-07 - -#include <cstdio> -#include <cstdlib> -#include <boost/program_options.hpp> -#include "BD.hpp" -#include "Config.hpp" -#include "TransitionMachine.hpp" -#include "Trainer.hpp" -#include "ProgramParameters.hpp" - -namespace po = boost::program_options; - -/// @brief Get the list of mandatory and optional program arguments. -/// -/// @return The lists. -po::options_description getOptionsDescription() -{ - po::options_description desc("Command-Line Arguments "); - - po::options_description req("Required"); - req.add_options() - ("expName", po::value<std::string>()->required(), - "Name of this experiment") - ("templateName", po::value<std::string>()->required(), - "Name of the template folder") - ("tm", po::value<std::string>()->required(), - "File describing the Tape Machine we will train") - ("bd", po::value<std::string>()->required(), - "BD file that describes the multi-tapes buffer") - ("mcd", po::value<std::string>()->required(), - "MCD file that describes the input") - ("train,T", po::value<std::string>()->required(), - "Training corpus formated according to the MCD") - ("dev", po::value<std::string>()->default_value(""), - "Development corpus formated according to the MCD"); - - po::options_description opt("Optional"); - opt.add_options() - ("help,h", "Produce this help message") - ("debug,d", "Print infos on stderr") - ("printEntropy", "Print mean entropy and standard deviation accross sequences") - ("optimizer", po::value<std::string>()->default_value("amsgrad"), - "The learning algorithm to use : amsgrad | adam | sgd") - ("loss", po::value<std::string>()->default_value("neglogsoftmax"), - "The loss function to use : neglogsoftmax | weighted") - ("lang", po::value<std::string>()->default_value("fr"), - "Language you are working with") - ("nbiter,n", po::value<int>()->default_value(5), - "Number of training epochs (iterations)") - ("iterationSize", po::value<int>()->default_value(-1), - "The number of examples for each iteration. -1 means the whole training set") - ("lr", po::value<float>()->default_value(0.001), - "Learning rate of the optimizer") - ("seed,s", po::value<int>()->default_value(100), - "The random seed that will initialize RNG") - ("batchSize", po::value<int>()->default_value(50), - "The size of each minibatch (in number of taining examples)") - ("nbTrain", po::value<int>()->default_value(0), - "The number of models that will be trained, with only the random seed changing") - ("duplicates", po::value<bool>()->default_value(true), - "Remove identical training examples") - ("showFeatureRepresentation", po::value<int>()->default_value(0), - "For each state of the Config, show its feature representation") - ("interactive", po::value<bool>()->default_value(true), - "Is the shell interactive ? Display advancement informations") - ("randomEmbeddings", po::value<bool>()->default_value(false), - "When activated, the embeddings will be randomly initialized") - ("randomParameters", po::value<bool>()->default_value(true), - "When activated, the parameters will be randomly initialized") - ("sequenceDelimiterTape", po::value<std::string>()->default_value("EOS"), - "The name of the buffer's tape that contains the delimiter token for a sequence") - ("sequenceDelimiter", po::value<std::string>()->default_value("1"), - "The value of the token that act as a delimiter for sequences") - ("printTime", "Print time on stderr") - ("shuffle", po::value<bool>()->default_value(true), - "Shuffle examples after each iteration"); - - po::options_description oracle("Oracle related options"); - oracle.add_options() - ("epochd", po::value<int>()->default_value(3), - "Number of the first epoch where the oracle will be dynamic") - ("proba", po::value<float>()->default_value(0.9), - "The probability that the dynamic oracle will chose the predicted action"); - - po::options_description ams("Amsgrad family optimizers"); - ams.add_options() - ("b1", po::value<float>()->default_value(0.9), - "beta1 parameter for the Amsgtad or Adam optimizer") - ("b2", po::value<float>()->default_value(0.999), - "beta2 parameter for the Amsgtad or Adam optimizer") - ("bias", po::value<float>()->default_value(1e-8), - "bias parameter for the Amsgtad or Adam or Adagrad optimizer"); - - desc.add(req).add(opt).add(oracle).add(ams); - - return desc; -} - -/// @brief Store the program arguments inside a variables_map -/// -/// @param od The description of all the possible options. -/// @param argc The number of arguments given to this program. -/// @param argv The values of arguments given to this program. -/// -/// @return The variables map -po::variables_map checkOptions(po::options_description & od, int argc, char ** argv) -{ - po::variables_map vm; - - try {po::store(po::parse_command_line(argc, argv, od), vm);} - catch(std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - od.print(std::cerr); - exit(1); - } - - if (vm.count("help")) - { - std::cout << od << "\n"; - exit(0); - } - - try {po::notify(vm);} - catch(std::exception& e) - { - std::cerr << "Error: " << e.what() << "\n"; - od.print(std::cerr); - exit(1); - } - - return vm; -} - -/// @brief Set all the usefull paths relative to expPath -void updatePaths() -{ - const char * MACAON_DIR = std::getenv("MACAON_DIR"); - std::string slash = "/"; - ProgramParameters::langPath = MACAON_DIR + slash + ProgramParameters::lang + slash; - ProgramParameters::expPath = ProgramParameters::langPath + "bin/" + ProgramParameters::expName + slash; - ProgramParameters::templatePath = ProgramParameters::langPath + ProgramParameters::templateName + slash; - ProgramParameters::tmFilename = ProgramParameters::expPath + ProgramParameters::tmName; - ProgramParameters::bdFilename = ProgramParameters::expPath + ProgramParameters::bdName; - ProgramParameters::mcdFilename = ProgramParameters::expPath + ProgramParameters::mcdName; - ProgramParameters::trainFilename = ProgramParameters::expPath + ProgramParameters::trainName; - ProgramParameters::devFilename = ProgramParameters::expPath + ProgramParameters::devName; - ProgramParameters::newTemplatePath = ProgramParameters::langPath + "bin/" + ProgramParameters::baseExpName + slash; -} - -/// @brief Create the folder containing the current experiment from the template frolder -void createExpPath() -{ -std::string decode = "\ -#! /bin/bash\n\ -\n\ -if [ \"$#\" -lt 2 ]; then\n\ - echo \"Usage : $0 input mcd\"\n\ - exit\n\ -fi\n\ -\n\ -INPUT=$1\n\ -MCD=$2\n\ -\n\ -shift\n\ -shift\n\ -ARGS=\"\"\n\ -for arg in \"$@\"\n\ -do\n\ - ARGS=\"$ARGS $arg\"\n\ -done\n\ -\n\ -macaon_decode --lang " + ProgramParameters::lang + " --tm machine.tm --bd test.bd -I $INPUT --mcd $MCD --expName " + ProgramParameters::expName + "$ARGS"; - - if (system(("rm -r " + ProgramParameters::expPath + " 2> /dev/null").c_str())){} - if (system(("mkdir " + ProgramParameters::expPath).c_str())){} - if (system(("cp -r " + ProgramParameters::newTemplatePath + "* " + ProgramParameters::expPath + ".").c_str())){} - if (system(("echo \'" + decode + "\' > " + ProgramParameters::expPath + "decode.sh").c_str())){} - if (system(("chmod +x " + ProgramParameters::expPath + "decode.sh").c_str())){} - if (system(("ln -f -s " + ProgramParameters::expPath + "decode.sh " + ProgramParameters::langPath + "bin/maca_tm_" + ProgramParameters::expName).c_str())){} -} - -std::map<std::string, std::pair<float, std::pair<float, float> > > getScoreOnDev(TransitionMachine & tm, std::vector<int> & devIsErrors, std::vector<int> &, File & dev, Config & devConfig) -{ - dev.rewind(); - FILE * devPtr = dev.getDescriptor(); - tm.reset(); - - std::map< std::string, std::pair<int, int> > counts; - - if (ProgramParameters::debug) - fprintf(stderr, "Computing score on dev set\n"); - - std::vector<int> predictions; - std::string classifierName; - - int isError, errorIndex; - - for (unsigned int i = 0; i < devIsErrors.size(); i++) - { - if (fscanf(devPtr, "%d\t%d\n", &isError, &errorIndex) != 2) - { - fprintf(stderr, "ERROR (%s) : corpus bad format. Aborting.\n", ERRINFO); - exit(1); - } - - devConfig.loadFromFile(dev); - - TransitionMachine::State * currentState = tm.getCurrentState(); - Classifier * classifier = currentState->classifier; - devConfig.setCurrentStateName(¤tState->name); - Dict::currentClassifierName = classifier->name; - classifier->initClassifier(devConfig); - - auto weightedActions = classifier->weightActions(devConfig); - std::string pAction = ""; - - for (auto & it : weightedActions) - if (it.first) - { - pAction = it.second.second; - break; - } - - predictions.emplace_back(pAction == "ERROR" ? 1 : 0); - classifierName = classifier->name; - } - - int pred1Hyp0 = 0; - int pred0Hyp1 = 0; - int pred0Hyp0 = 0; - int pred1Hyp1 = 0; - - for (unsigned int i = 0; i < devIsErrors.size(); i++) - { - if (devIsErrors[i] == 0) - { - counts[classifierName].first++; - if (predictions[i] == 0) - { - counts[classifierName].second++; - pred0Hyp0++; - } - else - pred1Hyp0++; - } - else if (i > 0 && devIsErrors[i] == 1 && devIsErrors[i-1] == 0) - { - counts[classifierName].first++; - unsigned int j; - bool found = false; - for (j = i; devIsErrors[j] == 1 && j < devIsErrors.size(); j++) - { - if (predictions[j] == 1) - { - counts[classifierName].second++; - pred1Hyp1++; - found = true; - break; - } - } - i = j; - if (!found) - pred0Hyp1++; - } - } - - int nbErrorsIntroduced = pred1Hyp0; - int nbErrorsCorrected = pred1Hyp1; - - fprintf(stderr, "\nClass 0 nbExemples : %d\n", pred0Hyp0+pred1Hyp0); - fprintf(stderr, "Class 0 precision : %.2f%%\n", 100.0*pred0Hyp0 / (pred0Hyp0+pred0Hyp1)); - fprintf(stderr, "Class 0 recall : %.2f%%\n\n", 100.0*pred0Hyp0 / (pred0Hyp0+pred1Hyp0)); - - fprintf(stderr, "Class 1 nbExemples : %d\n", pred0Hyp1+pred1Hyp1); - fprintf(stderr, "Class 1 precision : %.2f%%\n", 100.0*pred1Hyp1 / (pred1Hyp1+pred1Hyp0)); - fprintf(stderr, "Class 1 recall : %.2f%%\n\n", 100.0*pred1Hyp1 / (pred1Hyp1+pred0Hyp1)); - - fprintf(stderr, "Nb errors introduced : %d\n", nbErrorsIntroduced); - fprintf(stderr, "Nb errors corrected : %d\n", nbErrorsCorrected); - fprintf(stderr, "Difference : %d\n", nbErrorsCorrected-nbErrorsIntroduced); - - std::map<std::string, std::pair<float,std::pair<float,float> > > scores; - - for (auto & it : counts) - scores[it.first].first = 100.0*pred1Hyp1 / (pred1Hyp1+pred1Hyp0); - - return scores; -} - -void printScoresAndSave(FILE * output, std::map< std::string, std::pair<int, int> > & trainCounter, std::map< std::string, float > & scores, TransitionMachine & tm, int curIter, std::map< std::string, float > & bestScores, std::vector<int> & devIsErrors, std::vector<int> & devErrorIndexes, File & devFile, Config & config, float totalLoss) -{ - for (auto & it : trainCounter) - scores[it.first] = 100.0 * it.second.second / it.second.first; - - std::vector<std::string> names; - std::vector<std::string> acc; - std::vector<std::string> train; - std::vector<std::string> dev; - std::vector<std::string> savedStr; - - std::map<std::string, bool> saved; - - auto devScores = getScoreOnDev(tm, devIsErrors, devErrorIndexes, devFile, config); - - for (auto & it : devScores) - { - if (bestScores.count(it.first) == 0 || bestScores[it.first] < it.second.first) - { - bestScores[it.first] = it.second.first; - saved[it.first] = true; - } - else - saved[it.first] = false; - } - - auto classifiers = tm.getClassifiers(); - for (auto * cla : classifiers) - { - if (!saved.count(cla->name)) - continue; - - if (saved[cla->name]) - { - cla->save(ProgramParameters::expPath + cla->name + ".model"); - Dict::saveDicts(ProgramParameters::expPath, cla->name); - } - } - - for (auto & it : saved) - { - names.emplace_back(it.first); - acc.emplace_back("accuracy"); - train.emplace_back(": train(" + float2str(scores[it.first], "%.2f") + "%)"); - dev.emplace_back("dev(" +float2str(devScores[it.first].first, "%.2f") + "%)"); - savedStr.emplace_back(saved[it.first] ? "SAVED" : ""); - if (ProgramParameters::printEntropy) - savedStr.back() += " Entropy[" + float2str(devScores[it.first].second.first, "%.2f") + "\u00B1" + float2str(devScores[it.first].second.second, "%.2f") + "]"; - savedStr.back() += " Loss[" + float2str(totalLoss, "%.2f") + "]"; - } - - if (ProgramParameters::interactive) - fprintf(stderr, " \r"); - if (ProgramParameters::printTime) - fprintf(output, "[%s] ", getTime().c_str()); - fprintf(output, "Iteration %d/%d :\n", curIter+1, ProgramParameters::nbIter); - - printColumns(output, {names, acc, train, dev, savedStr}); -} - -/// @brief Train a model according to all the ProgramParameters -void launchTraining() -{ - std::map< std::string, float > scores; - std::map< std::string, float > bestScores; - - TransitionMachine tm(true); - - BD trainBD(ProgramParameters::bdFilename, ProgramParameters::mcdFilename); - - File train(ProgramParameters::expPath + ProgramParameters::trainName, "r"); - FILE * trainPtr = train.getDescriptor(); - File dev(ProgramParameters::expPath + ProgramParameters::devName, "r"); - FILE * devPtr = dev.getDescriptor(); - - Dict::createFiles(ProgramParameters::expPath, ""); - - fprintf(stderr, "%sTraining of \'%s\' :\n", - ProgramParameters::printTime ? ("["+getTime()+"] ").c_str() : "", - tm.name.c_str()); - - std::map< std::string, bool > topologyPrinted; - std::map< std::string, std::pair<int, int> > trainCounter; - int curIter = 0; - std::vector<int> isErrors; - std::vector<int> errorIndexes; - - std::vector<int> devIsErrors; - std::vector<int> devErrorIndexes; - - int isError; - int errorIndex; - Config config(trainBD); - fprintf(stderr, "Reading train corpus..."); - while (fscanf(trainPtr, "%d\t%d\n", &isError, &errorIndex) == 2) - { - isErrors.emplace_back(isError); - errorIndexes.emplace_back(errorIndex); - config.loadFromFile(train); - } - fprintf(stderr, " done !\n"); - fprintf(stderr, "Reading dev corpus..."); - while (fscanf(devPtr, "%d\t%d\n", &isError, &errorIndex) == 2) - { - devIsErrors.emplace_back(isError); - devErrorIndexes.emplace_back(errorIndex); - config.loadFromFile(dev); - } - fprintf(stderr, " done !\n"); - - float totalLoss = 0.0; - auto resetAndShuffle = [&trainCounter,&train,&dev,&trainPtr,&totalLoss]() - { - train.rewind(); - dev.rewind(); - trainPtr = train.getDescriptor(); - for (auto & it : trainCounter) - it.second.first = it.second.second = 0; - totalLoss = 0.0; - }; - - Config trainConfig(trainBD); - while (curIter < ProgramParameters::nbIter) - { - resetAndShuffle(); - - for (unsigned int i = 0; i < isErrors.size(); i++) - { - if (fscanf(trainPtr, "%d\t%d\n", &isError, &errorIndex) != 2) - { - fprintf(stderr, "ERROR (%s) : corpus bad format. Aborting.\n", ERRINFO); - exit(1); - } - - trainConfig.loadFromFile(train); - - TransitionMachine::State * currentState = tm.getCurrentState(); - Classifier * classifier = currentState->classifier; - trainConfig.setCurrentStateName(¤tState->name); - Dict::currentClassifierName = classifier->name; - classifier->initClassifier(trainConfig); - - if (!topologyPrinted.count(classifier->name)) - { - topologyPrinted[classifier->name] = true; - classifier->printTopology(stderr); - } - - // Print current iter advancement in percentage - if (ProgramParameters::interactive) - { - int totalSize = isErrors.size(); - int steps = i; - if (steps % 200 == 0 || totalSize-steps < 200) - fprintf(stderr, "Current Iteration : %.2f%%\r", 100.0*steps/totalSize); - } - - auto weightedActions = classifier->weightActions(trainConfig); - std::string pAction = ""; - - for (auto & it : weightedActions) - if (it.first) - if (pAction == "") - { - pAction = it.second.second; - break; - } - - std::string oAction = isError ? "ERROR" : "CORRECT"; - - totalLoss += classifier->trainOnExample(trainConfig, classifier->getActionIndex(oAction)); - - trainCounter[classifier->name].first++; - trainCounter[classifier->name].second += pAction == oAction ? 1 : 0; - } - - printScoresAndSave(stderr, trainCounter, scores, tm, curIter, bestScores, devIsErrors, devErrorIndexes, dev, config, totalLoss); - curIter++; - } -} - -void createTemplatePath() -{ - if (system(("rm -r " + ProgramParameters::newTemplatePath + " 2> /dev/null").c_str())){} - if (system(("mkdir " + ProgramParameters::newTemplatePath).c_str())){} - if (system(("cp -r " + ProgramParameters::templatePath + "* " + ProgramParameters::newTemplatePath + ".").c_str())){} -} - -void removeTemplatePath() -{ - if (system(("rm -r " + ProgramParameters::newTemplatePath + " 2> /dev/null").c_str())){} -} - -/// @brief Train a TransitionMachine to predict and add information to a structured input file, by using annotated examples. -/// -/// @param argc The number of arguments given to this program. -/// @param argv[] Array of arguments given to this program. -/// -/// @return 0 if there was no crash. -int main(int argc, char * argv[]) -{ - auto od = getOptionsDescription(); - - po::variables_map vm = checkOptions(od, argc, argv); - - ProgramParameters::expName = vm["expName"].as<std::string>(); - ProgramParameters::baseExpName = ProgramParameters::expName; - ProgramParameters::templateName = vm["templateName"].as<std::string>(); - ProgramParameters::tmName = vm["tm"].as<std::string>(); - ProgramParameters::bdName = vm["bd"].as<std::string>(); - ProgramParameters::mcdName = vm["mcd"].as<std::string>(); - ProgramParameters::debug = vm.count("debug") == 0 ? false : true; - ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? false : true; - ProgramParameters::printTime = vm.count("printTime") == 0 ? false : true; - ProgramParameters::trainName = vm["train"].as<std::string>(); - ProgramParameters::devName = vm["dev"].as<std::string>(); - ProgramParameters::lang = vm["lang"].as<std::string>(); - ProgramParameters::nbIter = vm["nbiter"].as<int>(); - ProgramParameters::seed = vm["seed"].as<int>(); - ProgramParameters::batchSize = vm["batchSize"].as<int>(); - ProgramParameters::nbTrain = vm["nbTrain"].as<int>(); - ProgramParameters::removeDuplicates = vm["duplicates"].as<bool>(); - ProgramParameters::interactive = vm["interactive"].as<bool>(); - ProgramParameters::shuffleExamples = vm["shuffle"].as<bool>(); - ProgramParameters::randomEmbeddings = vm["randomEmbeddings"].as<bool>(); - ProgramParameters::randomParameters = vm["randomParameters"].as<bool>(); - ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>(); - ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>(); - ProgramParameters::learningRate = vm["lr"].as<float>(); - ProgramParameters::beta1 = vm["b1"].as<float>(); - ProgramParameters::beta2 = vm["b2"].as<float>(); - ProgramParameters::bias = vm["bias"].as<float>(); - ProgramParameters::optimizer = vm["optimizer"].as<std::string>(); - ProgramParameters::dynamicEpoch = vm["epochd"].as<int>(); - ProgramParameters::loss = vm["loss"].as<std::string>(); - ProgramParameters::dynamicProbability = vm["proba"].as<float>(); - ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<int>(); - ProgramParameters::iterationSize = vm["iterationSize"].as<int>(); - - if (ProgramParameters::nbTrain) - { - updatePaths(); - createTemplatePath(); - for (int i = 0; i < ProgramParameters::nbTrain; i++) - { - fprintf(stderr, "Training number %d / %d :\n", i+1, ProgramParameters::nbTrain); - ProgramParameters::expName = ProgramParameters::baseExpName + "_" + std::to_string(i); - updatePaths(); - createExpPath(); - Dict::deleteDicts(); - launchTraining(); - } - removeTemplatePath(); - } - else - { - updatePaths(); - ProgramParameters::newTemplatePath = ProgramParameters::templatePath; - createExpPath(); - Dict::deleteDicts(); - launchTraining(); - } - - return 0; -} -