Commit 8f8741e9 authored by Franck Dary

Added a program to train an error detector

parent aba2b510
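For orientation, here is a hypothetical invocation of the new trainer, built from the required options it declares below; the experiment, template and corpus names are purely illustrative:

macaon_train_error_detector --expName errdetect0 --templateName error_template \
    --tm machine.tm --bd machine.bd --mcd input.mcd --train train.errors --dev dev.errors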
......@@ -5,3 +5,7 @@ target_link_libraries(macaon_error_correction transition_machine)
target_link_libraries(macaon_error_correction ${Boost_PROGRAM_OPTIONS_LIBRARY})
install(TARGETS macaon_error_correction DESTINATION bin)
add_executable(macaon_train_error_detector src/macaon_train_error_detector.cpp)
target_link_libraries(macaon_train_error_detector transition_machine)
target_link_libraries(macaon_train_error_detector ${Boost_PROGRAM_OPTIONS_LIBRARY})
install(TARGETS macaon_train_error_detector DESTINATION bin)
......@@ -39,7 +39,6 @@ po::options_description getOptionsDescription()
opt.add_options()
("help,h", "Produce this help message")
("debug,d", "Print infos on stderr")
("printEntropy", "Print entropy for each sequence")
("sequenceDelimiterTape", po::value<std::string>()->default_value("EOS"),
"The name of the buffer's tape that contains the delimiter token for a sequence")
("sequenceDelimiter", po::value<std::string>()->default_value("1"),
......@@ -108,7 +107,6 @@ int main(int argc, char * argv[])
ProgramParameters::input = vm["input"].as<std::string>();
ProgramParameters::mcdName = vm["mcd"].as<std::string>();
ProgramParameters::debug = vm.count("debug") == 0 ? false : true;
ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? false : true;
ProgramParameters::lang = vm["lang"].as<std::string>();
ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>();
ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>();
......@@ -223,11 +221,13 @@ int main(int argc, char * argv[])
config.moveHead(transition->headMvt);
if (ProgramParameters::printEntropy)
if (true)
{
nbActionsInSequence++;
entropyAccumulator += Classifier::computeEntropy(weightedActions);
float entropy = Classifier::computeEntropy(weightedActions);
config.addToEntropyHistory(entropy);
entropyAccumulator += entropy;
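// The per-decision entropy is also recorded in the Config (entropyHistory), so that printAsExample can serialize it for the error-detector training corpus.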
if (config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] != ProgramParameters::sequenceDelimiter)
justFlipped = false;
......@@ -235,9 +235,6 @@ int main(int argc, char * argv[])
if ((config.head >= 1 && config.getTape(ProgramParameters::sequenceDelimiterTape)[config.head-1] == ProgramParameters::sequenceDelimiter && !justFlipped))
{
justFlipped = true;
entropyAccumulator /= nbActionsInSequence;
nbActionsInSequence = 0;
fprintf(stderr, "Entropy : %.2f\n", entropyAccumulator);
entropyAccumulator = 0.0;
}
}
......
/// @file macaon_train_error_detector.cpp
/// @author Franck Dary
/// @version 1.0
/// @date 2018-08-07
#include <cstdio>
#include <cstdlib>
#include <boost/program_options.hpp>
#include "BD.hpp"
#include "Config.hpp"
#include "TransitionMachine.hpp"
#include "Trainer.hpp"
#include "ProgramParameters.hpp"
namespace po = boost::program_options;
/// @brief Get the list of mandatory and optional program arguments.
///
/// @return The lists.
po::options_description getOptionsDescription()
{
po::options_description desc("Command-Line Arguments ");
po::options_description req("Required");
req.add_options()
("expName", po::value<std::string>()->required(),
"Name of this experiment")
("templateName", po::value<std::string>()->required(),
"Name of the template folder")
("tm", po::value<std::string>()->required(),
"File describing the Tape Machine we will train")
("bd", po::value<std::string>()->required(),
"BD file that describes the multi-tapes buffer")
("mcd", po::value<std::string>()->required(),
"MCD file that describes the input")
("train,T", po::value<std::string>()->required(),
"Training corpus formated according to the MCD")
("dev", po::value<std::string>()->default_value(""),
"Development corpus formated according to the MCD");
po::options_description opt("Optional");
opt.add_options()
("help,h", "Produce this help message")
("debug,d", "Print infos on stderr")
("printEntropy", "Print mean entropy and standard deviation accross sequences")
("optimizer", po::value<std::string>()->default_value("amsgrad"),
"The learning algorithm to use : amsgrad | adam | sgd")
("lang", po::value<std::string>()->default_value("fr"),
"Language you are working with")
("nbiter,n", po::value<int>()->default_value(5),
"Number of training epochs (iterations)")
("iterationSize", po::value<int>()->default_value(-1),
"The number of examples for each iteration. -1 means the whole training set")
("lr", po::value<float>()->default_value(0.001),
"Learning rate of the optimizer")
("seed,s", po::value<int>()->default_value(100),
"The random seed that will initialize RNG")
("nbTrain", po::value<int>()->default_value(0),
"The number of models that will be trained, with only the random seed changing")
("duplicates", po::value<bool>()->default_value(true),
"Remove identical training examples")
("showFeatureRepresentation", po::value<bool>()->default_value(false),
"For each state of the Config, show its feature representation")
("interactive", po::value<bool>()->default_value(true),
"Is the shell interactive ? Display advancement informations")
("randomEmbeddings", po::value<bool>()->default_value(false),
"When activated, the embeddings will be randomly initialized")
("sequenceDelimiterTape", po::value<std::string>()->default_value("EOS"),
"The name of the buffer's tape that contains the delimiter token for a sequence")
("sequenceDelimiter", po::value<std::string>()->default_value("1"),
"The value of the token that act as a delimiter for sequences")
("printTime", "Print time on stderr")
("shuffle", po::value<bool>()->default_value(true),
"Shuffle examples after each iteration");
po::options_description oracle("Oracle related options");
oracle.add_options()
("epochd", po::value<int>()->default_value(3),
"Number of the first epoch where the oracle will be dynamic")
("proba", po::value<float>()->default_value(0.9),
"The probability that the dynamic oracle will chose the predicted action");
po::options_description ams("Amsgrad family optimizers");
ams.add_options()
("b1", po::value<float>()->default_value(0.9),
"beta1 parameter for the Amsgtad or Adam optimizer")
("b2", po::value<float>()->default_value(0.999),
"beta2 parameter for the Amsgtad or Adam optimizer")
("bias", po::value<float>()->default_value(1e-8),
"bias parameter for the Amsgtad or Adam or Adagrad optimizer");
desc.add(req).add(opt).add(oracle).add(ams);
return desc;
}
/// @brief Store the program arguments inside a variables_map
///
/// @param od The description of all the possible options.
/// @param argc The number of arguments given to this program.
/// @param argv The values of arguments given to this program.
///
/// @return The variables map
po::variables_map checkOptions(po::options_description & od, int argc, char ** argv)
{
po::variables_map vm;
try {po::store(po::parse_command_line(argc, argv, od), vm);}
catch(std::exception& e)
{
std::cerr << "Error: " << e.what() << "\n";
od.print(std::cerr);
exit(1);
}
if (vm.count("help"))
{
std::cout << od << "\n";
exit(0);
}
try {po::notify(vm);}
catch(std::exception& e)
{
std::cerr << "Error: " << e.what() << "\n";
od.print(std::cerr);
exit(1);
}
return vm;
}
/// @brief Set all the useful paths relative to expPath
void updatePaths()
{
const char * MACAON_DIR = std::getenv("MACAON_DIR");
std::string slash = "/";
ProgramParameters::langPath = MACAON_DIR + slash + ProgramParameters::lang + slash;
ProgramParameters::expPath = ProgramParameters::langPath + "bin/" + ProgramParameters::expName + slash;
ProgramParameters::templatePath = ProgramParameters::langPath + ProgramParameters::templateName + slash;
ProgramParameters::tmFilename = ProgramParameters::expPath + ProgramParameters::tmName;
ProgramParameters::bdFilename = ProgramParameters::expPath + ProgramParameters::bdName;
ProgramParameters::mcdFilename = ProgramParameters::expPath + ProgramParameters::mcdName;
ProgramParameters::trainFilename = ProgramParameters::expPath + ProgramParameters::trainName;
ProgramParameters::devFilename = ProgramParameters::expPath + ProgramParameters::devName;
ProgramParameters::newTemplatePath = ProgramParameters::langPath + "bin/" + ProgramParameters::baseExpName + slash;
}
/// @brief Create the folder containing the current experiment from the template folder
void createExpPath()
{
std::string decode = "\
#! /bin/bash\n\
\n\
if [ \"$#\" -lt 2 ]; then\n\
echo \"Usage : $0 input mcd\"\n\
exit\n\
fi\n\
\n\
INPUT=$1\n\
MCD=$2\n\
\n\
shift\n\
shift\n\
ARGS=\"\"\n\
for arg in \"$@\"\n\
do\n\
ARGS=\"$ARGS $arg\"\n\
done\n\
\n\
macaon_decode --lang " + ProgramParameters::lang + " --tm machine.tm --bd test.bd -I $INPUT --mcd $MCD --expName " + ProgramParameters::expName + "$ARGS";
if (system(("rm -r " + ProgramParameters::expPath + " 2> /dev/null").c_str())){}
if (system(("mkdir " + ProgramParameters::expPath).c_str())){}
if (system(("cp -r " + ProgramParameters::newTemplatePath + "* " + ProgramParameters::expPath + ".").c_str())){}
if (system(("echo \'" + decode + "\' > " + ProgramParameters::expPath + "decode.sh").c_str())){}
if (system(("chmod +x " + ProgramParameters::expPath + "decode.sh").c_str())){}
if (system(("ln -f -s " + ProgramParameters::expPath + "decode.sh " + ProgramParameters::langPath + "bin/maca_tm_" + ProgramParameters::expName).c_str())){}
}
std::map<std::string, std::pair<float, std::pair<float, float> > > getScoreOnDev(TransitionMachine & tm, std::vector<Config> devConfigs, std::vector<int> & devIsErrors, std::vector<int> &)
{
tm.reset();
std::map< std::string, std::pair<int, int> > counts;
if (ProgramParameters::debug)
fprintf(stderr, "Computing score on dev set\n");
std::vector<int> predictions;
std::string classifierName;
for (unsigned int i = 0; i < devConfigs.size(); i++)
{
auto & devConfig = devConfigs[i];
TransitionMachine::State * currentState = tm.getCurrentState();
Classifier * classifier = currentState->classifier;
devConfig.setCurrentStateName(&currentState->name);
Dict::currentClassifierName = classifier->name;
classifier->initClassifier(devConfig);
auto weightedActions = classifier->weightActions(devConfig);
std::string pAction = "";
for (auto & it : weightedActions)
if (it.first)
{
pAction = it.second.second;
break;
}
predictions.emplace_back(pAction == "ERROR" ? 1 : 0);
classifierName = classifier->name;
}
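// Scoring: each non-error example counts individually (credited if CORRECT was predicted); each contiguous run of error examples counts once, and is credited if ERROR was predicted anywhere in the run. A run starting at the very first example is not counted.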
for (unsigned int i = 0; i < devIsErrors.size(); i++)
{
if (devIsErrors[i] == 0)
{
counts[classifierName].first++;
if (predictions[i] == 0)
counts[classifierName].second++;
}
else if (i > 0 && devIsErrors[i] == 1 && devIsErrors[i-1] == 0)
{
counts[classifierName].first++;
unsigned int j;
for (j = i; j < devIsErrors.size() && devIsErrors[j] == 1; j++)
{
if (predictions[j] == 1)
{
counts[classifierName].second++;
break;
}
}
i = j;
}
}
std::map<std::string, std::pair<float,std::pair<float,float> > > scores;
for (auto & it : counts)
scores[it.first].first = 100.0 * it.second.second / it.second.first;
return scores;
}
void printScoresAndSave(FILE * output, std::map< std::string, std::pair<int, int> > & trainCounter, std::map< std::string, float > & scores, TransitionMachine & tm, int curIter, std::map< std::string, float > & bestScores, std::vector<Config> & devConfigs, std::vector<int> & devIsErrors, std::vector<int> & devErrorIndexes)
{
for (auto & it : trainCounter)
scores[it.first] = 100.0 * it.second.second / it.second.first;
std::vector<std::string> names;
std::vector<std::string> acc;
std::vector<std::string> train;
std::vector<std::string> dev;
std::vector<std::string> savedStr;
std::map<std::string, bool> saved;
auto devScores = getScoreOnDev(tm, devConfigs, devIsErrors, devErrorIndexes);
if (true)
{
for (auto & it : devScores)
{
if (bestScores.count(it.first) == 0 || bestScores[it.first] < it.second.first)
{
bestScores[it.first] = it.second.first;
saved[it.first] = true;
}
else
saved[it.first] = false;
}
}
auto classifiers = tm.getClassifiers();
for (auto * cla : classifiers)
{
if (!saved.count(cla->name))
continue;
if (saved[cla->name])
{
cla->save(ProgramParameters::expPath + cla->name + ".model");
Dict::saveDicts(ProgramParameters::expPath, cla->name);
}
}
for (auto & it : saved)
{
names.emplace_back(it.first);
acc.emplace_back("accuracy");
train.emplace_back(": train(" + float2str(scores[it.first], "%.2f") + "%)");
dev.emplace_back("dev(" +float2str(devScores[it.first].first, "%.2f") + "%)");
savedStr.emplace_back(saved[it.first] ? "SAVED" : "");
if (ProgramParameters::printEntropy)
savedStr.back() += " Entropy[" + float2str(devScores[it.first].second.first, "%.2f") + "\u00B1" + float2str(devScores[it.first].second.second, "%.2f") + "]";
}
if (ProgramParameters::interactive)
fprintf(stderr, " \r");
if (ProgramParameters::printTime)
fprintf(output, "[%s] ", getTime().c_str());
fprintf(output, "Iteration %d/%d :\n", curIter+1, ProgramParameters::nbIter);
printColumns(output, {names, acc, train, dev, savedStr});
}
/// @brief Train a model according to all the ProgramParameters
void launchTraining()
{
std::map< std::string, float > scores;
std::map< std::string, float > bestScores;
TransitionMachine tm(true);
BD trainBD(ProgramParameters::bdFilename, ProgramParameters::mcdFilename);
File train(ProgramParameters::expPath + ProgramParameters::trainName, "r");
FILE * trainPtr = train.getDescriptor();
File dev(ProgramParameters::expPath + ProgramParameters::devName, "r");
FILE * devPtr = dev.getDescriptor();
Dict::createFiles(ProgramParameters::expPath, "");
fprintf(stderr, "%sTraining of \'%s\' :\n",
ProgramParameters::printTime ? ("["+getTime()+"] ").c_str() : "",
tm.name.c_str());
std::map< std::string, bool > topologyPrinted;
std::map< std::string, std::pair<int, int> > trainCounter;
int curIter = 0;
std::vector<Config> configs;
std::vector<int> isErrors;
std::vector<int> errorIndexes;
std::vector<Config> devConfigs;
std::vector<int> devIsErrors;
std::vector<int> devErrorIndexes;
int isError;
int errorIndex;
fprintf(stderr, "Reading train corpus...");
while (fscanf(trainPtr, "%d\t%d\n", &isError, &errorIndex) == 2)
{
configs.emplace_back(trainBD);
isErrors.emplace_back(isError);
errorIndexes.emplace_back(errorIndex);
configs.back().loadFromFile(train);
}
fprintf(stderr, " done !\n");
fprintf(stderr, "Reading dev corpus...");
while (fscanf(devPtr, "%d\t%d\n", &isError, &errorIndex) == 2)
{
devConfigs.emplace_back(trainBD);
devIsErrors.emplace_back(isError);
devErrorIndexes.emplace_back(errorIndex);
devConfigs.back().loadFromFile(dev);
}
fprintf(stderr, " done !\n");
auto resetAndShuffle = [&configs,&trainCounter]()
{
//TODO shuffle
/*
if(ProgramParameters::shuffleExamples)
std::random_shuffle(configs.begin(), configs.end());
*/
for (auto & it : trainCounter)
it.second.first = it.second.second = 0;
};
while (curIter < ProgramParameters::nbIter)
{
resetAndShuffle();
for (unsigned int i = 0; i < configs.size(); i++)
{
auto & trainConfig = configs[i];
isError = isErrors[i];
errorIndex = errorIndexes[i];
TransitionMachine::State * currentState = tm.getCurrentState();
Classifier * classifier = currentState->classifier;
trainConfig.setCurrentStateName(&currentState->name);
Dict::currentClassifierName = classifier->name;
classifier->initClassifier(trainConfig);
if (!topologyPrinted.count(classifier->name))
{
topologyPrinted[classifier->name] = true;
classifier->printTopology(stderr);
}
// Print current iter advancement in percentage
if (ProgramParameters::interactive)
{
int totalSize = configs.size();
int steps = i;
if (steps % 200 == 0 || totalSize-steps < 200)
fprintf(stderr, "Current Iteration : %.2f%%\r", 100.0*steps/totalSize);
}
auto weightedActions = classifier->weightActions(trainConfig);
std::string pAction = "";
for (auto & it : weightedActions)
if (it.first)
if (pAction == "")
{
pAction = it.second.second;
break;
}
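// The gold label comes from the corpus annotation: ERROR if the corpus marks this Config as erroneous, CORRECT otherwise.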
std::string oAction = isError ? "ERROR" : "CORRECT";
classifier->trainOnExample(trainConfig, classifier->getActionIndex(oAction));
trainCounter[classifier->name].first++;
trainCounter[classifier->name].second += pAction == oAction ? 1 : 0;
}
printScoresAndSave(stderr, trainCounter, scores, tm, curIter, bestScores, devConfigs, devIsErrors, devErrorIndexes);
curIter++;
}
}
void createTemplatePath()
{
if (system(("rm -r " + ProgramParameters::newTemplatePath + " 2> /dev/null").c_str())){}
if (system(("mkdir " + ProgramParameters::newTemplatePath).c_str())){}
if (system(("cp -r " + ProgramParameters::templatePath + "* " + ProgramParameters::newTemplatePath + ".").c_str())){}
}
void removeTemplatePath()
{
if (system(("rm -r " + ProgramParameters::newTemplatePath + " 2> /dev/null").c_str())){}
}
/// @brief Train a TransitionMachine to detect errors, by using annotated example Configs.
///
/// @param argc The number of arguments given to this program.
/// @param argv[] Array of arguments given to this program.
///
/// @return 0 if there was no crash.
int main(int argc, char * argv[])
{
auto od = getOptionsDescription();
po::variables_map vm = checkOptions(od, argc, argv);
ProgramParameters::expName = vm["expName"].as<std::string>();
ProgramParameters::baseExpName = ProgramParameters::expName;
ProgramParameters::templateName = vm["templateName"].as<std::string>();
ProgramParameters::tmName = vm["tm"].as<std::string>();
ProgramParameters::bdName = vm["bd"].as<std::string>();
ProgramParameters::mcdName = vm["mcd"].as<std::string>();
ProgramParameters::debug = vm.count("debug") == 0 ? false : true;
ProgramParameters::printEntropy = vm.count("printEntropy") == 0 ? false : true;
ProgramParameters::printTime = vm.count("printTime") == 0 ? false : true;
ProgramParameters::trainName = vm["train"].as<std::string>();
ProgramParameters::devName = vm["dev"].as<std::string>();
ProgramParameters::lang = vm["lang"].as<std::string>();
ProgramParameters::nbIter = vm["nbiter"].as<int>();
ProgramParameters::seed = vm["seed"].as<int>();
ProgramParameters::nbTrain = vm["nbTrain"].as<int>();
ProgramParameters::removeDuplicates = vm["duplicates"].as<bool>();
ProgramParameters::interactive = vm["interactive"].as<bool>();
ProgramParameters::shuffleExamples = vm["shuffle"].as<bool>();
ProgramParameters::randomEmbeddings = vm["randomEmbeddings"].as<bool>();
ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>();
ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>();
ProgramParameters::learningRate = vm["lr"].as<float>();
ProgramParameters::beta1 = vm["b1"].as<float>();
ProgramParameters::beta2 = vm["b2"].as<float>();
ProgramParameters::bias = vm["bias"].as<float>();
ProgramParameters::optimizer = vm["optimizer"].as<std::string>();
ProgramParameters::dynamicEpoch = vm["epochd"].as<int>();
ProgramParameters::dynamicProbability = vm["proba"].as<float>();
ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<bool>();
ProgramParameters::iterationSize = vm["iterationSize"].as<int>();
if (ProgramParameters::nbTrain)
{
updatePaths();
createTemplatePath();
for (int i = 0; i < ProgramParameters::nbTrain; i++)
{
fprintf(stderr, "Training number %d / %d :\n", i+1, ProgramParameters::nbTrain);
ProgramParameters::expName = ProgramParameters::baseExpName + "_" + std::to_string(i);
updatePaths();
createExpPath();
Dict::deleteDicts();
launchTraining();
}
removeTemplatePath();
}
else
{
updatePaths();
ProgramParameters::newTemplatePath = ProgramParameters::templatePath;
createExpPath();
Dict::deleteDicts();
launchTraining();
}
return 0;
}
......@@ -343,10 +343,26 @@ std::string float2str(float f, const char * format)
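// isNum now also accepts floating-point literals (optional leading sign, at most one decimal point), e.g. the entropy values serialized by printAsExample.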
bool isNum(const std::string & s)
{
bool digitHappened = false;
bool dotHappened = false;
for (unsigned int i = 0; i < s.size(); i++)
{
if (s[i] == '.')
{
if (dotHappened || !digitHappened)
return false;
dotHappened = true;
continue;
}
if ((i == 0 && s[i] != '+' && s[i] != '-' && !isNum(s[i])) || (i != 0 && !isNum(s[i])))
return false;
if (isNum(s[i]))
digitHappened = true;
}
return true;
}
......
......@@ -8,6 +8,7 @@
#include <vector>
#include "BD.hpp"
#include "File.hpp"
/// @brief Configuration of a TransitionMachine.
/// It consists of a multi-tapes buffer, a stack and a head.
......@@ -44,6 +45,8 @@ class Config
std::string * currentStateName;
/// @brief For each state of the TransitionMachine, a history of the Actions that have been applied to this Config.
std::map< std::string, std::vector<std::string> > actionHistory;
/// @brief For each state of the TransitionMachine, a history of the entropy of each past decision.
std::map< std::string, std::vector<float> > entropyHistory;
/// @brief A stack that can contain indexes of the multi-tapes buffer.
std::vector<int> stack;
/// @brief The latest popped element from the stack
......@@ -156,6 +159,15 @@ class Config
///
/// @return The number of elements in the stack.
int stackSize();
/// @brief Load a Config to match the one that has been written to file,
/// formatted by printAsExample.
///
/// @param file The File to read from.
void loadFromFile(File & file);
/// @brief Add the entropy to the entropyHistory.
///
/// @param entropy The entropy value.
void addToEntropyHistory(float entropy);
};
#endif
......@@ -139,6 +139,32 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
else if(std::string(b1) == "NOTHING")
{
}
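// ERROR and CORRECT are the two labels predicted by the error detector; their apply/undo only emit a trace and leave the Config unchanged.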
else if(std::string(b1) == "ERROR")
{
auto apply = [](Config &, Action::BasicAction &)
{fprintf(stderr, "ERROR\n");};
auto undo = [](Config &, Action::BasicAction &)
{};
auto appliable = [](Config &, Action::BasicAction &)
{return true;};
Action::BasicAction basicAction =
{Action::BasicAction::Type::Push, "", apply, undo, appliable};
sequence.emplace_back(basicAction);
}
else if(std::string(b1) == "CORRECT")
{
auto apply = [](Config &, Action::BasicAction &)
{fprintf(stderr, "CORRECT\n");};
auto undo = [](Config &, Action::BasicAction &)
{};
auto appliable = [](Config &, Action::BasicAction &)
{return true;};
Action::BasicAction basicAction =
{Action::BasicAction::Type::Push, "", apply, undo, appliable};
sequence.emplace_back(basicAction);
}
else if(std::string(b1) == "SHIFT")
{
auto apply = [](Config & c, Action::BasicAction &)
......
......@@ -143,13 +143,31 @@ void Config::printForDebug(FILE * output)
void Config::printAsExample(FILE * output)
{
int window = 5;
int window = 100;
int historyWindow = 10;
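// Only the tail of each history is serialized: at most the last historyWindow+1 entries per state.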
fprintf(output, "head=%d\n", head);
fprintf(output, "stack=");
for (unsigned int i = 0; i < stack.size(); i++)
fprintf(output, "%d%s", stack[i], i == stack.size()-1 ? "" : ",");
fprintf(output, "\n");
fprintf(output, "stackHistory=%d\n", stackHistory);
fprintf(output, "actionsHistory=\n");
for (auto & history : actionHistory)
{
fprintf(output, "%s=", history.first.c_str());
for (int i = history.second.size()-1-std::min(((int)history.second.size())-1, historyWindow); i < (int)history.second.size(); i++)
fprintf(output, "%s%s", history.second[i].c_str(), i == (int)history.second.size()-1 ? "" : ",");
fprintf(output, "\n");
}
fprintf(output, "entropyHistory=\n");
for (auto & history : entropyHistory)
{
fprintf(output, "%s=", history.first.c_str());
for (int i = history.second.size()-1-std::min(((int)history.second.size())-1, historyWindow); i < (int)history.second.size(); i++)
fprintf(output, "%f%s", history.second[i], i == (int)history.second.size()-1 ? "" : ",");
fprintf(output, "\n");
}
fprintf(stderr, "-----\n");
for (auto & tape : tapes)
{
fprintf(output, "%s\t", tape.name.c_str());
......@@ -157,6 +175,7 @@ void Config::printAsExample(FILE * output)
fprintf(output, "%d=%s\t", i, tape[i].c_str());
fprintf(output, "\n");
}
fprintf(stderr, "-----\n");
}
void Config::printAsOutput(FILE * output)
......@@ -333,3 +352,136 @@ int Config::stackSize()
return stack.size();
}
void Config::loadFromFile(File & file)
{
static auto errorAndExit = [](const char * errinfo, const char * context)
{
fprintf(stderr, "ERROR (%s) : expected \'%s\'. Aborting.\n", errinfo, context);
exit(1);
};
tapes.clear();
stackHistory = -1;
stack.clear();
actionHistory.clear();
entropyHistory.clear();
head = -1;
FILE * filePtr = file.getDescriptor();
char buffer[100000];
char buffer2[100000];
int number;
int firstIndex = -1;
if (fscanf(filePtr, "head=%d\n", &number) != 1)
errorAndExit(ERRINFO, "head=X");
head = number;
if (fscanf(filePtr, "stack%[^\n]\n", buffer) != 1)
errorAndExit(ERRINFO, "stack=x,x,x,...");
auto splitted = split(buffer+1, ',');
for (auto & s : splitted)
{
if (!isNum(s))
{
fprintf(stderr, "<%s>\n", s.c_str());
errorAndExit(ERRINFO, "number instead of string");
}
stack.emplace_back(std::stoi(s));
}
if (fscanf(filePtr, "stackHistory=%d\n", &number) != 1)
errorAndExit(ERRINFO, "stackHistory=X");
stackHistory = number;
if (fscanf(filePtr, "actionsHistory%s\n", buffer) != 1)
errorAndExit(ERRINFO, "actionsHistory=");
while (fscanf(filePtr, "entropyHistory%[^\n]\n", buffer) != 1)
{
buffer2[0] = '\0';
if (fscanf(filePtr, "%[^=]%[^\n]\n", buffer, buffer2) < 1)
errorAndExit(ERRINFO, "stateName=h1,h2,...");
if (strlen(buffer2) == 0 || buffer2[0] != '=')
errorAndExit(ERRINFO, "stateName=h1,h2,...");
auto history = split(buffer2+1, ',');
for (auto & s : history)
actionHistory[buffer].emplace_back(s);
}
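// Read "stateName=e1,e2,..." lines into entropyHistory until a "-----" separator line is reached.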
while (fscanf(filePtr, "----%s\n", buffer) != 1)
{
if (fscanf(filePtr, "%[^=]%[^\n]\n", buffer, buffer2) < 1)
errorAndExit(ERRINFO, "stateName=e1,e2,...");
if (strlen(buffer2) == 0 || buffer2[0] != '=')
errorAndExit(ERRINFO, "stateName=h1,h2,...");
auto history = split(buffer2+1, ',');
for (auto & s : history)
{
if (!isNum(s))
errorAndExit(ERRINFO, "number instead of string");
entropyHistory[buffer].emplace_back(std::stof(s));
}
}
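// Read "TAPENAME<TAB>index=value..." lines until the closing "-----" separator; cell indexes (and head/stack below) are rebased so that the first index read becomes 0.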
while (fscanf(filePtr, "----%s\n", buffer) != 1)
{
if (fscanf(filePtr, "%[^\n]\n", buffer) != 1)
{
fprintf(stderr, "<%s>\n", buffer);
errorAndExit(ERRINFO, "TAPENAME\txx=yy\trr=zz...");
}
auto splited = split(buffer, '\t');
if (splited.size() < 2)
errorAndExit(ERRINFO, "TAPENAME\txx=yy\trr=zz...");
tapes.emplace_back();
tapes.back().name = splited[0];
tapes.back().isKnown = false;
tapes.back().ref.resize(splited.size()-1);
for (unsigned int i = 1; i < splited.size(); i++)
{
auto parts = split(splited[i], '=');
std::string indexStr = parts[0];
std::string value;
for (unsigned int j = 1; j < parts.size(); j++)
value += parts[j]+(j == parts.size()-1 ? std::string("") : std::string("="));
if (!isNum(indexStr))
errorAndExit(ERRINFO, "number instead of string");
int index = std::stoi(indexStr);
if (firstIndex == -1)
firstIndex = index;
index -= firstIndex;
tapes.back().ref[index] = value;
}
tapes.back().hyp = tapes.back().ref;
}
head -= firstIndex;
for (auto & s : stack)
{
s -= firstIndex;
if (s < 0)
fprintf(stderr, "WARNING (%s) : stack element \'%d\' is negative, window was too small when creating the error corpus\n", ERRINFO, s);
}
if (stackHistory > 0)
stackHistory -= firstIndex;
}
void Config::addToEntropyHistory(float entropy)
{
entropyHistory[*currentStateName].emplace_back(entropy);
}
......@@ -76,6 +76,25 @@ void Oracle::createDatabase()
return;
isInit = true;
str2oracle.emplace("null", std::unique_ptr<Oracle>(new Oracle(
[](Oracle *)
{
},
[](Config &, Oracle *)
{
fprintf(stderr, "ERROR (%s) : getAction called on null Oracle. Aborting.\n", ERRINFO);
exit(1);
return std::string("");
},
[](Config &, Oracle *, const std::string &)
{
fprintf(stderr, "ERROR (%s) : getAction called on null Oracle. Aborting.\n", ERRINFO);
exit(1);
return false;
})));
str2oracle.emplace("tagger", std::unique_ptr<Oracle>(new Oracle(
[](Oracle *)
{
......