diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp index b06daebc76d4febae6e436f6032a336b43115fd4..f299d70d71dd0a44ad65d5d710f0df996b65460e 100644 --- a/decoder/src/Decoder.cpp +++ b/decoder/src/Decoder.cpp @@ -182,16 +182,20 @@ struct BeamNode BeamNode(TransitionMachine & tm, Config & config) : tm(tm), config(config) { totalEntropy = 0.0; + config.setOutputFile(nullptr); } BeamNode(BeamNode & other, Action * action, float proba) : tm(other.tm), config(other.config) { totalEntropy = other.totalEntropy + proba; this->action = action; + config.setOutputFile(nullptr); } }; void Decoder::decode() { + config.reset(); + if (ProgramParameters::beamSize > 1) decodeBeam(); else @@ -209,6 +213,9 @@ void Decoder::decodeNoBeam() int nbActionsCutoff = 200; float currentSpeed = 0.0; auto pastTime = std::chrono::high_resolution_clock::now(); + FILE * outputFile = stdout; + + config.setOutputFile(outputFile); while (!config.isFinal()) { @@ -242,8 +249,6 @@ void Decoder::decodeNoBeam() if (ProgramParameters::errorAnalysis) errors.printStats(); - else - config.printAsOutput(stdout); fprintf(stderr, " \n"); } @@ -260,6 +265,8 @@ void Decoder::decodeBeam() float currentSpeed = 0.0; auto pastTime = std::chrono::high_resolution_clock::now(); + FILE * outputFile = stdout; + std::vector< std::shared_ptr<BeamNode> > beam; std::vector< std::shared_ptr<BeamNode> > otherBeam; beam.emplace_back(new BeamNode(tm, config)); @@ -311,6 +318,7 @@ void Decoder::decodeBeam() beam = otherBeam; sortBeam(); beam.resize(std::min((int)beam.size(), ProgramParameters::beamSize)); + beam[0]->config.setOutputFile(outputFile); for (auto & node : beam) { @@ -341,8 +349,6 @@ void Decoder::decodeBeam() if (ProgramParameters::errorAnalysis) errors.printStats(); - else - beam[0]->config.printAsOutput(stdout); fprintf(stderr, " \n"); } diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp index 4740dc98670d522f9a22365878642edf551dc893..7b9a73ccf4552052440ee99af1d19904f2c16d5a 100644 --- a/decoder/src/macaon_decode.cpp +++ b/decoder/src/macaon_decode.cpp @@ -47,10 +47,10 @@ po::options_description getOptionsDescription() "The name of the buffer's tape that contains the delimiter token for a sequence") ("sequenceDelimiter", po::value<std::string>()->default_value("1"), "The value of the token that act as a delimiter for sequences") - ("tapeSize", po::value<int>()->default_value(100000), - "Number of lines in the input file.") ("showFeatureRepresentation", po::value<int>()->default_value(0), "For each state of the Config, show its feature representation") + ("readSize", po::value<int>()->default_value(0), + "The number of lines of input that will be read and stored in memory at once.") ("interactive", po::value<bool>()->default_value(true), "Is the shell interactive ? Display advancement informations") ("lang", po::value<std::string>()->default_value("fr"), @@ -143,7 +143,10 @@ int main(int argc, char * argv[]) ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>(); ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>(); ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<int>(); - ProgramParameters::tapeSize = vm["tapeSize"].as<int>(); + ProgramParameters::tapeSize = getNbLines(ProgramParameters::input); + ProgramParameters::readSize = vm["readSize"].as<int>(); + if (ProgramParameters::readSize == 0) + ProgramParameters::readSize = ProgramParameters::tapeSize; ProgramParameters::beamSize = vm["beamSize"].as<int>(); ProgramParameters::nbChilds = vm["nbChilds"].as<int>(); ProgramParameters::optimizer = "none"; diff --git a/maca_common/include/LimitedArray.hpp b/maca_common/include/LimitedArray.hpp index f7176f6ba6b25613c83d5cfe695a0f15555629d4..b45f0420f80d94568d2edc551781d05b967b6ff3 100644 --- a/maca_common/include/LimitedArray.hpp +++ b/maca_common/include/LimitedArray.hpp @@ -30,7 +30,6 @@ class LimitedArray nbElements = 0; lastElementDataIndex = -1; lastElementRealIndex = -1; - data.clear(); } void push(const T & elem) @@ -67,6 +66,30 @@ class LimitedArray { std::copy(other.data.begin()+from, other.data.begin()+to, std::back_inserter(data)); } + + void printForDebug(FILE * out) const + { + for (int i = 0; i < std::min(nbElements,(int)data.size()); i++) + fprintf(out, "<%s>", data[i].c_str()); + fprintf(out, "\n"); + } + + void clearData() + { + data.clear(); + } + + int getNextOverridenDataIndex() + { + if (lastElementRealIndex <= (int)data.size()) + return -1; + + int res = lastElementDataIndex++; + if (res >= (int)data.size()) + res = 0; + + return res; + } }; #endif diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp index 7c35d24ef17d7a0ccb84e227d064afa5661f5047..1457d1b024043d68afb07c840b98c89e68bd2108 100644 --- a/maca_common/include/ProgramParameters.hpp +++ b/maca_common/include/ProgramParameters.hpp @@ -64,6 +64,8 @@ struct ProgramParameters static int beamSize; static int nbChilds; static int tapeSize; + static int devTapeSize; + static int readSize; private : diff --git a/maca_common/include/util.hpp b/maca_common/include/util.hpp index a2b67dfb167117826fbf9a2e6a0e715f918c9052..434963f6dcc644d9ca9af4e4d2714e8cf5407596 100644 --- a/maca_common/include/util.hpp +++ b/maca_common/include/util.hpp @@ -196,6 +196,13 @@ std::string int2humanStr(int number); /// @return Random number between -range and +range float getRandomValueInRange(int range); +/// @brief Return the number of lines in a file. +/// +/// @param filename The name of the file. +/// +/// @return The number of lines in that file. +int getNbLines(const std::string & filename); + /// @brief Macro giving informations about an error. #define ERRINFO (getFilenameFromPath(std::string(__FILE__))+ ":l." + std::to_string(__LINE__)).c_str() diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp index 6cb84d64a5e8dbc7c2b0e8617b2a85642efd5466..67dc32bd9759ca8c83d86ca6bb75821db8baf330 100644 --- a/maca_common/src/ProgramParameters.cpp +++ b/maca_common/src/ProgramParameters.cpp @@ -58,4 +58,6 @@ int ProgramParameters::nbIndividuals; int ProgramParameters::beamSize; int ProgramParameters::nbChilds; int ProgramParameters::tapeSize; +int ProgramParameters::devTapeSize; +int ProgramParameters::readSize; diff --git a/maca_common/src/util.cpp b/maca_common/src/util.cpp index 7c3342048de7281e4aa1775d229b7fbc6517276d..5a772f28bcf9519ae81808f8bff387c93852b3f5 100644 --- a/maca_common/src/util.cpp +++ b/maca_common/src/util.cpp @@ -1,4 +1,5 @@ #include "util.hpp" +#include "File.hpp" #include <algorithm> #include <cstring> #include <ctime> @@ -438,3 +439,16 @@ std::string int2humanStr(int number) return result; } +int getNbLines(const std::string & filename) +{ + File file(filename, "r"); + + char buffer[10000]; + + int nbLines = 0; + while (fscanf(file.getDescriptor(), "%[^\n]\n", buffer) == 1) + nbLines++; + + return nbLines; +} + diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp index 63c07cb43838cf962c24c23e8b4e2a8ef1f2df9d..c89bfe2856e28bdb3a360c907fec0ccdd1b0347d 100644 --- a/trainer/src/Trainer.cpp +++ b/trainer/src/Trainer.cpp @@ -56,7 +56,7 @@ void Trainer::computeScoreOnDev() // Print current iter advancement in percentage if (ProgramParameters::interactive) { - int totalSize = ProgramParameters::tapeSize; + int totalSize = ProgramParameters::devTapeSize; int steps = devConfig->getHead(); if (steps && (steps % 200 == 0 || totalSize-steps < 200)) { diff --git a/trainer/src/macaon_train.cpp b/trainer/src/macaon_train.cpp index 4f8e10d931a33fdea44c1e6421b01e5dda541d34..f5f39db3b92ebfa5d269aaccd2a6e86197b29249 100644 --- a/trainer/src/macaon_train.cpp +++ b/trainer/src/macaon_train.cpp @@ -34,9 +34,7 @@ po::options_description getOptionsDescription() ("mcd", po::value<std::string>()->required(), "MCD file that describes the input") ("train,T", po::value<std::string>()->required(), - "Training corpus formated according to the MCD") - ("tapeSize", po::value<int>()->required(), - "Number of lines in the input file."); + "Training corpus formated according to the MCD"); po::options_description opt("Optional"); opt.add_options() @@ -161,6 +159,10 @@ void updatePaths() ProgramParameters::trainFilename = ProgramParameters::expPath + ProgramParameters::trainName; ProgramParameters::devFilename = ProgramParameters::expPath + ProgramParameters::devName; ProgramParameters::newTemplatePath = ProgramParameters::langPath + "bin/" + ProgramParameters::baseExpName + slash; + + ProgramParameters::tapeSize = getNbLines(ProgramParameters::trainFilename); + ProgramParameters::devTapeSize = ProgramParameters::devFilename.empty() ? 0 : getNbLines(ProgramParameters::devFilename); + ProgramParameters::readSize = ProgramParameters::tapeSize; } /// @brief Create the folder containing the current experiment from the template frolder @@ -285,7 +287,6 @@ int main(int argc, char * argv[]) ProgramParameters::dynamicEpoch = vm["epochd"].as<int>(); ProgramParameters::dynamicProbability = vm["proba"].as<float>(); ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<int>(); - ProgramParameters::tapeSize = vm["tapeSize"].as<int>(); ProgramParameters::iterationSize = vm["iterationSize"].as<int>(); std::string featureModels = vm["featureModels"].as<std::string>(); if (!featureModels.empty()) diff --git a/transition_machine/include/Config.hpp b/transition_machine/include/Config.hpp index 922f642354749a3d1960d1a3eecdef1461e606b0..8bbb9e18a3857d835769132578c7d86f3a84f623 100644 --- a/transition_machine/include/Config.hpp +++ b/transition_machine/include/Config.hpp @@ -70,6 +70,8 @@ class Config /// /// @return True if the head of this tape is on the last cell. bool headIsAtEnd() const; + /// @brief Must be used before copying another Tape's data into this one, but never used anywhere else. + void clearDataForCopy(); public : @@ -111,6 +113,8 @@ class Config /// @param from first cell index of the chunk to copy. /// @param to last cell index of the chunk to copy. void copyPart(Tape & other, unsigned int from, unsigned int to); + /// @brief Get the last tape index that will be overriden with the next read. + int getNextOverridenDataIndex(); }; private : @@ -136,6 +140,8 @@ class Config std::shared_ptr<File> file; /// @brief If the end of input was reached during reading. bool inputAllRead; + /// @brief Where to print the output. + FILE * outputFile; public : @@ -174,7 +180,8 @@ class Config /// @brief Print the tapes as the output of the program. /// /// @param output Where to print. - void printAsOutput(FILE * output); + /// @param index Index of line to print. + void printAsOutput(FILE * output, int index); /// @brief Print the Config without information loss. /// /// @param output Where to print. @@ -292,6 +299,8 @@ class Config /// /// @return True if the head is at the end of the tapes. bool endOfTapes() const; + /// @brief Set the output file. + void setOutputFile(FILE * outputFile); }; #endif diff --git a/transition_machine/src/Config.cpp b/transition_machine/src/Config.cpp index 1be439008ef66969edc73033cd2ddf37d7b282fc..40ecbe64c3d14f3bf5574260372a6245aebe5ce4 100644 --- a/transition_machine/src/Config.cpp +++ b/transition_machine/src/Config.cpp @@ -6,6 +6,7 @@ Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(10), pastActions(100) { + this->outputFile = nullptr; this->stackHistory = -1; this->currentStateName = nullptr; this->inputFilename = inputFilename; @@ -15,7 +16,7 @@ Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(1 tapes.emplace_back(bd.getNameOfLine(i), bd.lineIsKnown(i)); } -Config::Tape::Tape(const std::string & name, bool isKnown) : ref(100), hyp(100) +Config::Tape::Tape(const std::string & name, bool isKnown) : ref(ProgramParameters::readSize*4+1), hyp(ProgramParameters::readSize*4+1) { this->head = 0; this->name = name; @@ -34,6 +35,9 @@ Config::Tape & Config::getTapeByInputCol(int col) void Config::readInput() { + if (inputAllRead) + return; + if (!file.get()) file.reset(new File(inputFilename, "r")); FILE * fd = file->getDescriptor(); @@ -42,7 +46,7 @@ void Config::readInput() std::vector<std::string> cols; unsigned int usualColsSize = 0; - int toRead = 100; + int toRead = ProgramParameters::readSize; int haveRead = 0; while(fscanf(fd, "%[^\n]\n", buffer) == 1 && haveRead < toRead) @@ -57,6 +61,8 @@ void Config::readInput() exit(1); } + printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex()); + for(unsigned int i = 0; i < cols.size(); i++) if(bd.hasLineOfInputCol(i)) { @@ -82,7 +88,7 @@ void Config::readInput() while(tape.hypSize() < maxTapeSize) tape.addToHyp(""); - if (haveRead < toRead) + if (haveRead < toRead || (int)tape.size() == ProgramParameters::tapeSize) { tape.addToRef("0"); tape.addToHyp(""); @@ -138,7 +144,7 @@ void Config::printForDebug(FILE * output) cols[colIndex].emplace_back(i == head ? " || " : ""); } - cols[colIndex].emplace_back(shrink(tape[i])); + cols[colIndex].emplace_back(shrink(tape[i-head])); } } @@ -164,23 +170,21 @@ void Config::printAsExample(FILE *) exit(1); } -void Config::printAsOutput(FILE *) +void Config::printAsOutput(FILE * output, int index) { - //TODO : Output should be done when reading, and at the end of the program. + if (index == -1 || !output) + return; - /* unsigned int lastToPrint = 0; for (unsigned int j = 0; j < tapes.size(); j++) if(bd.mustPrintLine(j)) lastToPrint = j; - for (unsigned int i = 0; i < tapes[0].hyp.size() - 1; i++) - for (unsigned int j = 0; j < tapes.size(); j++) - { - if(bd.mustPrintLine(j)) - fprintf(output, "%s%s", tapes[j][i].empty() ? "0" : tapes[j][i].c_str(), j == lastToPrint ? "\n" : "\t"); - } - */ + for (unsigned int j = 0; j < tapes.size(); j++) + { + if(bd.mustPrintLine(j)) + fprintf(output, "%s%s", tapes[j][index].empty() ? "0" : tapes[j][index].c_str(), j == lastToPrint ? "\n" : "\t"); + } } void Config::moveHead(int mvt) @@ -191,6 +195,14 @@ void Config::moveHead(int mvt) for (auto & tape : tapes) tape.moveHead(mvt); + + if (mvt > 0 && head % ProgramParameters::readSize == 0) + readInput(); + } + else if (!endOfTapes()) + { + fprintf(stderr, "ERROR (%s) : Input has not been read completely, yet the head is already at the end of tapes. Aborting.\n", ERRINFO); + exit(1); } } @@ -211,10 +223,12 @@ void Config::reset() stack.clear(); stackHistory = -1; + inputAllRead = false; head = 0; file.reset(); - readInput(); + while (tapes[0].size() < ProgramParameters::readSize*4) + readInput(); } const std::string & Config::Tape::operator[](int relativeIndex) @@ -307,7 +321,7 @@ void Config::shuffle(const std::string & delimiterTape, const std::string & deli for (unsigned int tape = 0; tape < tapes.size(); tape++) { - newTapes[tape].clear(); + newTapes[tape].clearDataForCopy(); for (auto & delimiter : delimiters) newTapes[tape].copyPart(tapes[tape], delimiter.first, delimiter.second+1); @@ -454,12 +468,12 @@ int Config::Tape::size() int Config::Tape::refSize() { - return ref.getLastIndex(); + return ref.getLastIndex()+1; } int Config::Tape::hypSize() { - return hyp.getLastIndex(); + return hyp.getLastIndex()+1; } void Config::Tape::addToHyp(const std::string & elem) @@ -485,3 +499,19 @@ void Config::Tape::copyPart(Tape & other, unsigned int from, unsigned int to) } +void Config::Tape::clearDataForCopy() +{ + ref.clearData(); + hyp.clearData(); +} + +void Config::setOutputFile(FILE * outputFile) +{ + this->outputFile = outputFile; +} + +int Config::Tape::getNextOverridenDataIndex() +{ + return ref.getNextOverridenDataIndex(); +} +