diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp
index 3218009c924c0d3d91d1f1a30994c98a136c76df..f9e5ef175b5bb58d7a33fffa20152db6408ed86b 100644
--- a/decoder/src/Decoder.cpp
+++ b/decoder/src/Decoder.cpp
@@ -65,7 +65,12 @@ void printAdvancement(Config & config, float currentSpeed, int nbActionsCutoff)
     int totalSize = ProgramParameters::tapeSize;
     int steps = config.getHead();
     if (steps && (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff))
-      fprintf(stderr, "Decode : %.2f%% speed : %s actions/s\r", 100.0*steps/totalSize, int2humanStr((int)currentSpeed).c_str());
+    {
+      if (ProgramParameters::rawInput)
+        fprintf(stderr, "Decode : %.2f%% speed : %s actions/s\r", 100.0*config.rawInputHeadIndex/config.rawInput.size(), int2humanStr((int)currentSpeed).c_str());
+      else
+        fprintf(stderr, "Decode : %.2f%% speed : %s actions/s\r", 100.0*steps/totalSize, int2humanStr((int)currentSpeed).c_str());
+    }
   }
 }
 
diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp
index dbe947ffe36d4f8a6d8b5958ba79b48574a5bf11..cd5a532bd1de4e238c00b038e5b05a7a0d5876e7 100644
--- a/decoder/src/macaon_decode.cpp
+++ b/decoder/src/macaon_decode.cpp
@@ -31,7 +31,7 @@ po::options_description getOptionsDescription()
     ("mcd", po::value<std::string>()->required(),
       "MCD file that describes the input")
     ("input,I", po::value<std::string>()->required(),
-      "Input file formated according to the mcd");
+      "Input file formated according to the mcd, or rawInput");
 
   po::options_description opt("Optional");
   opt.add_options()
@@ -60,6 +60,7 @@ po::options_description getOptionsDescription()
       "The maximal size of the stack (dependency parsing).")
     ("interactive", po::value<bool>()->default_value(true),
       "Is the shell interactive ? Display advancement informations")
+    ("rawInput", "Is the input file raw text ?")
     ("tapeToMask", po::value<std::string>()->default_value("FORM"),
       "The name of the Tape for which some of the elements will be masked.")
     ("maskRate", po::value<float>()->default_value(0.0),
@@ -148,6 +149,7 @@ int main(int argc, char * argv[])
   ProgramParameters::showActions = vm.count("showActions") == 0 ? false : true;
   ProgramParameters::noNeuralNetwork = vm.count("noNeuralNetwork") == 0 ? false : true;
   ProgramParameters::interactive = vm["interactive"].as<bool>();
+  ProgramParameters::rawInput = vm.count("rawInput") == 0 ? false : true;
   ProgramParameters::errorAnalysis = vm.count("errorAnalysis") == 0 ? false : true;
   ProgramParameters::nbErrorsToShow = vm["nbErrorsToShow"].as<int>();
   ProgramParameters::meanEntropy = vm.count("meanEntropy") == 0 ? false : true;
@@ -159,7 +161,7 @@ int main(int argc, char * argv[])
   ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>();
   ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>();
   ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<int>();
-  ProgramParameters::tapeSize = getNbLines(ProgramParameters::input);
+  ProgramParameters::tapeSize = ProgramParameters::rawInput ? 200000 : getNbLines(ProgramParameters::input);
   ProgramParameters::readSize = vm["readSize"].as<int>();
   if (ProgramParameters::readSize == 0)
     ProgramParameters::readSize = ProgramParameters::tapeSize;
diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp
index 8cd22a77756e7dc6e035ac1854a79d9a4d3f9cef..720116e8239235108aa4866882d033f632fd8b78 100644
--- a/maca_common/include/ProgramParameters.hpp
+++ b/maca_common/include/ProgramParameters.hpp
@@ -80,6 +80,7 @@ struct ProgramParameters
   static bool showActions;
   static bool delayedOutput;
   static int maxStackSize;
+  static bool rawInput;
 
   private :
 
diff --git a/maca_common/include/util.hpp b/maca_common/include/util.hpp
index 7833af177fdf80853ae5718b326ac428b49ce222..c9033fccf22b843fd2365007557cb974b2dc884f 100644
--- a/maca_common/include/util.hpp
+++ b/maca_common/include/util.hpp
@@ -28,6 +28,18 @@ bool isNotSeparator(char c);
 ///
 /// @return Whether or not this symbol is the start of a new line.
 bool isNewline(char c);
+/// @brief Whether or not this symbol is a space encoded in utf8.
+///
+/// @param c Iterator to the start of the symbol to check (must be dereferenceable).
+///
+/// @return Whether or not this symbol is a utf8 space.
+bool isUtf8Space(const std::string::iterator & c);
+/// @brief Whether or not this symbol is a separator encoded in utf8.
+///
+/// @param c Iterator to the start of the symbol to check (must be dereferenceable).
+///
+/// @return Whether or not this symbol is a utf8 separator.
+bool isUtf8Separator(const std::string::iterator & c);
 /// @brief Get the filename of a file given it's relative or absolute path.
 ///
 /// getFilenameFromPath(/home/toto/File.ex) = File.ex
diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp
index c6fc86f45adc84cb912f9742151ebb2b2e863700..84c90c5b2aebf88c12aa7efe41e88632b6f3ef21 100644
--- a/maca_common/src/ProgramParameters.cpp
+++ b/maca_common/src/ProgramParameters.cpp
@@ -74,4 +74,5 @@ bool ProgramParameters::noNeuralNetwork;
 bool ProgramParameters::showActions;
 bool ProgramParameters::delayedOutput;
 int ProgramParameters::maxStackSize;
+bool ProgramParameters::rawInput;
 
diff --git a/maca_common/src/util.cpp b/maca_common/src/util.cpp
index 8682941099488c48fd061637fdd4af792812b9ad..6dc2c199881971ce0eece40e4c4b115a2720d21b 100644
--- a/maca_common/src/util.cpp
+++ b/maca_common/src/util.cpp
@@ -20,6 +20,46 @@ bool isSeparator(char c)
   return (c == EOF) || endLine(c) || (c == ' ' || c == '\t');
 }
 
+bool isUtf8Space(const std::string::iterator & c)
+{
+  auto it = c;
+  utf8::next(it, it+10); // NOTE(review): it+10 is only an upper bound and may lie past the end of the string
+  int nbChar = it - c; // number of bytes of the symbol starting at c
+
+  if (nbChar == 1)
+  {
+    if ((unsigned char)c[0] == 0x20) // U+0020 SPACE
+      return true;
+  }
+  else if (nbChar == 2)
+  {
+    if ((unsigned char)c[0] == 0xC2 && (unsigned char)c[1] == 0xA0) // U+00A0 NO-BREAK SPACE
+      return true;
+  }
+  else if (nbChar == 3)
+  {
+    if ((unsigned char)c[0] == 0xE1 && (unsigned char)c[1] == 0x9A && (unsigned char)c[2] == 0x80) // U+1680 OGHAM SPACE MARK
+      return true;
+    if ((unsigned char)c[0] == 0xE2 && (unsigned char)c[1] == 0x80 && (unsigned char)c[2] >= 0x80 && (unsigned char)c[2] <= 0x89) // U+2000 to U+2009
+      return true;
+    if ((unsigned char)c[0] == 0xE2 && (unsigned char)c[1] == 0x80 && (unsigned char)c[2] == 0x8A) // U+200A HAIR SPACE
+      return true;
+    if ((unsigned char)c[0] == 0xE2 && (unsigned char)c[1] == 0x80 && (unsigned char)c[2] == 0xAF) // U+202F NARROW NO-BREAK SPACE
+      return true;
+    if ((unsigned char)c[0] == 0xE2 && (unsigned char)c[1] == 0x81 && (unsigned char)c[2] == 0x9F) // U+205F MEDIUM MATHEMATICAL SPACE
+      return true;
+    if ((unsigned char)c[0] == 0xE3 && (unsigned char)c[1] == 0x80 && (unsigned char)c[2] == 0x80) // U+3000 IDEOGRAPHIC SPACE
+      return true;
+  }
+
+  return false;
+}
+
+bool isUtf8Separator(const std::string::iterator & c)
+{
+  return (c[0] == EOF) || endLine(c[0]) || (c[0] == '\t') || isUtf8Space(c);
+}
+
 bool isNotSeparator(char c)
 {
   return !isSeparator(c);
diff --git a/transition_machine/include/ActionBank.hpp b/transition_machine/include/ActionBank.hpp
index 8fe5817b8e5a20e890ebda5b40d8a57d5377bfd2..d7affe9dc64cdf0e2e57e207c17fa40c858a0e68 100644
--- a/transition_machine/include/ActionBank.hpp
+++ b/transition_machine/include/ActionBank.hpp
@@ -114,6 +114,14 @@ class ActionBank
   /// \return A BasicAction only appliable if word is the prefix of rawInput.
   static Action::BasicAction rawInputBeginsWith(std::string word);
 
+  /// \brief Verify that certain cell is not empty.
+  ///
+  /// \param tape The tape to test
+  /// \param relativeIndex The index on this tape
+  ///
+  /// \return A BasicAction only appliable if the tape at relativeIndex is not empty.
+  static Action::BasicAction checkNotEmpty(std::string tape, int relativeIndex);
+
   /// \brief Write something on the buffer
   ///
   /// \param tapeName The tape we will write to
@@ -128,10 +136,27 @@ class ActionBank
   /// \return The corresponding BasicAction
   static Action::BasicAction pushHead();
 
+  /// \brief Verify that the character pointed by the rawInputHead is a space.
+  ///
+  /// \return A BasicAction only appliable if the character pointed by the rawInputHead is a space.
+  static Action::BasicAction checkRawInputHeadIsSpace();
+
+  /// \brief Verify that the character pointed by the rawInputHead is a separator.
+  ///
+  /// \return A BasicAction only appliable if the character pointed by the rawInputHead is a separator.
+  static Action::BasicAction checkRawInputHeadIsSeparator();
+
   /// \brief Pop the stack
   ///
   /// \return The corresponding BasicAction
   static Action::BasicAction stackPop(bool checkGov);
+
+  /// \brief Append empty cells to every tape, so that at least size cells exist past the head.
+  ///
+  /// \param size The number of cells required past the head.
+  ///
+  /// \return The corresponding BasicAction
+  static Action::BasicAction increaseTapesIfNeeded(int size);
 };
 
 #endif
diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp
index 22bdde4c72d058d33368810cd1dc21c5bfb373ee..b6c43b62516b676eaf7cc251edcceda1222998eb 100644
--- a/transition_machine/src/ActionBank.cpp
+++ b/transition_machine/src/ActionBank.cpp
@@ -17,6 +17,32 @@ Action::BasicAction ActionBank::moveHead(int movement)
   return basicAction;
 }
 
+Action::BasicAction ActionBank::increaseTapesIfNeeded(int size)
+{
+  auto apply = [size](Config & c, Action::BasicAction &)
+    {
+      for (auto & tape : c.tapes)
+      {
+        // Computed once, before growing the tape : addToRef presumably increases
+        // refSize(), so re-evaluating it in the loop condition would stop too early.
+        const int nbMissing = size-(tape.refSize()-c.getHead())+1;
+        for (int i = 0; i < nbMissing; i++)
+        {
+          tape.addToRef("");
+          tape.addToHyp("");
+        }
+      }
+    };
+  auto undo = [](Config &, Action::BasicAction &)
+    {};
+  auto appliable = [](Config &, Action::BasicAction &)
+    {return true;};
+  Action::BasicAction basicAction =
+    {Action::BasicAction::Type::Write, "", apply, undo, appliable};
+
+  return basicAction;
+}
+
 Action::BasicAction ActionBank::moveRawInputHead(int movement)
 {
   auto apply = [movement](Config & c, Action::BasicAction &)
@@ -24,13 +50,51 @@ Action::BasicAction ActionBank::moveRawInputHead(int movement)
   auto undo = [movement](Config & c, Action::BasicAction &)
     {c.moveRawInputHead(-movement);};
   auto appliable = [movement](Config & c, Action::BasicAction &)
-    {return c.rawInputHeadIndex+movement < (int)c.rawInput.size();};
+    {return c.rawInputHeadIndex+movement <= (int)c.rawInput.size();};
   Action::BasicAction basicAction =
     {Action::BasicAction::Type::MoveHead, "", apply, undo, appliable};
 
   return basicAction;
 }
 
+Action::BasicAction ActionBank::checkRawInputHeadIsSpace()
+{
+  auto apply = [](Config &, Action::BasicAction &)
+    {};
+  auto undo = [](Config &, Action::BasicAction &)
+    {};
+  auto appliable = [](Config & c, Action::BasicAction &)
+    {
+      // moveRawInputHead lets the head reach rawInput.size(), where there is no character to test.
+      if (c.rawInputHeadIndex < 0 || c.rawInputHeadIndex >= (int)c.rawInput.size())
+        return false;
+      return isUtf8Space(c.rawInput.begin()+c.rawInputHeadIndex);
+    };
+  Action::BasicAction basicAction =
+    {Action::BasicAction::Type::Write, "", apply, undo, appliable};
+
+  return basicAction;
+}
+
+Action::BasicAction ActionBank::checkRawInputHeadIsSeparator()
+{
+  auto apply = [](Config &, Action::BasicAction &)
+    {};
+  auto undo = [](Config &, Action::BasicAction &)
+    {};
+  auto appliable = [](Config & c, Action::BasicAction &)
+    {
+      // moveRawInputHead lets the head reach rawInput.size(), where there is no character to test.
+      if (c.rawInputHeadIndex < 0 || c.rawInputHeadIndex >= (int)c.rawInput.size())
+        return false;
+      return isUtf8Separator(c.rawInput.begin()+c.rawInputHeadIndex);
+    };
+  Action::BasicAction basicAction =
+    {Action::BasicAction::Type::Write, "", apply, undo, appliable};
+
+  return basicAction;
+}
+
 Action::BasicAction ActionBank::rawInputBeginsWith(std::string word)
 {
   auto apply = [](Config &, Action::BasicAction &)
@@ -139,6 +203,22 @@ Action::BasicAction ActionBank::stackPop(bool checkGov)
   return basicAction;
 }
 
+Action::BasicAction ActionBank::checkNotEmpty(std::string tape, int relativeIndex)
+{
+  auto apply = [](Config &, Action::BasicAction &)
+    {};
+  auto undo = [](Config &, Action::BasicAction &)
+    {};
+  auto appliable = [tape, relativeIndex](Config & c, Action::BasicAction &)
+    {
+      return !c.getTape(tape).getHyp(relativeIndex).empty();
+    };
+  Action::BasicAction basicAction =
+    {Action::BasicAction::Type::Write, "", apply, undo, appliable};
+
+  return basicAction;
+}
+
 std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & name)
 {
   auto invalidNameAndAbort = [&](const char * errInfo)
@@ -231,10 +311,13 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
   }
   else if(std::string(b1) == "IGNORECHAR")
   {
+    sequence.emplace_back(checkRawInputHeadIsSeparator());
     sequence.emplace_back(moveRawInputHead(1));
   }
   else if(std::string(b1) == "ENDWORD")
   {
+    sequence.emplace_back(checkNotEmpty("FORM", 0));
+    sequence.emplace_back(increaseTapesIfNeeded(1));
   }
   else if(std::string(b1) == "ADDCHARTOWORD")
   {
@@ -262,6 +345,8 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
 
     sequence.emplace_back(moveRawInputHead(nbSymbols));
 
+    sequence.emplace_back(increaseTapesIfNeeded(splited.size()-1));
+
     for (unsigned int i = 1; i < splited.size(); i++)
       sequence.emplace_back(bufferWrite("FORM", splited[i], i-1));
   }
diff --git a/transition_machine/src/Config.cpp b/transition_machine/src/Config.cpp
index 69e37eadae50f58b6566e872a0a6546b2cc14002..6cc328fbd06ca05733c91fa976a2f96ccd0282d8 100644
--- a/transition_machine/src/Config.cpp
+++ b/transition_machine/src/Config.cpp
@@ -70,6 +70,25 @@ void Config::readInput()
   if (inputAllRead)
     return;
 
+  if (ProgramParameters::rawInput)
+  {
+    // Raw text mode : load the whole file into rawInput at once.
+    file.reset(new File(inputFilename, "r"));
+    while (!file->isFinished())
+      rawInput += file->getChar();
+
+    inputAllRead = true;
+
+    // Start every tape with a single empty cell.
+    for (auto & tape : tapes)
+    {
+      tape.addToRef("");
+      tape.addToHyp("");
+    }
+
+    return;
+  }
+
   if (!file.get())
     file.reset(new File(inputFilename, "r"));
   FILE * fd = file->getDescriptor();
@@ -274,7 +293,11 @@ void Config::moveRawInputHead(int mvt)
 
 bool Config::isFinal()
 {
-  return endOfTapes() && stack.empty();
+  if (!ProgramParameters::rawInput)
+    return endOfTapes() && stack.empty();
+
+  // Raw text mode : final once the head has moved past the last character of rawInput.
+  return (rawInputHeadIndex >= (int)rawInput.size());
 }
 
 void Config::reset()
@@ -716,7 +739,7 @@ void Config::updateRawInput()
   for (int i = 0; i < textTape.size(); i++)
   {
     if (textTape[i] != "_")
-      rawInput += (rawInput.empty() ? std::string("") : std::string(" ")) + textTape[i];
+      rawInput += (rawInput.empty() ? std::string("") : (choiceWithProbability(0.5) ? std::string(" ") : std::string("\n"))) + textTape[i];
   }
 }
 
diff --git a/transition_machine/src/Oracle.cpp b/transition_machine/src/Oracle.cpp
index 769684e936a66187ee4a06f06ff13c9501860c9d..f5bcc73d12114c037e4125f35c4c016d7a8b612f 100644
--- a/transition_machine/src/Oracle.cpp
+++ b/transition_machine/src/Oracle.cpp
@@ -469,6 +469,8 @@ void Oracle::createDatabase()
     while (start+c.getHead() < c.getTape("SGN").size() && !c.getTape("SGN").getHyp(start).empty())
       start++;
 
+    while (end >= 0 && c.getTape("FORM").getHyp(end).empty())
+      end--;
 
     if (start > end)
       return std::string("NOTHING");
@@ -477,7 +479,7 @@ void Oracle::createDatabase()
 
     for(int i = start; i <= end; i++)
     {
-      const std::string & form = c.getTape("FORM").getRef(i);
+      const std::string & form = c.getTape("FORM").getRef(i).empty() ? c.getTape("FORM").getHyp(i) : c.getTape("FORM").getRef(i);
       std::string signature;
 
       if (oracle->data.count(form))