From 1ffa121ad252e135de198b8b4708f26dafdf06cb Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Tue, 22 Oct 2019 14:59:32 +0200 Subject: [PATCH] Fixed tokenization --- decoder/src/Decoder.cpp | 12 ++++++++++- maca_common/src/util.cpp | 6 ++++++ transition_machine/src/Config.cpp | 34 +++++++++++++++++++++++++++---- 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp index b9aa33d..be13aff 100644 --- a/decoder/src/Decoder.cpp +++ b/decoder/src/Decoder.cpp @@ -205,7 +205,8 @@ void applyActionAndTakeTransition(TransitionMachine & tm, const std::string & ac void Decoder::decode() { - config.reset(); + if (!ProgramParameters::rawInput) + config.reset(); config.fillTapesWithInput(); if (ProgramParameters::beamSize > 1) @@ -231,8 +232,14 @@ void Decoder::decodeNoBeam() config.setOutputFile(outputFile); + if (ProgramParameters::debug) + fprintf(stderr, "Begin decode\n"); + while (!config.isFinal()) { + if (ProgramParameters::debug) + fprintf(stderr, "Config is not final\n"); + config.setCurrentStateName(tm.getCurrentClassifier()->name); Dict::currentClassifierName = tm.getCurrentClassifier()->name; @@ -260,6 +267,9 @@ void Decoder::decodeNoBeam() computeAndPrintSequenceEntropy(config, justFlipped, errors, entropyAccumulator, nbActionsInSequence); } + if (ProgramParameters::debug) + fprintf(stderr, "Config is final\n"); + if (ProgramParameters::errorAnalysis) errors.printStats(); diff --git a/maca_common/src/util.cpp b/maca_common/src/util.cpp index 1266a49..357892f 100644 --- a/maca_common/src/util.cpp +++ b/maca_common/src/util.cpp @@ -248,6 +248,12 @@ std::vector<std::string> split(const std::string & s, char sep) if (!res.empty() && res.back().empty()) res.pop_back(); + if (res.empty()) + { + fprintf(stderr, "ERROR (%s) : asked to split \'%s\' with separator \'%c\'. Aborting.\n", ERRINFO, s.c_str(), sep); + exit(1); + } + return res; } diff --git a/transition_machine/src/Config.cpp b/transition_machine/src/Config.cpp index 82a3867..7baae4f 100644 --- a/transition_machine/src/Config.cpp +++ b/transition_machine/src/Config.cpp @@ -94,6 +94,8 @@ void Config::readInput() tape.addToHyp(""); } + fprintf(stderr, "rawInputHeadIndex=%d rawInputSize=%lu\n", rawInputHeadIndex, rawInput.size()); + return; } @@ -154,6 +156,9 @@ void Config::readInput() void Config::fillTapesWithInput() { + if (ProgramParameters::rawInput) + return; + rawInput = ""; std::vector<std::string> cols; unsigned int usualColsSize = 0; @@ -409,12 +414,30 @@ void Config::moveHead(int mvt) if (mvt > 0) for (int i = 0; i < mvt; i++) - if (hasTape("ID") && util::split(getTape("ID").getHyp(i), '-').size() <= 1) + { + if (!hasTape("ID")) + break; + std::string id = getTape("ID").getHyp(i); + if (id.empty()) + id = getTape("ID").getRef(i); + if (id.empty()) + continue; + if (util::split(id, '-').size() <= 1) currentWordIndex += 1; + } if (mvt < 0) for (int i = 0; i < mvt; i++) - if (hasTape("ID") && util::split(getTape("ID").getHyp(-i), '-').size() <= 1) - currentWordIndex -= 1; + { + if (!hasTape("ID")) + break; + std::string id = getTape("ID").getHyp(i); + if (id.empty()) + id = getTape("ID").getRef(-i); + if (id.empty()) + continue; + if (util::split(id, '-').size() <= 1) + currentWordIndex += 1; + } for (auto & tape : tapes) tape.moveHead(mvt); @@ -450,7 +473,7 @@ void Config::moveRawInputHead(int mvt) bool Config::isFinal() { - if (rawInputHeadIndex > 0 && !rawInput.empty()) + if (ProgramParameters::rawInput || (rawInputHeadIndex > 0 && !rawInput.empty())) return (rawInputHeadIndex >= (int)rawInput.size()); return endOfTapes() && stack.empty(); @@ -911,6 +934,9 @@ void Config::setGovsAsUD(bool ref) void Config::updateIdsInSequence() { + if (ProgramParameters::rawInput || rawInputHeadIndex > 0) + return; + int sentenceEnd = stackHasIndex(0) ? stackGetElem(0) : getHead(); auto & eos = getTape(ProgramParameters::sequenceDelimiterTape); auto & ids = getTape("ID"); -- GitLab