From a88641aec390eb34284ca6ee5afbbb7f2767f72d Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Thu, 2 Jul 2020 14:26:24 +0200 Subject: [PATCH] Fixed dynamic oracle for tokenization transitions --- reading_machine/src/Transition.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/reading_machine/src/Transition.cpp b/reading_machine/src/Transition.cpp index 352ca9b..9f6007b 100644 --- a/reading_machine/src/Transition.cpp +++ b/reading_machine/src/Transition.cpp @@ -197,9 +197,15 @@ void Transition::initIgnoreChar() { sequence.emplace_back(Action::ignoreCurrentCharacter()); - costDynamic = [](const Config &) + costDynamic = [](const Config & config) { - return 0; + auto letter = fmt::format("{}", config.getLetter(config.getCharacterIndex())); + auto goldWord = util::splitAsUtf8(config.getConst("FORM", config.getWordIndex(), 0).get()); + auto curWord = util::splitAsUtf8(config.getAsFeature("FORM", config.getWordIndex()).get()); + if (curWord.size() >= goldWord.size()) + return 0; + + return goldWord[curWord.size()] == letter ? 1 : 0; }; costStatic = costDynamic; @@ -231,6 +237,9 @@ void Transition::initAddCharToWord() if (!config.hasCharacter(config.getCharacterIndex())) return std::numeric_limits<int>::max(); + if (!config.isToken(config.getWordIndex())) + return std::numeric_limits<int>::max(); + auto letter = fmt::format("{}", config.getLetter(config.getCharacterIndex())); auto & goldWord = config.getConst("FORM", config.getWordIndex(), 0).get(); auto & curWord = config.getAsFeature("FORM", config.getWordIndex()).get(); -- GitLab