From a88641aec390eb34284ca6ee5afbbb7f2767f72d Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 2 Jul 2020 14:26:24 +0200
Subject: [PATCH] Fixed dynamic oracle for tokenization transitions

---
 reading_machine/src/Transition.cpp | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/reading_machine/src/Transition.cpp b/reading_machine/src/Transition.cpp
index 352ca9b..9f6007b 100644
--- a/reading_machine/src/Transition.cpp
+++ b/reading_machine/src/Transition.cpp
@@ -197,9 +197,15 @@ void Transition::initIgnoreChar()
 {
   sequence.emplace_back(Action::ignoreCurrentCharacter());
 
-  costDynamic = [](const Config &)
+  costDynamic = [](const Config & config)
   {
-    return 0;
+    auto letter = fmt::format("{}", config.getLetter(config.getCharacterIndex()));
+    auto goldWord = util::splitAsUtf8(config.getConst("FORM", config.getWordIndex(), 0).get());
+    auto curWord = util::splitAsUtf8(config.getAsFeature("FORM", config.getWordIndex()).get());
+    if (curWord.size() >= goldWord.size())
+      return 0;
+
+    return goldWord[curWord.size()] == letter ? 1 : 0;
   };
 
   costStatic = costDynamic;
@@ -231,6 +237,9 @@ void Transition::initAddCharToWord()
     if (!config.hasCharacter(config.getCharacterIndex()))
       return std::numeric_limits<int>::max();
 
+    if (!config.isToken(config.getWordIndex()))
+      return std::numeric_limits<int>::max();
+
     auto letter = fmt::format("{}", config.getLetter(config.getCharacterIndex()));
     auto & goldWord = config.getConst("FORM", config.getWordIndex(), 0).get();
     auto & curWord = config.getAsFeature("FORM", config.getWordIndex()).get();
-- 
GitLab