From 5690a9d18a2ec1bab911dfc9a9981c4fdb08ac35 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Sat, 9 Oct 2021 17:21:32 +0200
Subject: [PATCH] Allow space in w2v format

---
 common/src/Dict.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/common/src/Dict.cpp b/common/src/Dict.cpp
index b1de43a..c6731cc 100644
--- a/common/src/Dict.cpp
+++ b/common/src/Dict.cpp
@@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
   bool firstLine = true;
   bool pretrained = false;
 
+  int embSize = 0;
+
   try
   {
     if (!prefix.empty())
@@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
       if (firstLine)
       {
         firstLine = false;
+        embSize = std::atoi(util::split(util::strip(buffer), ' ')[1].c_str());
         continue;
       }
 
@@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
       if (splited.size() < 2)
         util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer));
 
-      if (splited[0] == "<unk>")
+      std::string word = splited[0];
+      for (int i = 1; i < ((int)splited.size()-embSize); i++)
+        word += " "+splited[i];
+
+      if (word == "<unk>")
         continue;
-      auto toInsert = util::splitAsUtf8(splited[0]);
+      auto toInsert = util::splitAsUtf8(word);
       toInsert.replace("◌", " ");
       auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix);
 
-- 
GitLab