From 5690a9d18a2ec1bab911dfc9a9981c4fdb08ac35 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Sat, 9 Oct 2021 17:21:32 +0200 Subject: [PATCH] Allow space in w2v format --- common/src/Dict.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/common/src/Dict.cpp b/common/src/Dict.cpp index b1de43a..c6731cc 100644 --- a/common/src/Dict.cpp +++ b/common/src/Dict.cpp @@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) bool firstLine = true; bool pretrained = false; + int embSize = 0; + try { if (!prefix.empty()) @@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) if (firstLine) { firstLine = false; + embSize = std::atoi(util::split(util::strip(buffer), ' ')[1].c_str()); continue; } @@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) if (splited.size() < 2) util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer)); - if (splited[0] == "<unk>") + std::string word = splited[0]; + for (int i = 1; i < ((int)splited.size()-embSize); i++) + word += " "+splited[i]; + + if (word == "<unk>") continue; - auto toInsert = util::splitAsUtf8(splited[0]); + auto toInsert = util::splitAsUtf8(word); toInsert.replace("◌", " "); auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix); -- GitLab