Skip to content
Snippets Groups Projects
Commit 5690a9d1 authored by Franck Dary
Browse files

Allow space in w2v format

parent 95273531
Branches
No related tags found
No related merge requests found
...@@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) ...@@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
bool firstLine = true; bool firstLine = true;
bool pretrained = false; bool pretrained = false;
int embSize = 0;
try try
{ {
if (!prefix.empty()) if (!prefix.empty())
...@@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) ...@@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (firstLine) if (firstLine)
{ {
firstLine = false; firstLine = false;
embSize = std::atoi(util::split(util::strip(buffer), ' ')[1].c_str());
continue; continue;
} }
...@@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) ...@@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (splited.size() < 2) if (splited.size() < 2)
util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer)); util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer));
if (splited[0] == "<unk>") std::string word = splited[0];
for (int i = 1; i < ((int)splited.size()-embSize); i++)
word += " "+splited[i];
if (word == "<unk>")
continue; continue;
auto toInsert = util::splitAsUtf8(splited[0]); auto toInsert = util::splitAsUtf8(word);
toInsert.replace("◌", " "); toInsert.replace("◌", " ");
auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix); auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment