Commit 5690a9d1 authored by Franck Dary
Browse files

Allow space in w2v format

parent 95273531
...@@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) ...@@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
bool firstLine = true; bool firstLine = true;
bool pretrained = false; bool pretrained = false;
int embSize = 0;
try try
{ {
if (!prefix.empty()) if (!prefix.empty())
...@@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) ...@@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (firstLine) if (firstLine)
{ {
firstLine = false; firstLine = false;
embSize = std::atoi(util::split(util::strip(buffer), ' ')[1].c_str());
continue; continue;
} }
...@@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix) ...@@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (splited.size() < 2) if (splited.size() < 2)
util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer)); util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer));
if (splited[0] == "<unk>") std::string word = splited[0];
for (int i = 1; i < ((int)splited.size()-embSize); i++)
word += " "+splited[i];
if (word == "<unk>")
continue; continue;
auto toInsert = util::splitAsUtf8(splited[0]); auto toInsert = util::splitAsUtf8(word);
toInsert.replace("◌", " "); toInsert.replace("◌", " ");
auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix); auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix);
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment