Skip to content
Snippets Groups Projects
Commit 5690a9d1 authored by Franck Dary
Browse files

Allow space in w2v format

parent 95273531
No related branches found
No related tags found
No related merge requests found
......@@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
bool firstLine = true;
bool pretrained = false;
int embSize = 0;
try
{
if (!prefix.empty())
......@@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (firstLine)
{
firstLine = false;
embSize = std::atoi(util::split(util::strip(buffer), ' ')[1].c_str());
continue;
}
......@@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (splited.size() < 2)
util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer));
if (splited[0] == "<unk>")
std::string word = splited[0];
for (int i = 1; i < ((int)splited.size()-embSize); i++)
word += " "+splited[i];
if (word == "<unk>")
continue;
auto toInsert = util::splitAsUtf8(splited[0]);
auto toInsert = util::splitAsUtf8(word);
toInsert.replace("◌", " ");
auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment