Commit 5690a9d1 authored by Franck Dary
Browse files

Allow space in w2v format

parent 95273531
......@@ -278,6 +278,8 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
bool firstLine = true;
bool pretrained = false;
int embSize = 0;
try
{
if (!prefix.empty())
......@@ -298,6 +300,7 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (firstLine)
{
firstLine = false;
embSize = std::atoi(util::split(util::strip(buffer), ' ')[1].c_str());
continue;
}
......@@ -307,9 +310,13 @@ bool Dict::loadWord2Vec(std::filesystem::path path, std::string prefix)
if (splited.size() < 2)
util::myThrow(fmt::format("invalid w2v line '{}' less than 2 columns", buffer));
if (splited[0] == "<unk>")
std::string word = splited[0];
for (int i = 1; i < ((int)splited.size()-embSize); i++)
word += " "+splited[i];
if (word == "<unk>")
continue;
auto toInsert = util::splitAsUtf8(splited[0]);
auto toInsert = util::splitAsUtf8(word);
toInsert.replace("◌", " ");
auto dictIndex = getIndexOrInsert(fmt::format("{}", toInsert), prefix);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment