Skip to content
Snippets Groups Projects
Commit 73971f7c authored by Franck Dary
Browse files

Allow spaces in w2v

parent 5690a9d1
No related branches found
No related tags found
No related merge requests found
...@@ -61,27 +61,36 @@ void Submodule::loadPretrainedW2vEmbeddings(torch::nn::Embedding embeddings, std ...@@ -61,27 +61,36 @@ void Submodule::loadPretrainedW2vEmbeddings(torch::nn::Embedding embeddings, std
else else
word = splited[0]; word = splited[0];
for (unsigned int i = 1; i < ((int)splited.size()-embeddingsSize); i++)
word += " "+splited[i];
auto toInsert = util::splitAsUtf8(word); auto toInsert = util::splitAsUtf8(word);
toInsert.replace("◌", " "); toInsert.replace("◌", " ");
word = fmt::format("{}", toInsert); word = fmt::format("{}", toInsert);
auto dictIndex = getDict().getIndexOrInsert(word, prefix); auto dictIndex = getDict().getIndexOrInsert(word, prefix);
if (embeddingsSize != splited.size()-1) if (embeddingsSize > splited.size()-1)
util::myThrow(fmt::format("in line \n{}embeddingsSize='{}' mismatch pretrainedEmbeddingSize='{}'", buffer, embeddingsSize, ((int)splited.size())-1)); util::myThrow(fmt::format("in line \n{}embeddingsSize='{}' mismatch pretrainedEmbeddingSize='{}'", buffer, embeddingsSize, ((int)splited.size())-1));
if (dictIndex >= embeddings->weight.size(0)) try
{ {
if ((unsigned long)dictIndex != embeddings->weight.size(0)+toAdd.size()) if (dictIndex >= embeddings->weight.size(0))
util::myThrow(fmt::format("dictIndex == {}, weight.size == {}, toAdd.size == {}", dictIndex, embeddings->weight.size(0), toAdd.size())); {
toAdd.emplace_back(); if ((unsigned long)dictIndex != embeddings->weight.size(0)+toAdd.size())
for (unsigned int i = 1; i < splited.size(); i++) util::myThrow(fmt::format("dictIndex == {}, weight.size == {}, toAdd.size == {}", dictIndex, embeddings->weight.size(0), toAdd.size()));
toAdd.back().emplace_back(std::stof(splited[i])); toAdd.emplace_back();
} for (unsigned int i = splited.size()-embeddingsSize; i < splited.size(); i++)
else toAdd.back().emplace_back(std::stof(splited[i]));
}
else
{
for (unsigned int i = splited.size()-embeddingsSize; i < splited.size(); i++)
embeddings->weight[dictIndex][i-(splited.size()-embeddingsSize)] = std::stof(splited[i]);
}
} catch (std::exception & e)
{ {
for (unsigned int i = 1; i < splited.size(); i++) util::myThrow(fmt::format("{} in line\n{}\n", e.what(), buffer));
embeddings->weight[dictIndex][i-1] = std::stof(splited[i]);
} }
} }
} catch (std::exception & e) } catch (std::exception & e)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment