Skip to content
Snippets Groups Projects
Commit d217cf58 authored by Franck Dary's avatar Franck Dary
Browse files

Rare values are now treated as unknown values. Embeddings sizes now exactly match dict size

parent 7982090e
No related branches found
No related tags found
No related merge requests found
Showing
with 86 additions and 55 deletions
......@@ -48,6 +48,7 @@ class Dict
void printEntry(std::FILE * file, int index, const std::string & entry, Encoding encoding) const;
std::size_t size() const;
int getNbOccs(int index) const;
void removeRareElements();
};
#endif
......@@ -173,3 +173,24 @@ int Dict::getNbOccs(int index) const
return nbOccs[index];
}
void Dict::removeRareElements()
{
int minNbOcc = std::numeric_limits<int>::max();
for (int nbOcc : nbOccs)
if (nbOcc < minNbOcc)
minNbOcc = nbOcc;
std::unordered_map<std::string, int> newElementsToIndexes;
std::vector<int> newNbOccs;
for (auto & it : elementsToIndexes)
if (nbOccs[it.second] > minNbOcc)
{
newElementsToIndexes.emplace(it.first, newElementsToIndexes.size());
newNbOccs.emplace_back(nbOccs[it.second]);
}
elementsToIndexes = newElementsToIndexes;
nbOccs = newNbOccs;
}
......@@ -9,7 +9,6 @@ void Decoder::decode(BaseConfig & config, std::size_t beamSize, bool debug, bool
{
torch::AutoGradMode useGrad(false);
machine.trainMode(false);
machine.splitUnknown(false);
machine.setDictsState(Dict::State::Closed);
machine.getStrategy().reset();
config.addPredicted(machine.getPredicted());
......
......@@ -47,7 +47,6 @@ class ReadingMachine
bool isPredicted(const std::string & columnName) const;
const std::set<std::string> & getPredicted() const;
void trainMode(bool isTrainMode);
void splitUnknown(bool splitUnknown);
void setDictsState(Dict::State state);
void saveBest() const;
void saveLast() const;
......
......@@ -21,8 +21,13 @@ ReadingMachine::ReadingMachine(std::filesystem::path path, std::vector<std::file
{
readFromFile(path);
std::size_t maxDictSize = 0;
for (auto path : dicts)
{
this->dicts.emplace(path.stem().string(), Dict{path.c_str(), Dict::State::Closed});
maxDictSize = std::max<std::size_t>(maxDictSize, this->dicts.at(path.stem().string()).size());
}
classifier->getNN()->registerEmbeddings(maxDictSize);
torch::load(classifier->getNN(), models[0]);
}
......@@ -182,11 +187,6 @@ void ReadingMachine::trainMode(bool isTrainMode)
classifier->getNN()->train(isTrainMode);
}
void ReadingMachine::splitUnknown(bool splitUnknown)
{
classifier->getNN()->setSplitUnknown(splitUnknown);
}
void ReadingMachine::setDictsState(Dict::State state)
{
for (auto & it : dicts)
......
......@@ -16,8 +16,7 @@ class ContextModuleImpl : public Submodule
std::vector<std::string> columns;
std::vector<int> bufferContext;
std::vector<int> stackContext;
int unknownValueThreshold;
std::vector<std::string> unknownValueColumns{"FORM", "LEMMA"};
int inSize;
public :
......@@ -25,7 +24,8 @@ class ContextModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(ContextModule);
......
......@@ -17,6 +17,7 @@ class DepthLayerTreeEmbeddingModuleImpl : public Submodule
std::vector<int> focusedStack;
torch::nn::Embedding wordEmbeddings{nullptr};
std::vector<std::shared_ptr<MyModule>> depthModules;
int inSize;
public :
......@@ -24,7 +25,8 @@ class DepthLayerTreeEmbeddingModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(DepthLayerTreeEmbeddingModule);
......
......@@ -16,6 +16,7 @@ class FocusedColumnModuleImpl : public Submodule
std::vector<int> focusedBuffer, focusedStack;
std::string column;
int maxNbElements;
int inSize;
public :
......@@ -23,7 +24,8 @@ class FocusedColumnModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(FocusedColumnModule);
......
......@@ -27,6 +27,7 @@ class ModularNetworkImpl : public NeuralNetworkImpl
ModularNetworkImpl(std::map<std::string,std::size_t> nbOutputsPerState, std::vector<std::string> definitions);
torch::Tensor forward(torch::Tensor input) override;
std::vector<std::vector<long>> extractContext(Config & config, Dict & dict) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
#endif
......@@ -13,7 +13,6 @@ class NeuralNetworkImpl : public torch::nn::Module
private :
bool splitUnknown{false};
std::string state;
protected :
......@@ -24,8 +23,7 @@ class NeuralNetworkImpl : public torch::nn::Module
virtual torch::Tensor forward(torch::Tensor input) = 0;
virtual std::vector<std::vector<long>> extractContext(Config & config, Dict & dict) const = 0;
bool mustSplitUnknown() const;
void setSplitUnknown(bool splitUnknown);
virtual void registerEmbeddings(std::size_t nbElements) = 0;
void setState(const std::string & state);
const std::string & getState() const;
};
......
......@@ -14,6 +14,7 @@ class RandomNetworkImpl : public NeuralNetworkImpl
RandomNetworkImpl(std::map<std::string,std::size_t> nbOutputsPerState);
torch::Tensor forward(torch::Tensor input) override;
std::vector<std::vector<long>> extractContext(Config &, Dict &) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
#endif
......@@ -14,6 +14,7 @@ class RawInputModuleImpl : public Submodule
torch::nn::Embedding wordEmbeddings{nullptr};
std::shared_ptr<MyModule> myModule{nullptr};
int leftWindow, rightWindow;
int inSize;
public :
......@@ -21,7 +22,8 @@ class RawInputModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(RawInputModule);
......
......@@ -14,6 +14,7 @@ class SplitTransModuleImpl : public Submodule
torch::nn::Embedding wordEmbeddings{nullptr};
std::shared_ptr<MyModule> myModule{nullptr};
int maxNbTrans;
int inSize;
public :
......@@ -21,7 +22,8 @@ class SplitTransModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(SplitTransModule);
......
......@@ -16,8 +16,9 @@ class Submodule : public torch::nn::Module
void setFirstInputIndex(std::size_t firstInputIndex);
virtual std::size_t getOutputSize() = 0;
virtual std::size_t getInputSize() = 0;
virtual void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const = 0;
virtual void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const = 0;
virtual torch::Tensor forward(torch::Tensor input) = 0;
virtual void registerEmbeddings(std::size_t nbElements) = 0;
};
#endif
......
......@@ -2,23 +2,21 @@
ContextModuleImpl::ContextModuleImpl(const std::string & definition)
{
std::regex regex("(?:(?:\\s|\\t)*)Unk\\{(.*)\\}(?:(?:\\s|\\t)*)Buffer\\{(.*)\\}(?:(?:\\s|\\t)*)Stack\\{(.*)\\}(?:(?:\\s|\\t)*)Columns\\{(.*)\\}(?:(?:\\s|\\t)*)(\\S+)\\{(.*)\\}(?:(?:\\s|\\t)*)In\\{(.*)\\}(?:(?:\\s|\\t)*)Out\\{(.*)\\}(?:(?:\\s|\\t)*)");
std::regex regex("(?:(?:\\s|\\t)*)Buffer\\{(.*)\\}(?:(?:\\s|\\t)*)Stack\\{(.*)\\}(?:(?:\\s|\\t)*)Columns\\{(.*)\\}(?:(?:\\s|\\t)*)(\\S+)\\{(.*)\\}(?:(?:\\s|\\t)*)In\\{(.*)\\}(?:(?:\\s|\\t)*)Out\\{(.*)\\}(?:(?:\\s|\\t)*)");
if (!util::doIfNameMatch(regex, definition, [this,&definition](auto sm)
{
try
{
unknownValueThreshold = std::stoi(sm.str(1));
for (auto & index : util::split(sm.str(2), ' '))
for (auto & index : util::split(sm.str(1), ' '))
bufferContext.emplace_back(std::stoi(index));
for (auto & index : util::split(sm.str(3), ' '))
for (auto & index : util::split(sm.str(2), ' '))
stackContext.emplace_back(std::stoi(index));
columns = util::split(sm.str(4), ' ');
columns = util::split(sm.str(3), ' ');
auto subModuleType = sm.str(5);
auto subModuleArguments = util::split(sm.str(6), ' ');
auto subModuleType = sm.str(4);
auto subModuleArguments = util::split(sm.str(5), ' ');
auto options = MyModule::ModuleOptions(true)
.bidirectional(std::stoi(subModuleArguments[0]))
......@@ -26,10 +24,8 @@ ContextModuleImpl::ContextModuleImpl(const std::string & definition)
.dropout(std::stof(subModuleArguments[2]))
.complete(std::stoi(subModuleArguments[3]));
int inSize = std::stoi(sm.str(7));
int outSize = std::stoi(sm.str(8));
wordEmbeddings = register_module("embeddings", torch::nn::Embedding(torch::nn::EmbeddingOptions(60000, inSize)));
inSize = std::stoi(sm.str(6));
int outSize = std::stoi(sm.str(7));
if (subModuleType == "LSTM")
myModule = register_module("myModule", LSTM(columns.size()*inSize, outSize, options));
......@@ -53,7 +49,7 @@ std::size_t ContextModuleImpl::getInputSize()
return columns.size()*(bufferContext.size()+stackContext.size());
}
void ContextModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const
void ContextModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const
{
std::vector<long> contextIndexes;
......@@ -79,11 +75,6 @@ void ContextModuleImpl::addToContext(std::vector<std::vector<long>> & context, D
for (auto & contextElement : context)
contextElement.push_back(dictIndex);
for (auto & targetCol : unknownValueColumns)
if (col == targetCol)
if (dict.getNbOccs(dictIndex) <= unknownValueThreshold)
context.back().back() = dict.getIndexOrInsert(Dict::unknownValueStr);
}
}
......@@ -96,3 +87,8 @@ torch::Tensor ContextModuleImpl::forward(torch::Tensor input)
return myModule->forward(context);
}
// Builds the word embedding table with one row per dictionary element,
// using the input dimension (inSize) parsed from the module definition.
void ContextModuleImpl::registerEmbeddings(std::size_t nbElements)
{
  auto embeddingOptions = torch::nn::EmbeddingOptions(nbElements, inSize);
  wordEmbeddings = register_module("embeddings", torch::nn::Embedding(embeddingOptions));
}
......@@ -27,11 +27,9 @@ DepthLayerTreeEmbeddingModuleImpl::DepthLayerTreeEmbeddingModuleImpl(const std::
.dropout(std::stof(subModuleArguments[2]))
.complete(std::stoi(subModuleArguments[3]));
int inSize = std::stoi(sm.str(7));
inSize = std::stoi(sm.str(7));
int outSize = std::stoi(sm.str(8));
wordEmbeddings = register_module("embeddings", torch::nn::Embedding(torch::nn::EmbeddingOptions(60000, inSize)));
for (unsigned int i = 0; i < maxElemPerDepth.size(); i++)
{
std::string name = fmt::format("{}_{}", i, subModuleType);
......@@ -83,7 +81,7 @@ std::size_t DepthLayerTreeEmbeddingModuleImpl::getInputSize()
return inputSize;
}
void DepthLayerTreeEmbeddingModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool) const
void DepthLayerTreeEmbeddingModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const
{
std::vector<long> focusedIndexes;
......@@ -122,3 +120,8 @@ void DepthLayerTreeEmbeddingModuleImpl::addToContext(std::vector<std::vector<lon
}
}
// Creates the embedding matrix sized to the dictionary (nbElements rows,
// inSize columns) and registers it on this module.
void DepthLayerTreeEmbeddingModuleImpl::registerEmbeddings(std::size_t nbElements)
{
  auto embeddingOptions = torch::nn::EmbeddingOptions(nbElements, inSize);
  wordEmbeddings = register_module("embeddings", torch::nn::Embedding(embeddingOptions));
}
......@@ -25,11 +25,9 @@ FocusedColumnModuleImpl::FocusedColumnModuleImpl(const std::string & definition)
.dropout(std::stof(subModuleArguments[2]))
.complete(std::stoi(subModuleArguments[3]));
int inSize = std::stoi(sm.str(7));
inSize = std::stoi(sm.str(7));
int outSize = std::stoi(sm.str(8));
wordEmbeddings = register_module("embeddings", torch::nn::Embedding(torch::nn::EmbeddingOptions(60000, inSize)));
if (subModuleType == "LSTM")
myModule = register_module("myModule", LSTM(inSize, outSize, options));
else if (subModuleType == "GRU")
......@@ -61,7 +59,7 @@ std::size_t FocusedColumnModuleImpl::getInputSize()
return (focusedBuffer.size()+focusedStack.size()) * maxNbElements;
}
void FocusedColumnModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool) const
void FocusedColumnModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const
{
std::vector<long> focusedIndexes;
......@@ -134,3 +132,8 @@ void FocusedColumnModuleImpl::addToContext(std::vector<std::vector<long>> & cont
}
}
// Allocates and registers the word embedding table; its row count matches
// the dictionary size passed in, its width the parsed input size.
void FocusedColumnModuleImpl::registerEmbeddings(std::size_t nbElements)
{
  auto embeddingOptions = torch::nn::EmbeddingOptions(nbElements, inSize);
  wordEmbeddings = register_module("embeddings", torch::nn::Embedding(embeddingOptions));
}
......@@ -79,7 +79,13 @@ std::vector<std::vector<long>> ModularNetworkImpl::extractContext(Config & confi
{
std::vector<std::vector<long>> context(1);
for (auto & mod : modules)
mod->addToContext(context, dict, config, mustSplitUnknown());
mod->addToContext(context, dict, config);
return context;
}
// Forwards the embedding registration to every submodule so each one can
// size its own embedding table to the dictionary.
void ModularNetworkImpl::registerEmbeddings(std::size_t nbElements)
{
  for (auto & submodule : modules)
    submodule->registerEmbeddings(nbElements);
}
......@@ -2,16 +2,6 @@
torch::Device NeuralNetworkImpl::device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
bool NeuralNetworkImpl::mustSplitUnknown() const
{
return splitUnknown;
}
void NeuralNetworkImpl::setSplitUnknown(bool splitUnknown)
{
this->splitUnknown = splitUnknown;
}
void NeuralNetworkImpl::setState(const std::string & state)
{
this->state = state;
......
......@@ -17,3 +17,7 @@ std::vector<std::vector<long>> RandomNetworkImpl::extractContext(Config &, Dict
return std::vector<std::vector<long>>{{0}};
}
// Intentional no-op: the random network holds no embedding tables, so
// there is nothing to register (parameter deliberately unnamed).
void RandomNetworkImpl::registerEmbeddings(std::size_t)
{
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment