Commit d217cf58 authored by Franck Dary's avatar Franck Dary
Browse files

Rare values are now treated as unknown values. Embedding sizes now exactly match the dict sizes.

parent 7982090e
......@@ -48,6 +48,7 @@ class Dict
void printEntry(std::FILE * file, int index, const std::string & entry, Encoding encoding) const;
std::size_t size() const;
int getNbOccs(int index) const;
void removeRareElements();
};
#endif
......@@ -173,3 +173,24 @@ int Dict::getNbOccs(int index) const
return nbOccs[index];
}
// Remove every entry whose occurrence count equals the minimum count observed
// in the dictionary, then reindex the surviving entries contiguously so that
// indexes stay dense (embedding tables can be sized exactly to the dict).
// NOTE(review): if all entries share the same occurrence count, this removes
// every entry — confirm this is intended for small/uniform dictionaries.
void Dict::removeRareElements()
{
  // Find the smallest occurrence count currently present.
  int minNbOcc = std::numeric_limits<int>::max();
  for (int nbOcc : nbOccs)
    if (nbOcc < minNbOcc)
      minNbOcc = nbOcc;

  std::unordered_map<std::string, int> newElementsToIndexes;
  std::vector<int> newNbOccs;
  // Reserve up front: at most all current entries survive, and this avoids
  // repeated reallocation while rebuilding.
  newNbOccs.reserve(nbOccs.size());

  // Keep only entries strictly above the minimum count, assigning them fresh
  // consecutive indexes (newElementsToIndexes.size() before each insertion).
  for (const auto & it : elementsToIndexes)
    if (nbOccs[it.second] > minNbOcc)
    {
      newElementsToIndexes.emplace(it.first, newElementsToIndexes.size());
      newNbOccs.emplace_back(nbOccs[it.second]);
    }

  // Swap in the compacted tables (avoids copying the temporaries back).
  elementsToIndexes.swap(newElementsToIndexes);
  nbOccs.swap(newNbOccs);
}
......@@ -9,7 +9,6 @@ void Decoder::decode(BaseConfig & config, std::size_t beamSize, bool debug, bool
{
torch::AutoGradMode useGrad(false);
machine.trainMode(false);
machine.splitUnknown(false);
machine.setDictsState(Dict::State::Closed);
machine.getStrategy().reset();
config.addPredicted(machine.getPredicted());
......
......@@ -47,7 +47,6 @@ class ReadingMachine
bool isPredicted(const std::string & columnName) const;
const std::set<std::string> & getPredicted() const;
void trainMode(bool isTrainMode);
void splitUnknown(bool splitUnknown);
void setDictsState(Dict::State state);
void saveBest() const;
void saveLast() const;
......
......@@ -21,8 +21,13 @@ ReadingMachine::ReadingMachine(std::filesystem::path path, std::vector<std::file
{
  readFromFile(path);

  // Load one Dict per file given in `dicts`, keyed by the file's stem.
  // Track the size of the largest dict so the embedding tables registered
  // below can cover every possible index of every dict.
  std::size_t maxDictSize = 0;
  // const reference: avoids copying a std::filesystem::path per iteration and
  // no longer shadows the constructor parameter `path`.
  for (const auto & dictPath : dicts)
  {
    auto dictName = dictPath.stem().string();
    this->dicts.emplace(dictName, Dict{dictPath.c_str(), Dict::State::Closed});
    maxDictSize = std::max<std::size_t>(maxDictSize, this->dicts.at(dictName).size());
  }

  // Embeddings must exist (and be correctly sized) before loading the saved
  // model weights into the network.
  classifier->getNN()->registerEmbeddings(maxDictSize);
  torch::load(classifier->getNN(), models[0]);
}
......@@ -182,11 +187,6 @@ void ReadingMachine::trainMode(bool isTrainMode)
classifier->getNN()->train(isTrainMode);
}
// Forward the split-unknown flag to the underlying neural network.
void ReadingMachine::splitUnknown(bool splitUnknown)
{
classifier->getNN()->setSplitUnknown(splitUnknown);
}
void ReadingMachine::setDictsState(Dict::State state)
{
for (auto & it : dicts)
......
......@@ -16,8 +16,7 @@ class ContextModuleImpl : public Submodule
std::vector<std::string> columns;
std::vector<int> bufferContext;
std::vector<int> stackContext;
int unknownValueThreshold;
std::vector<std::string> unknownValueColumns{"FORM", "LEMMA"};
int inSize;
public :
......@@ -25,7 +24,8 @@ class ContextModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(ContextModule);
......
......@@ -17,6 +17,7 @@ class DepthLayerTreeEmbeddingModuleImpl : public Submodule
std::vector<int> focusedStack;
torch::nn::Embedding wordEmbeddings{nullptr};
std::vector<std::shared_ptr<MyModule>> depthModules;
int inSize;
public :
......@@ -24,7 +25,8 @@ class DepthLayerTreeEmbeddingModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(DepthLayerTreeEmbeddingModule);
......
......@@ -16,6 +16,7 @@ class FocusedColumnModuleImpl : public Submodule
std::vector<int> focusedBuffer, focusedStack;
std::string column;
int maxNbElements;
int inSize;
public :
......@@ -23,7 +24,8 @@ class FocusedColumnModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(FocusedColumnModule);
......
......@@ -27,6 +27,7 @@ class ModularNetworkImpl : public NeuralNetworkImpl
ModularNetworkImpl(std::map<std::string,std::size_t> nbOutputsPerState, std::vector<std::string> definitions);
torch::Tensor forward(torch::Tensor input) override;
std::vector<std::vector<long>> extractContext(Config & config, Dict & dict) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
#endif
......@@ -13,7 +13,6 @@ class NeuralNetworkImpl : public torch::nn::Module
private :
bool splitUnknown{false};
std::string state;
protected :
......@@ -24,8 +23,7 @@ class NeuralNetworkImpl : public torch::nn::Module
virtual torch::Tensor forward(torch::Tensor input) = 0;
virtual std::vector<std::vector<long>> extractContext(Config & config, Dict & dict) const = 0;
bool mustSplitUnknown() const;
void setSplitUnknown(bool splitUnknown);
virtual void registerEmbeddings(std::size_t nbElements) = 0;
void setState(const std::string & state);
const std::string & getState() const;
};
......
......@@ -14,6 +14,7 @@ class RandomNetworkImpl : public NeuralNetworkImpl
RandomNetworkImpl(std::map<std::string,std::size_t> nbOutputsPerState);
torch::Tensor forward(torch::Tensor input) override;
std::vector<std::vector<long>> extractContext(Config &, Dict &) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
#endif
......@@ -14,6 +14,7 @@ class RawInputModuleImpl : public Submodule
torch::nn::Embedding wordEmbeddings{nullptr};
std::shared_ptr<MyModule> myModule{nullptr};
int leftWindow, rightWindow;
int inSize;
public :
......@@ -21,7 +22,8 @@ class RawInputModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(RawInputModule);
......
......@@ -14,6 +14,7 @@ class SplitTransModuleImpl : public Submodule
torch::nn::Embedding wordEmbeddings{nullptr};
std::shared_ptr<MyModule> myModule{nullptr};
int maxNbTrans;
int inSize;
public :
......@@ -21,7 +22,8 @@ class SplitTransModuleImpl : public Submodule
torch::Tensor forward(torch::Tensor input);
std::size_t getOutputSize() override;
std::size_t getInputSize() override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const override;
void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const override;
void registerEmbeddings(std::size_t nbElements) override;
};
TORCH_MODULE(SplitTransModule);
......
......@@ -16,8 +16,9 @@ class Submodule : public torch::nn::Module
void setFirstInputIndex(std::size_t firstInputIndex);
virtual std::size_t getOutputSize() = 0;
virtual std::size_t getInputSize() = 0;
virtual void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const = 0;
virtual void addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const = 0;
virtual torch::Tensor forward(torch::Tensor input) = 0;
virtual void registerEmbeddings(std::size_t nbElements) = 0;
};
#endif
......
......@@ -2,23 +2,21 @@
ContextModuleImpl::ContextModuleImpl(const std::string & definition)
{
std::regex regex("(?:(?:\\s|\\t)*)Unk\\{(.*)\\}(?:(?:\\s|\\t)*)Buffer\\{(.*)\\}(?:(?:\\s|\\t)*)Stack\\{(.*)\\}(?:(?:\\s|\\t)*)Columns\\{(.*)\\}(?:(?:\\s|\\t)*)(\\S+)\\{(.*)\\}(?:(?:\\s|\\t)*)In\\{(.*)\\}(?:(?:\\s|\\t)*)Out\\{(.*)\\}(?:(?:\\s|\\t)*)");
std::regex regex("(?:(?:\\s|\\t)*)Buffer\\{(.*)\\}(?:(?:\\s|\\t)*)Stack\\{(.*)\\}(?:(?:\\s|\\t)*)Columns\\{(.*)\\}(?:(?:\\s|\\t)*)(\\S+)\\{(.*)\\}(?:(?:\\s|\\t)*)In\\{(.*)\\}(?:(?:\\s|\\t)*)Out\\{(.*)\\}(?:(?:\\s|\\t)*)");
if (!util::doIfNameMatch(regex, definition, [this,&definition](auto sm)
{
try
{
unknownValueThreshold = std::stoi(sm.str(1));
for (auto & index : util::split(sm.str(2), ' '))
for (auto & index : util::split(sm.str(1), ' '))
bufferContext.emplace_back(std::stoi(index));
for (auto & index : util::split(sm.str(3), ' '))
for (auto & index : util::split(sm.str(2), ' '))
stackContext.emplace_back(std::stoi(index));
columns = util::split(sm.str(4), ' ');
columns = util::split(sm.str(3), ' ');
auto subModuleType = sm.str(5);
auto subModuleArguments = util::split(sm.str(6), ' ');
auto subModuleType = sm.str(4);
auto subModuleArguments = util::split(sm.str(5), ' ');
auto options = MyModule::ModuleOptions(true)
.bidirectional(std::stoi(subModuleArguments[0]))
......@@ -26,10 +24,8 @@ ContextModuleImpl::ContextModuleImpl(const std::string & definition)
.dropout(std::stof(subModuleArguments[2]))
.complete(std::stoi(subModuleArguments[3]));
int inSize = std::stoi(sm.str(7));
int outSize = std::stoi(sm.str(8));
wordEmbeddings = register_module("embeddings", torch::nn::Embedding(torch::nn::EmbeddingOptions(60000, inSize)));
inSize = std::stoi(sm.str(6));
int outSize = std::stoi(sm.str(7));
if (subModuleType == "LSTM")
myModule = register_module("myModule", LSTM(columns.size()*inSize, outSize, options));
......@@ -53,7 +49,7 @@ std::size_t ContextModuleImpl::getInputSize()
return columns.size()*(bufferContext.size()+stackContext.size());
}
void ContextModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool splitUnknown) const
void ContextModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const
{
std::vector<long> contextIndexes;
......@@ -79,11 +75,6 @@ void ContextModuleImpl::addToContext(std::vector<std::vector<long>> & context, D
for (auto & contextElement : context)
contextElement.push_back(dictIndex);
for (auto & targetCol : unknownValueColumns)
if (col == targetCol)
if (dict.getNbOccs(dictIndex) <= unknownValueThreshold)
context.back().back() = dict.getIndexOrInsert(Dict::unknownValueStr);
}
}
......@@ -96,3 +87,8 @@ torch::Tensor ContextModuleImpl::forward(torch::Tensor input)
return myModule->forward(context);
}
// Create this module's embedding table once the dictionary size is known,
// so the table has exactly one row per dictionary entry.
void ContextModuleImpl::registerEmbeddings(std::size_t nbElements)
{
  auto embeddingOptions = torch::nn::EmbeddingOptions(nbElements, inSize);
  wordEmbeddings = register_module("embeddings", torch::nn::Embedding(embeddingOptions));
}
......@@ -27,11 +27,9 @@ DepthLayerTreeEmbeddingModuleImpl::DepthLayerTreeEmbeddingModuleImpl(const std::
.dropout(std::stof(subModuleArguments[2]))
.complete(std::stoi(subModuleArguments[3]));
int inSize = std::stoi(sm.str(7));
inSize = std::stoi(sm.str(7));
int outSize = std::stoi(sm.str(8));
wordEmbeddings = register_module("embeddings", torch::nn::Embedding(torch::nn::EmbeddingOptions(60000, inSize)));
for (unsigned int i = 0; i < maxElemPerDepth.size(); i++)
{
std::string name = fmt::format("{}_{}", i, subModuleType);
......@@ -83,7 +81,7 @@ std::size_t DepthLayerTreeEmbeddingModuleImpl::getInputSize()
return inputSize;
}
void DepthLayerTreeEmbeddingModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool) const
void DepthLayerTreeEmbeddingModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const
{
std::vector<long> focusedIndexes;
......@@ -122,3 +120,8 @@ void DepthLayerTreeEmbeddingModuleImpl::addToContext(std::vector<std::vector<lon
}
}
// Instantiate the word-embedding table with one row per dictionary entry
// (`nbElements`) and `inSize` dimensions per row.
void DepthLayerTreeEmbeddingModuleImpl::registerEmbeddings(std::size_t nbElements)
{
  const auto options = torch::nn::EmbeddingOptions(nbElements, inSize);
  wordEmbeddings = register_module("embeddings", torch::nn::Embedding(options));
}
......@@ -25,11 +25,9 @@ FocusedColumnModuleImpl::FocusedColumnModuleImpl(const std::string & definition)
.dropout(std::stof(subModuleArguments[2]))
.complete(std::stoi(subModuleArguments[3]));
int inSize = std::stoi(sm.str(7));
inSize = std::stoi(sm.str(7));
int outSize = std::stoi(sm.str(8));
wordEmbeddings = register_module("embeddings", torch::nn::Embedding(torch::nn::EmbeddingOptions(60000, inSize)));
if (subModuleType == "LSTM")
myModule = register_module("myModule", LSTM(inSize, outSize, options));
else if (subModuleType == "GRU")
......@@ -61,7 +59,7 @@ std::size_t FocusedColumnModuleImpl::getInputSize()
return (focusedBuffer.size()+focusedStack.size()) * maxNbElements;
}
void FocusedColumnModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config, bool) const
void FocusedColumnModuleImpl::addToContext(std::vector<std::vector<long>> & context, Dict & dict, const Config & config) const
{
std::vector<long> focusedIndexes;
......@@ -134,3 +132,8 @@ void FocusedColumnModuleImpl::addToContext(std::vector<std::vector<long>> & cont
}
}
void FocusedColumnModuleImpl::registerEmbeddings(std::size_t nbElements)
{
wordEmbeddings = register_module("embeddings", torch::nn::Embedding(torch::nn::EmbeddingOptions(nbElements, inSize)));
}
......@@ -79,7 +79,13 @@ std::vector<std::vector<long>> ModularNetworkImpl::extractContext(Config & confi
{
std::vector<std::vector<long>> context(1);
for (auto & mod : modules)
mod->addToContext(context, dict, config, mustSplitUnknown());
mod->addToContext(context, dict, config);
return context;
}
// Propagate embedding registration to every submodule of the network so each
// one sizes its own embedding table to the dictionary.
void ModularNetworkImpl::registerEmbeddings(std::size_t nbElements)
{
  for (auto & submodule : modules)
    submodule->registerEmbeddings(nbElements);
}
......@@ -2,16 +2,6 @@
torch::Device NeuralNetworkImpl::device(torch::cuda::is_available() ? torch::kCUDA : torch::kCPU);
// Whether unknown values should be split during context extraction.
bool NeuralNetworkImpl::mustSplitUnknown() const
{
return splitUnknown;
}
// Set the flag returned by mustSplitUnknown().
void NeuralNetworkImpl::setSplitUnknown(bool splitUnknown)
{
this->splitUnknown = splitUnknown;
}
void NeuralNetworkImpl::setState(const std::string & state)
{
this->state = state;
......
......@@ -17,3 +17,7 @@ std::vector<std::vector<long>> RandomNetworkImpl::extractContext(Config &, Dict
return std::vector<std::vector<long>>{{0}};
}
// Intentional no-op: the random network has no embedding tables to register.
void RandomNetworkImpl::registerEmbeddings(std::size_t)
{
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment