diff --git a/MLP/include/MLP.hpp b/MLP/include/MLP.hpp index 479f403d1cf54f293c215b7d78c672d45115901f..68f1849caab821449cac0b1f0f5b18705c798861 100644 --- a/MLP/include/MLP.hpp +++ b/MLP/include/MLP.hpp @@ -81,15 +81,10 @@ class MLP private : - /// @brief The maximum number of parameters of the model. - static const unsigned int MAXLOOKUPSIZE = 200000; - /// @brief The Layers of the MLP. std::vector<Layer> layers; /// @brief The parameters corresponding to the layers of the MLP. std::vector< std::vector<dynet::Parameter> > parameters; - /// @brief The parameters corresponding to Dict values. - std::map< Dict*, std::pair<dynet::LookupParameter, std::map<void*, unsigned int> > > lookupParameters; /// @brief The dynet model containing the parameters to be trained. dynet::ParameterCollection model; @@ -172,7 +167,7 @@ class MLP public : - /// @brief Construct a new untrained MLP from a desired topology. + /// @brief initialize a new untrained MLP from a desired topology. /// /// topology example for 2 hidden layers : (150,RELU,0.3)(50,ELU,0.2)\n /// Of sizes 150 and 50, activation functions RELU and ELU, and dropout rates @@ -180,7 +175,9 @@ class MLP /// @param nbInputs The size of the input layer of the MLP. /// @param topology Description of each hidden Layer of the MLP. /// @param nbOutputs The size of the output layer of the MLP. - MLP(int nbInputs, const std::string & topology, int nbOutputs); + void init(int nbInputs, const std::string & topology, int nbOutputs); + /// @brief Construct a new MLP for training. + MLP(); /// @brief Read and construct a trained MLP from a file. /// /// The file must have been written by save. @@ -211,6 +208,10 @@ class MLP /// /// @return A pointer to the newly allocated trainer. dynet::Trainer * createTrainer(); + /// @brief Return the model. + /// + /// @return The model of this MLP. 
+ dynet::ParameterCollection & getModel(); }; #endif diff --git a/MLP/src/MLP.cpp b/MLP/src/MLP.cpp index 7b64414ce9ae7fa1eca0a3fac7cc78ede0c87907..e53c198e43b150e7f856b58d5dd8e822ef4e5caa 100644 --- a/MLP/src/MLP.cpp +++ b/MLP/src/MLP.cpp @@ -80,14 +80,20 @@ void MLP::initDynet() dynet::initialize(getDefaultParams()); } -MLP::MLP(int nbInputs, const std::string & topology, int nbOutputs) +MLP::MLP() { randomSeed = ProgramParameters::seed; + trainMode = true; + dropoutActive = true; + trainer.reset(createTrainer()); + initDynet(); +} + +void MLP::init(int nbInputs, const std::string & topology, int nbOutputs) +{ std::string topo = topology; std::replace(topo.begin(), topo.end(), '(', ' '); std::replace(topo.begin(), topo.end(), ')', ' '); - trainMode = true; - dropoutActive = true; auto groups = split(topo); for (auto group : groups) @@ -112,10 +118,6 @@ MLP::MLP(int nbInputs, const std::string & topology, int nbOutputs) layers.emplace_back(layers.back().output_dim, nbOutputs, 0.0, Activation::LINEAR); - trainer.reset(createTrainer()); - - initDynet(); - checkLayersCompatibility(); for(Layer layer : layers) @@ -225,48 +227,15 @@ dynet::DynetParams & MLP::getDefaultParams() dynet::Expression MLP::featValue2Expression(dynet::ComputationGraph & cg, const FeatureModel::FeatureValue & fv) { Dict * dict = fv.dict; - std::vector<float> * value = dict->getValue(*fv.value); - - auto entry = lookupParameters.find(dict); - - if(entry == lookupParameters.end()) - { - lookupParameters[dict].first = model.add_lookup_parameters(MAXLOOKUPSIZE, {(unsigned)dict->getDimension(),1}); - } - - auto & ptr2index = lookupParameters[dict].second; - auto & lu = lookupParameters[dict].first; - bool isConst = (fv.policy == FeatureModel::Policy::Final) || (dict->mode == Dict::Mode::OneHot); - auto it = ptr2index.find(value); - - if(it != ptr2index.end()) - { - if(isConst) - return dynet::const_lookup(cg, lu, it->second); - else - return dynet::lookup(cg, lu, it->second); - } - - 
ptr2index[value] = ptr2index.size(); - it = ptr2index.find(value); - - unsigned int lookupSize = (int)(*lu.values()).size(); - if(it->second >= lookupSize) - { - fprintf(stderr, "ERROR (%s) : MAXLOOKUPSIZE (%d) is too small. Aborting.\n", ERRINFO, MAXLOOKUPSIZE); - exit(1); - } - - // Horrible trick : directly set Dict data as Tensor values - // Works only on CPU - (*lu.values())[it->second].v = value->data(); + auto & lu = dict->getLookupParameter(); + unsigned int index = dict->getValue(*fv.value); if(isConst) - return dynet::const_lookup(cg, lu, it->second); + return dynet::const_lookup(cg, lu, index); else - return dynet::lookup(cg, lu, it->second); + return dynet::lookup(cg, lu, index); } dynet::Expression MLP::run(dynet::ComputationGraph & cg, dynet::Expression x) @@ -432,3 +401,8 @@ void MLP::printTopology(FILE * output) fprintf(output, ")\n"); } +dynet::ParameterCollection & MLP::getModel() +{ + return model; +} + diff --git a/decoder/include/Decoder.hpp b/decoder/include/Decoder.hpp index dc8bef588841a0b6cb99a1a30ebcce3f6de5b0fb..b905f6eb39512496bb403109281f3d22541c4558 100644 --- a/decoder/include/Decoder.hpp +++ b/decoder/include/Decoder.hpp @@ -7,7 +7,6 @@ #define DECODER__H #include "TransitionMachine.hpp" -#include "BD.hpp" #include "Config.hpp" /// @brief A simple object capable of using a trained TransitionMachine to process a given BD. @@ -17,8 +16,6 @@ class Decoder /// @brief The trained TransitionMachine TransitionMachine & tm; - /// @brief The BD we need to fill - BD & bd; /// @brief The current configuration of the TransitionMachine Config & config; /// @brief is true, decode will print infos on stderr @@ -34,7 +31,7 @@ class Decoder /// @param bd The BD we need to fill /// @param config The current configuration of the TransitionMachine /// @param debugMode If true, infos will be printed on stderr. 
- Decoder(TransitionMachine & tm, BD & bd, Config & config, bool debugMode); + Decoder(TransitionMachine & tm, Config & config, bool debugMode); /// @brief Fill bd using tm. void decode(); }; diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp index 78acd26fc22e0a76104b91a34354a64b8dc9c8ba..2bf8d89664409a2082dd2c578591dc1ab999b6d7 100644 --- a/decoder/src/Decoder.cpp +++ b/decoder/src/Decoder.cpp @@ -1,8 +1,8 @@ #include "Decoder.hpp" #include "util.hpp" -Decoder::Decoder(TransitionMachine & tm, BD & bd, Config & config, bool debugMode) -: tm(tm), bd(bd), config(config) +Decoder::Decoder(TransitionMachine & tm, Config & config, bool debugMode) +: tm(tm), config(config) { this->debugMode = debugMode; } diff --git a/decoder/src/macaon_decode.cpp b/decoder/src/macaon_decode.cpp index ab4ecb55d61c7e4fa3a9d5eb5c3df7c4c2dc5451..5611e373007f34f83e52aedd0447c1ca01d5c87d 100644 --- a/decoder/src/macaon_decode.cpp +++ b/decoder/src/macaon_decode.cpp @@ -116,7 +116,7 @@ int main(int argc, char * argv[]) Config config(bd); config.readInput(ProgramParameters::input); - Decoder decoder(tapeMachine, bd, config, ProgramParameters::debug); + Decoder decoder(tapeMachine, config, ProgramParameters::debug); decoder.decode(); diff --git a/maca_common/include/Dict.hpp b/maca_common/include/Dict.hpp index 68de3278de280fab70e74ea837bd9e48824bfb38..cdf795846c4988b923474db0e1e91101dbf729f3 100644 --- a/maca_common/include/Dict.hpp +++ b/maca_common/include/Dict.hpp @@ -12,12 +12,13 @@ #include <set> #include <memory> #include <fasttext/fasttext.h> +#include <dynet/dynet.h> /// @brief Maps strings to real vectors (embeddings or one-hot encoded). /// -/// Each entry of the Dict is a pair of string / vector.\n -/// The strings can be seen as features, while the vector are the numerical value -/// of this feature. +/// Each entry of the Dict is a pair of string / Index.\n +/// The strings can be seen as features, while the index +/// points to the corresponding embedding value. 
class Dict { public : @@ -94,10 +95,15 @@ class Dict private : + /// @brief The maximum number of entries a Dict can hold. + /// + /// This limit exists because the dynet LookupParameter associated with the Dict is not dynamic and must come with a fixed finite size. + static constexpr unsigned int MAX_CAPACITY = 200000; /// @brief The dimension of each vector of this Dict (in number of float). int dimension; - /// @brief A storage that map every string entry of this Dict to its vector value - std::map< std::string, std::vector<float> > str2vec; + /// @brief A storage that maps every string entry of this Dict to its index as a lookup parameter. + std::map<std::string, unsigned int> str2index; + dynet::LookupParameter lookupParameter; /// @brief The filename to which this Dict will be saved. std::string filename; /// @brief When in OneHot Mode, the Dict uses this variable to initialize new entries' vector to a unique OneHot value. @@ -121,6 +127,7 @@ class Dict /// When getDict is called it will find the requested Dict here, /// or construct it. static std::map< std::string, std::unique_ptr<Dict> > str2dict; + bool isInit; private : @@ -129,30 +136,36 @@ class Dict /// @param s An entry of this Dict. /// /// @return The FastText's pre-trained word embedding value corresponding to s. - std::vector<float> * getValueFasttext(const std::string & s); + unsigned int getValueFasttext(const std::string & s); /// @brief Initialize a new embedding for this Dict. /// /// This function zero-initialize the embedding. /// @param s The entry whose embedding we will initialize. - /// @param vec The vector containing the embedding we will initialize. - void initEmbedding(const std::string & s, std::vector<float> & vec); + /// @param index The index of the entry. + void initParameterAsEmbedding(const std::string & s, unsigned int index); + /// @brief Initialize a parameter as a one hot value + /// + /// @param s The entry whose embedding we will initialize. 
+ /// @param index The index of the parameter to initialize. + void initParameterAsOneHot(const std::string & s, unsigned int index); /// @brief Randomly initialize a new embedding for this Dict. /// /// Each float value of vec will be randomly set between -1.0 and +1.0 - /// @param vec The vector containing the embedding we will initialize. - void initEmbeddingRandom(std::vector<float> & vec); + /// @param index The index of the parameter to initialize. + void initEmbeddingRandom(unsigned int index); /// @brief Initialize a new embedding using FastText's value for this string. /// /// @param s The entry whose embedding we will initialize. - /// @param vec The vector containing the embedding we will initialize. - void initEmbeddingFromFasttext(const std::string & s, std::vector<float> & vec); + /// @param index The index of the parameter. + void initEmbeddingFromFasttext(const std::string & s, unsigned int index); /// @brief Add a new entry to this Dict. /// /// @param s The new entry to add. /// - /// @return The vector corresponding to the newly added entry. - std::vector<float> * addEntry(const std::string & s); - + /// @return The lookupParameter index of the newly added entry. + unsigned int addEntry(const std::string & s); + void init(dynet::ParameterCollection & pc); + void initFromFile(dynet::ParameterCollection & pc); /// @brief Read and construct a new Dict from a file. /// /// @param policy The Policy of the new Dict. @@ -174,6 +187,8 @@ class Dict /// /// @return A pointer to an entry matching the value of s. const std::string * getStrFasttext(const std::string & s); + /// @brief Create file. + void createFile(); public : @@ -203,16 +218,23 @@ class Dict /// @param directory The directory in which we will save every Dict. /// @param namePrefix The prefix of the name of the dicts we need to save. static void saveDicts(const std::string & directory, const std::string & namePrefix); + /// @brief Create a file for every Dict matching the prefix. 
+ /// + /// @param directory The directory in which the files will be saved. + /// @param namePrefix The prefix that Dict must match. + static void createFiles(const std::string & directory, const std::string & namePrefix); + static void initDicts(dynet::ParameterCollection & pc, const std::string & namePrefix); + static void initDictsFromFile(dynet::ParameterCollection & pc, const std::string & namePrefix); /// @brief Delete all Dicts. static void deleteDicts(); /// @brief Save the current Dict in the corresponding file. void save(); - /// @brief Get the vector value of an entry. + /// @brief Get the index. /// /// @param s The entry which value we need. /// - /// @return The vector value corresponding to s. - std::vector<float> * getValue(const std::string & s); + /// @return The index of the lookupParameter corresponding to s. + unsigned int getValue(const std::string & s); /// @brief Get a pointer to the entry matching s. /// /// This is used when we need a permanent pointer to a string matching s, @@ -222,14 +244,14 @@ class Dict /// /// @return A pointer to an entry matching the value of s. const std::string * getStr(const std::string & s); - /// @brief Get the vector value of unknown values. + /// @brief Get the index of unknown values. /// - /// @return Pointer to the vector value of the entry unknownValueStr. - std::vector<float> * getUnknownValue(); - /// @brief Get the vector value of null values. + /// @return Index of the entry unknownValueStr. + unsigned int getUnknownValue(); + /// @brief Get the index of null values. /// - /// @return Pointer to the vector value of the entry nullValueStr. - std::vector<float> * getNullValue(); + /// @return Index of the entry nullValueStr. + unsigned int getNullValue(); /// @brief Get the dimmension of the vectors of this Dict. /// /// @return Dimmension of the vectors of this Dict. @@ -238,6 +260,7 @@ class Dict /// /// @param output The FILE to which the Dict will be printed. 
void printForDebug(FILE * output); + dynet::LookupParameter & getLookupParameter(); }; #endif diff --git a/maca_common/include/ProgramParameters.hpp b/maca_common/include/ProgramParameters.hpp index 0ee0d9df5f9829a94b4f0081c8252e0cff7f5af1..e47dd6959084148a2361b1232b680a75e9864283 100644 --- a/maca_common/include/ProgramParameters.hpp +++ b/maca_common/include/ProgramParameters.hpp @@ -44,6 +44,7 @@ struct ProgramParameters static bool showFeatureRepresentation; static int iterationSize; static int nbTrain; + static bool randomEmbeddings; private : diff --git a/maca_common/include/util.hpp b/maca_common/include/util.hpp index a3540d17beacc839b06437e5eacc3a5c0d1fea15..fc012e7f77bbc0fe3dfe25e51f0dc2497c318182 100644 --- a/maca_common/include/util.hpp +++ b/maca_common/include/util.hpp @@ -155,6 +155,13 @@ void printColumns(FILE * output, const std::vector< std::vector<std::string> > & /// /// @return The formated string. std::string float2str(float f, const char * format); +/// @brief Return a version of s not suffixed by suffix. +/// +/// @param s The base string. +/// @param suffix The suffix to remove. +/// +/// @return s without suffix at the end. +std::string removeSuffix(const std::string & s, const std::string & suffix); /// @brief Macro giving informations about an error. #define ERRINFO (getFilenameFromPath(std::string(__FILE__))+ ":l." 
+ std::to_string(__LINE__)).c_str() diff --git a/maca_common/src/Dict.cpp b/maca_common/src/Dict.cpp index 3e6f655c673b9f06c466166029704488ce57ec86..4096b470a8ee55c3b73059880ee133da00f65d53 100644 --- a/maca_common/src/Dict.cpp +++ b/maca_common/src/Dict.cpp @@ -2,6 +2,7 @@ #include "File.hpp" #include "util.hpp" #include "ProgramParameters.hpp" +#include <dynet/io.h> std::string Dict::currentClassifierName; std::string Dict::nullValueStr = "_nullVALUEstr_"; @@ -46,6 +47,7 @@ const char * Dict::mode2str(Mode mode) Dict::Dict(const std::string & name, int dimension, Mode mode) { + this->isInit = false; this->policy = Policy::FromZero; this->filename = name; this->name = name; @@ -53,12 +55,32 @@ Dict::Dict(const std::string & name, int dimension, Mode mode) this->mode = mode; this->dimension = dimension; +} + +Dict::Dict(Policy policy, const std::string & filename) +{ + this->isInit = false; + this->policy = policy; + this->filename = filename; + this->oneHotIndex = 0; + this->name = removeSuffix(getFilenameFromPath(filename), ".dict"); +} +void Dict::init(dynet::ParameterCollection & pc) +{ + if (isInit) + { + fprintf(stderr, "ERROR (%s) : Dict is already initialized. Aborting.\n", ERRINFO); + exit(1); + } + + isInit = true; + this->lookupParameter = pc.add_lookup_parameters(MAX_CAPACITY, {(unsigned int)dimension}); addEntry(nullValueStr); addEntry(unknownValueStr); } -Dict::Dict(Policy policy, const std::string & filename) +void Dict::initFromFile(dynet::ParameterCollection & pc) { auto badFormatAndAbort = [&](std::string errInfo) { @@ -66,12 +88,14 @@ Dict::Dict(Policy policy, const std::string & filename) exit(1); }; - this->policy = policy; - this->filename = filename; - this->oneHotIndex = 0; + if (isInit) + { + fprintf(stderr, "ERROR (%s) : Dict is already initialized. 
Aborting.\n", ERRINFO); + exit(1); + } - File file(filename, "r"); - FILE * fd = file.getDescriptor(); + File * file = new File(filename, "r"); + FILE * fd = file->getDescriptor(); char b1[1024]; char b2[1024]; @@ -82,8 +106,7 @@ Dict::Dict(Policy policy, const std::string & filename) name = b1; mode = str2mode(b2); - addEntry(nullValueStr); - addEntry(unknownValueStr); + isInit = true; // If a fasttext pretrained embedding file is specified if(fscanf(fd, "Fasttext : %s\n", b1) == 1) @@ -107,89 +130,107 @@ Dict::Dict(Policy policy, const std::string & filename) if(this->policy == Policy::FromZero) return; - while(fscanf(fd, "\1%[^\1]\1", b1) == 1) - { - std::string entry(b1); + int readIndex; + while (fscanf(fd, "\1%[^\1]\1%d\n", b1, &readIndex) == 2) + str2index[b1] = readIndex; - str2vec[entry] = std::vector<float>(); - auto & vec = str2vec[entry]; + delete file; - // For OneHot we only write the index - if(mode == Mode::Embeddings) - for(int i = 0; i < dimension; i++) - { - float value; - if(fscanf(fd, "%f", &value) != 1) - badFormatAndAbort(ERRINFO); - vec.emplace_back(value); - } - else - { - int index; - if(fscanf(fd, "%d", &index) != 1) - badFormatAndAbort(ERRINFO); + dynet::TextFileLoader loader(filename); + lookupParameter = loader.load_lookup_param(pc, "lookup"); +} - vec.resize(dimension, 0.0); - vec[index] = 1.0; +void Dict::saveDicts(const std::string & directory, const std::string & namePrefix) +{ + for (auto & it : str2dict) + { + if(!strncmp(it.first.c_str(), namePrefix.c_str(), namePrefix.size())) + { + it.second->filename = directory + it.second->name + ".dict"; + it.second->save(); } - if(fscanf(fd, "%*[^\n]\n")) - badFormatAndAbort(ERRINFO); } } -void Dict::saveDicts(const std::string & directory, const std::string & namePrefix) +void Dict::createFiles(const std::string & directory, const std::string & namePrefix) { for (auto & it : str2dict) { if(!strncmp(it.first.c_str(), namePrefix.c_str(), namePrefix.size())) { it.second->filename = 
directory + it.second->name + ".dict"; - it.second->save(); + it.second->createFile(); + } + } +} + +void Dict::initDicts(dynet::ParameterCollection & pc, const std::string & namePrefix) +{ + for (auto & it : str2dict) + { + if(!strncmp(it.first.c_str(), namePrefix.c_str(), namePrefix.size())) + { + it.second->init(pc); + } + } +} + +void Dict::initDictsFromFile(dynet::ParameterCollection & pc, const std::string & namePrefix) +{ + for (auto & it : str2dict) + { + if(!strncmp(it.first.c_str(), namePrefix.c_str(), namePrefix.size())) + { + it.second->initFromFile(pc); } } } +void Dict::createFile() +{ + // If policy is Final, we didn't change any entry so no need to rewrite the file + if (policy == Policy::Final) + return; + + File * file = new File(filename, "w"); + FILE * fd = file->getDescriptor(); + + fprintf(fd, "%s\n%d\n%s\n", name.c_str(), dimension, mode2str(mode)); + + if(ftEmbeddings.get()) + fprintf(fd, "Fasttext : %s\n", ftFilename.c_str()); + + delete file; +} + void Dict::save() { // If policy is Final, we didn't change any entry so no need to rewrite the file if (policy == Policy::Final) return; - File file(filename, "w"); - FILE * fd = file.getDescriptor(); + File * file = new File(filename, "w"); + FILE * fd = file->getDescriptor(); fprintf(fd, "%s\n%d\n%s\n", name.c_str(), dimension, mode2str(mode)); if(ftEmbeddings.get()) fprintf(fd, "Fasttext : %s\n", ftFilename.c_str()); - for(auto & it : str2vec) - { - fprintf(fd, "\1%s\1\t", it.first.c_str()); + for (auto & it : str2index) + fprintf(fd, "\1%s\1%u\n", it.first.c_str(), it.second); - // For OneHot we only write the index - if(mode == Mode::Embeddings) - for(float value : it.second) - fprintf(fd, "%f\t", value); - else - { - for(unsigned int index = 0; index < it.second.size(); index++) - if(it.second[index] > 0.1) - { - fprintf(fd, "%d", index); - break; - } - } + delete file; - fprintf(fd, "\n"); - } + dynet::TextFileSaver s(filename, true); + s.save(lookupParameter, "lookup"); } 
-std::vector<float> * Dict::getValue(const std::string & s) +unsigned int Dict::getValue(const std::string & s) { - auto it = str2vec.find(s); - if(it != str2vec.end()) - return &(it->second); + auto it = str2index.find(s); + if(it != str2index.end()) + return it->second; if(policy == Policy::Final) { @@ -202,30 +243,18 @@ std::vector<float> * Dict::getValue(const std::string & s) return addEntry(s); } -std::vector<float> * Dict::getValueFasttext(const std::string & s) +unsigned int Dict::getValueFasttext(const std::string &) { - auto it = ftVocab.find(s); - if(it != ftVocab.end()) - return &(it->second); - - if(s.empty()) - { - fprintf(stderr, "ERROR (%s) : dict \'%s\' was asked to store an empty entry. Aborting.\n", ERRINFO, name.c_str()); - exit(1); - } - - ftVocab.emplace(s, std::vector<float>(dimension, 0.0)); - auto & vec = ftVocab[s]; - ftEmbeddings->getWordVector(*ftVector.get(), s); - memcpy(vec.data(), ftVector.get()->data(), dimension * sizeof vec[0]); + fprintf(stderr, "ERROR (%s) : Not implemented.\n", ERRINFO); + exit(1); - return &vec; + return 0; } const std::string * Dict::getStr(const std::string & s) { - auto it = str2vec.find(s); - if(it != str2vec.end()) + auto it = str2index.find(s); + if(it != str2index.end()) return &(it->first); if(policy == Policy::Final) @@ -237,7 +266,7 @@ const std::string * Dict::getStr(const std::string & s) addEntry(s); - it = str2vec.find(s); + it = str2index.find(s); return &(it->first); } @@ -263,17 +292,24 @@ const std::string * Dict::getStrFasttext(const std::string & s) return &(it->first); } -void Dict::initEmbedding(const std::string & s, std::vector<float> & vec) +void Dict::initParameterAsOneHot(const std::string &, unsigned int index) +{ + std::vector<float> vec(dimension, 0.0); + vec[index] = 1.0; + lookupParameter.initialize(index, vec); +} + +void Dict::initParameterAsEmbedding(const std::string & s, unsigned int index) { - vec[0] = 0.0; // just to shut warning up - // Here initialize a new embedding, 
doing nothing = all zeroes - //initEmbeddingRandom(vec); + if (ProgramParameters::randomEmbeddings) + initEmbeddingRandom(index); if(ftEmbeddings.get()) - initEmbeddingFromFasttext(s, vec); + initEmbeddingFromFasttext(s, index); } -void Dict::initEmbeddingRandom(std::vector<float> & vec) +void Dict::initEmbeddingRandom(unsigned int index) { + std::vector<float> vec(dimension, 0.0); int range = 1; for (auto & val : vec) @@ -284,48 +320,57 @@ void Dict::initEmbeddingRandom(std::vector<float> & vec) result += decimal; val = result; } + + lookupParameter.initialize(index, vec); } -void Dict::initEmbeddingFromFasttext(const std::string & s, std::vector<float> & vec) +void Dict::initEmbeddingFromFasttext(const std::string & s, unsigned int index) { ftEmbeddings->getWordVector(*ftVector.get(), s); - memcpy(vec.data(), ftVector.get()->data(), dimension * sizeof vec[0]); + lookupParameter.initialize(index, *(std::vector<float>*)ftVector.get()); } -std::vector<float> * Dict::getUnknownValue() +unsigned int Dict::getUnknownValue() { - return &str2vec[unknownValueStr]; + return str2index[unknownValueStr]; } -std::vector<float> * Dict::getNullValue() +unsigned int Dict::getNullValue() { - return &str2vec[nullValueStr]; + return str2index[nullValueStr]; } -std::vector<float> * Dict::addEntry(const std::string & s) +unsigned int Dict::addEntry(const std::string & s) { + if (!isInit) + { + fprintf(stderr, "ERROR (%s) : dict \'%s\' is not initialized. Aborting.\n", ERRINFO, name.c_str()); + exit(1); + } + if(s.empty()) { fprintf(stderr, "ERROR (%s) : dict \'%s\' was asked to store an empty entry. 
Aborting.\n", ERRINFO, name.c_str()); exit(1); } - str2vec.emplace(s, std::vector<float>(dimension, 0.0)); - auto & vec = str2vec[s]; + auto index = str2index.size(); + str2index.emplace(s, index); if(mode == Mode::OneHot) { if(oneHotIndex >= dimension) fprintf(stderr, "WARNING (%s) : Dict %s of dimension %d is asked to store %d elements in one-hot.\n", ERRINFO, name.c_str(), dimension, oneHotIndex+1); else - vec[oneHotIndex] = 1.0; - - oneHotIndex++; + { + initParameterAsOneHot(s, index); + oneHotIndex++; + } } else - initEmbedding(s, vec); + initParameterAsEmbedding(s, index); - return &vec; + return index; } Dict * Dict::getDict(Policy policy, const std::string & filename) @@ -338,7 +383,7 @@ Dict * Dict::getDict(Policy policy, const std::string & filename) str2dict.insert(std::make_pair(dict->name, std::unique_ptr<Dict>(dict))); - return str2dict[filename].get(); + return str2dict[dict->name].get(); } Dict * Dict::getDict(const std::string & name) @@ -413,7 +458,7 @@ int Dict::getDimension() void Dict::printForDebug(FILE * output) { - fprintf(output, "Dict name \'%s\' nbElems = %lu\n", name.c_str(), str2vec.size()); + fprintf(output, "Dict name \'%s\' nbElems = %lu\n", name.c_str(), str2index.size()); } void Dict::deleteDicts() @@ -421,3 +466,8 @@ void Dict::deleteDicts() str2dict.clear(); } +dynet::LookupParameter & Dict::getLookupParameter() +{ + return lookupParameter; +} + diff --git a/maca_common/src/ProgramParameters.cpp b/maca_common/src/ProgramParameters.cpp index b505d043058734022436c33fea72cd8fdf6cde22..e5835a6b594dc24e4d293740c2109ed22d8cb99b 100644 --- a/maca_common/src/ProgramParameters.cpp +++ b/maca_common/src/ProgramParameters.cpp @@ -37,5 +37,6 @@ bool ProgramParameters::interactive; int ProgramParameters::dynamicEpoch; float ProgramParameters::dynamicProbability; bool ProgramParameters::showFeatureRepresentation; +bool ProgramParameters::randomEmbeddings; int ProgramParameters::iterationSize; int ProgramParameters::nbTrain; diff --git 
a/maca_common/src/util.cpp b/maca_common/src/util.cpp index 368cec7d2494ce6901cbe311ac971129bf6c7e3f..6d8fc121e8a0f7742aa8a34768f660c6df20780b 100644 --- a/maca_common/src/util.cpp +++ b/maca_common/src/util.cpp @@ -349,3 +349,14 @@ bool isNum(const std::string & s) return true; } +std::string removeSuffix(const std::string & s, const std::string & suffix) +{ +  // Strip suffix only when the whole of it matches the end of s. +  bool endsWith = s.size() >= suffix.size() && s.compare(s.size()-suffix.size(), suffix.size(), suffix) == 0; + +  if (!endsWith) +    return s; + +  return std::string(s.begin(), s.end()-suffix.size()); +} + diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp index e9a7f751fc43c4964e75729a32bedf923c1f2adc..7b255df6aad6c554ab7f6aa6e208957d8655293b 100644 --- a/trainer/src/Trainer.cpp +++ b/trainer/src/Trainer.cpp @@ -94,7 +94,7 @@ std::map<std::string, float> Trainer::getScoreOnDev() void Trainer::train() { - Dict::saveDicts(ProgramParameters::expPath, ""); + Dict::createFiles(ProgramParameters::expPath, ""); fprintf(stderr, "Training of \'%s\' :\n", tm.name.c_str()); diff --git a/trainer/src/macaon_train.cpp b/trainer/src/macaon_train.cpp index 69d608100a5d707f17510be3512d42a0fffd01d3..0c37b792e13cdef159bb870bd22e8d9ca1546e7e 100644 --- a/trainer/src/macaon_train.cpp +++ b/trainer/src/macaon_train.cpp @@ -62,6 +62,8 @@ po::options_description getOptionsDescription() "For each state of the Config, show its feature representation") ("interactive", po::value<bool>()->default_value(true), "Is the shell interactive ? 
Display advancement informations") + ("randomEmbeddings", po::value<bool>()->default_value(false), + "When activated, the embeddings will be randomly initialized") ("shuffle", po::value<bool>()->default_value(true), "Shuffle examples after each iteration"); @@ -160,7 +162,7 @@ do\n\ ARGS=\"$ARGS $arg\"\n\ done\n\ \n\ -macaon_decode --lang $LANG --tm machine.tm --bd test.bd -I $INPUT --mcd $MCD --expName " + ProgramParameters::expName + "$ARGS"; +macaon_decode --lang " + ProgramParameters::lang + " --tm machine.tm --bd test.bd -I $INPUT --mcd $MCD --expName " + ProgramParameters::expName + "$ARGS"; if (system(("rm -r " + ProgramParameters::expPath + " 2> /dev/null").c_str())){} if (system(("mkdir " + ProgramParameters::expPath).c_str())){} @@ -239,6 +241,7 @@ int main(int argc, char * argv[]) ProgramParameters::removeDuplicates = vm["duplicates"].as<bool>(); ProgramParameters::interactive = vm["interactive"].as<bool>(); ProgramParameters::shuffleExamples = vm["shuffle"].as<bool>(); + ProgramParameters::randomEmbeddings = vm["randomEmbeddings"].as<bool>(); ProgramParameters::learningRate = vm["lr"].as<float>(); ProgramParameters::beta1 = vm["b1"].as<float>(); ProgramParameters::beta2 = vm["b2"].as<float>(); diff --git a/transition_machine/include/TransitionMachine.hpp b/transition_machine/include/TransitionMachine.hpp index a990d63fea9d29791fb79ead066515353071ac73..d9ceaae4e114c8c819718127c36a0ebf89eeae55 100644 --- a/transition_machine/include/TransitionMachine.hpp +++ b/transition_machine/include/TransitionMachine.hpp @@ -18,7 +18,7 @@ class TransitionMachine { public : - class State; + struct State; /// @brief A Transition from one state to another. 
struct Transition diff --git a/transition_machine/src/Classifier.cpp b/transition_machine/src/Classifier.cpp index b26490b363f1386910400e042d1d6c0bc471ae5e..3816df5a98a954d0b16e85c4e0a90f52451f53d8 100644 --- a/transition_machine/src/Classifier.cpp +++ b/transition_machine/src/Classifier.cpp @@ -123,18 +123,23 @@ void Classifier::initClassifier(Config & config) if(!trainMode) { mlp.reset(new MLP(ProgramParameters::expPath + name + ".model")); + Dict::initDictsFromFile(mlp->getModel(), name); return; } - int nbInputs = 0; - int nbOutputs = as->actions.size(); + mlp.reset(new MLP()); + + Dict::initDicts(mlp->getModel(), name); auto fd = fm->getFeatureDescription(config); + int nbInputs = 0; + int nbOutputs = as->actions.size(); + for (auto feat : fd.values) nbInputs += feat.dict->getDimension(); - mlp.reset(new MLP(nbInputs, topology, nbOutputs)); + mlp->init(nbInputs, topology, nbOutputs); } FeatureModel::FeatureDescription Classifier::getFeatureDescription(Config & config) diff --git a/transition_machine/src/FeatureModel.cpp b/transition_machine/src/FeatureModel.cpp index 4e671a920a32679ddb129a5fe352222fe04aa00f..52fa264416c2910df232938e95e6d18b4ff5a39d 100644 --- a/transition_machine/src/FeatureModel.cpp +++ b/transition_machine/src/FeatureModel.cpp @@ -46,10 +46,13 @@ std::string FeatureModel::FeatureValue::toString() { std::string result; - auto realVector = dict->getValue(*value); + unsigned int index = dict->getValue(*value); + float * realVector = (*dict->getLookupParameter().values())[index].batch_ptr(0); - for (auto & f : *realVector) - result += " " + float2str(f, "%5.2f"); + unsigned int dim = dict->getDimension(); + + for (unsigned int i = 0; i < dim; i++) + result += " " + float2str(realVector[i], "%5.2f"); return result; }