FocusedColumnModule.cpp
    #include "FocusedColumnModule.hpp"
    
    FocusedColumnModuleImpl::FocusedColumnModuleImpl(std::string name, const std::string & definition, std::filesystem::path path) : path(path)
    {
      setName(name);
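      // The definition must match the regex below; a hypothetical example
      // (field values are illustrative, not taken from a real model file):
      //   Column{lower:FORM} NbElem{5} Buffer{-2 -1 0} Stack{0 1} LSTM{1 1 0.3 1} In{64} Out{128} w2v{}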
      std::regex regex("(?:(?:\\s|\\t)*)Column\\{(.*)\\}(?:(?:\\s|\\t)*)NbElem\\{(.*)\\}(?:(?:\\s|\\t)*)Buffer\\{(.*)\\}(?:(?:\\s|\\t)*)Stack\\{(.*)\\}(?:(?:\\s|\\t)*)(\\S+)\\{(.*)\\}(?:(?:\\s|\\t)*)In\\{(.*)\\}(?:(?:\\s|\\t)*)Out\\{(.*)\\}(?:(?:\\s|\\t)*)w2v\\{(.*)\\}(?:(?:\\s|\\t)*)");
      if (!util::doIfNameMatch(regex, definition, [this,&definition](auto sm)
            {
              try
              {
                func = getFunction(sm.str(1));
                column = util::split(sm.str(1), ':').back();
                maxNbElements = std::stoi(sm.str(2));
    
                for (auto & index : util::split(sm.str(3), ' '))
                  focusedBuffer.emplace_back(std::stoi(index));
    
                for (auto & index : util::split(sm.str(4), ' '))
                  focusedStack.emplace_back(std::stoi(index));
    
                auto subModuleType = sm.str(5);
                auto subModuleArguments = util::split(sm.str(6), ' ');
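            // The four whitespace-separated submodule arguments map, in order,
            // to: bidirectional, num_layers, dropout, complete
            // (e.g. "1 1 0.3 1" -- these example values are an assumption).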
    
                auto options = MyModule::ModuleOptions(true)
                  .bidirectional(std::stoi(subModuleArguments[0]))
                  .num_layers(std::stoi(subModuleArguments[1]))
                  .dropout(std::stof(subModuleArguments[2]))
                  .complete(std::stoi(subModuleArguments[3]));
    
                inSize = std::stoi(sm.str(7));
                int outSize = std::stoi(sm.str(8));
    
                if (subModuleType == "LSTM")
                  myModule = register_module("myModule", LSTM(inSize, outSize, options));
                else if (subModuleType == "GRU")
                  myModule = register_module("myModule", GRU(inSize, outSize, options));
                else if (subModuleType == "Concat")
                  myModule = register_module("myModule", Concat(inSize));
                else
                  util::myThrow(fmt::format("unknown sumodule type '{}'", subModuleType));
    
                w2vFiles = sm.str(9);
    
                if (!w2vFiles.empty())
                {
                  auto pathes = util::split(w2vFiles.string(), ' ');
                  for (auto & p : pathes)
                  {
                    auto splited = util::split(p, ',');
                    if (splited.size() != 2)
                      util::myThrow("expected 'prefix,pretrained.w2v'");
                    auto pretrained = getDict().loadWord2Vec(this->path / splited[1], splited[0]);
                    if (pretrained)
                    {
                      getDict().setState(Dict::State::Closed);
                      dictSetPretrained(true);
                    }
                  }
                }
    
              } catch (std::exception & e) {util::myThrow(fmt::format("{} in '{}'",e.what(),definition));}
            }))
        util::myThrow(fmt::format("invalid definition '{}'", definition));
    }
    
    torch::Tensor FocusedColumnModuleImpl::forward(torch::Tensor input)
    {
      std::vector<torch::Tensor> outputs;
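      // Shape sketch (assuming input is the batch of context rows filled by
      // addToContext): each narrow(...) selects the {batch, maxNbElements}
      // slice of dict indexes for one focused element, wordEmbeddings lifts it
      // to {batch, maxNbElements, inSize}, and the submodule's output is
      // flattened to {batch, -1} before the final concatenation.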
      for (unsigned int i = 0; i < focusedBuffer.size()+focusedStack.size(); i++)
        outputs.emplace_back(myModule->forward(wordEmbeddings(input.narrow(1, firstInputIndex+i*maxNbElements, maxNbElements))).reshape({input.size(0), -1}));
    
      return torch::cat(outputs, 1);
    }
    
    std::size_t FocusedColumnModuleImpl::getOutputSize()
    {
      return (focusedBuffer.size()+focusedStack.size())*myModule->getOutputSize(maxNbElements);
    }
    
    std::size_t FocusedColumnModuleImpl::getInputSize()
    {
      return (focusedBuffer.size()+focusedStack.size()) * maxNbElements;
    }
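
    // Worked example (hypothetical values): with Buffer{-2 -1 0}, Stack{0 1}
    // and NbElem{5}, getInputSize() is (3+2)*5 = 25 context cells, and
    // getOutputSize() is (3+2)*myModule->getOutputSize(5).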
    
    // Writes, for each focused buffer/stack element, maxNbElements dict
    // indexes of the configured column into the shared context tensor.
    void FocusedColumnModuleImpl::addToContext(torch::Tensor & context, const Config & config)
    {
      auto & dict = getDict();
      std::vector<long> focusedIndexes;
    
      for (int index : focusedBuffer)
        focusedIndexes.emplace_back(config.getRelativeWordIndex(index));
    
      for (int index : focusedStack)
        if (config.hasStack(index))
          focusedIndexes.emplace_back(config.getStack(index));
        else
          focusedIndexes.emplace_back(-2); // absent stack slot, mapped to nullValueStr below
    
      int insertIndex = 0;
      for (auto index : focusedIndexes)
      {
        if (index == -1 or index == -2) // -1: out-of-bounds word, -2: absent stack slot
        {
          for (int i = 0; i < maxNbElements; i++)
          {
            context[firstInputIndex+insertIndex] = dict.getIndexOrInsert(index == -1 ? Dict::oobValueStr : Dict::nullValueStr, column);
            insertIndex++;
          }
          continue;
        }
    
        std::vector<std::string> elements;
        if (column == "FORM")
        {
          auto asUtf8 = util::splitAsUtf8(func(std::string(config.getAsFeature(column, index))));
          for (int i = 0; i < maxNbElements; i++)
            if (i < (int)asUtf8.size())
              elements.emplace_back(fmt::format("{}", asUtf8[i]));
            else
              elements.emplace_back("<padding>");
        }
        else if (column == "FEATS")
        {
          auto splited = util::split(func(std::string(config.getAsFeature(column, index))), '|');
    
          for (int i = 0; i < maxNbElements; i++)
            if (i < (int)splited.size())
              elements.emplace_back(splited[i]);
            else
              elements.emplace_back("<padding>");
        }
        else if (column == Config::idColName)
        {
          if (config.getAsFeature(Config::idColName, index).empty())
            elements.emplace_back("empty");
          else if (config.isMultiwordPredicted(index))
            elements.emplace_back("multiword");
          else if (config.getAsFeature(Config::isMultiColName, index) == Config::EOSSymbol1)
            elements.emplace_back("part");
          else if (config.isTokenPredicted(index))
            elements.emplace_back("token");
        }
        else if (column == "EOS")
        {
          bool isEOS = func(config.getAsFeature(Config::EOSColName, index)) == Config::EOSSymbol1;
          elements.emplace_back(fmt::format("{}", isEOS));
        }
        else
        {
          elements.emplace_back(func(config.getAsFeature(column, index)));
        }
    
        if ((int)elements.size() != maxNbElements)
          util::myThrow(fmt::format("elements.size ({}) != maxNbElements ({})", elements.size(), maxNbElements));
    
        for (auto & element : elements)
        {
          context[firstInputIndex+insertIndex] = dict.getIndexOrInsert(element, column);
          insertIndex++;
        }
      }
    }
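
    // Illustrative walk-through (hypothetical values): with column == "FORM"
    // and maxNbElements == 5, the focused word "cat" yields the elements
    // {"c","a","t","<padding>","<padding>"}, each converted to a dict index
    // and written at consecutive positions starting at firstInputIndex.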
    
    void FocusedColumnModuleImpl::registerEmbeddings()
    {
      if (!wordEmbeddings)
        wordEmbeddings = register_module("embeddings", WordEmbeddings(getDict().size(), inSize, w2vFiles.empty() ? std::set<std::size_t>() : getDict().getSpecialIndexes()));
      // Mirror the constructor's guard: splitting an empty w2vFiles path would
      // otherwise risk indexing splited[1] out of bounds.
      if (w2vFiles.empty())
        return;
      auto pathes = util::split(w2vFiles.string(), ' ');
      for (auto & p : pathes)
      {
        auto splited = util::split(p, ',');
        loadPretrainedW2vEmbeddings(wordEmbeddings->getNormalEmbeddings(), path / splited[1], splited[0]);
      }
    }
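
    // Minimal usage sketch (hypothetical definition and paths; assumes the
    // usual TORCH_MODULE(FocusedColumnModule) wrapper is declared in
    // FocusedColumnModule.hpp):
    //   FocusedColumnModule module("focused",
    //     "Column{lower:FORM} NbElem{5} Buffer{-2 -1 0} Stack{0 1} "
    //     "LSTM{1 1 0.3 1} In{64} Out{128} w2v{}", "path/to/model");
    //   module->registerEmbeddings();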