ContextModule.cpp 6.25 KB
Newer Older
1
2
#include "ContextModule.hpp"

3
/// Builds a context module from its textual definition.
///
/// The definition must contain, in this order (optional whitespace between parts):
///   Targets{...} Columns{...} <SubModuleType>{...} In{...} Out{...} w2v{...}
///
/// - Targets: space-separated entries of the form 'object.index(.childIndex)'.
/// - Columns: space-separated entries; the text after the last ':' of an entry is
///   stored as the column name and the whole entry is passed to getFunction().
/// - SubModuleType: one of LSTM, GRU, Concat, Transformer. Its four space-separated
///   arguments are read as: bidirectional, num_layers, dropout, complete.
/// - In / Out: integer sizes.
/// - w2v: possibly empty, space-separated list of 'prefix,pretrained.w2v' entries;
///   file names are resolved relative to `path`.
///
/// @param name        Name given to this module through setName().
/// @param definition  Textual definition, parsed as described above.
/// @param path        Directory used as the base for the pretrained w2v files.
///
/// Any malformed definition is reported through util::myThrow(); errors raised
/// while interpreting the fields are re-raised with the definition appended.
ContextModuleImpl::ContextModuleImpl(std::string name, const std::string & definition, std::filesystem::path path) : path(path)
{
  setName(name);

  std::regex regex("(?:(?:\\s|\\t)*)Targets\\{(.*)\\}(?:(?:\\s|\\t)*)Columns\\{(.*)\\}(?:(?:\\s|\\t)*)(\\S+)\\{(.*)\\}(?:(?:\\s|\\t)*)In\\{(.*)\\}(?:(?:\\s|\\t)*)Out\\{(.*)\\}(?:(?:\\s|\\t)*)w2v\\{(.*)\\}(?:(?:\\s|\\t)*)");

  if (!util::doIfNameMatch(regex, definition, [this,&definition](auto sm)
        {
          try
          {
            // Group 1: targets, each 'object.index' or 'object.index.childIndex'.
            for (auto & target : util::split(sm.str(1), ' '))
            {
              auto splited = util::split(target, '.');
              if (splited.size() != 2 and splited.size() != 3)
                util::myThrow(fmt::format("invalid target '{}' expected 'object.index(.childIndex)'", target));

              std::optional<int> childIndex;
              if (splited.size() == 3)
                childIndex = std::stoi(splited[2]);

              targets.emplace_back(Config::str2object(splited[0]), std::stoi(splited[1]), childIndex);
            }

            // Group 2: columns; functions[i] is the function attached to columns[i].
            columns.clear();
            for (auto & funcCol : util::split(sm.str(2), ' '))
            {
              functions.emplace_back(getFunction(funcCol));
              columns.emplace_back(util::split(funcCol, ':').back());
            }

            // Groups 3 and 4: sub-module type and its arguments.
            auto subModuleType = sm.str(3);
            auto subModuleArguments = util::split(sm.str(4), ' ');

            // The four arguments are indexed unconditionally below: reject short
            // lists explicitly instead of reading past the end of the vector.
            if (subModuleArguments.size() < 4)
              util::myThrow(fmt::format("submodule '{}' expects 4 arguments 'bidirectional num_layers dropout complete', got {}", subModuleType, subModuleArguments.size()));

            auto options = MyModule::ModuleOptions(true)
              .bidirectional(std::stoi(subModuleArguments[0]))
              .num_layers(std::stoi(subModuleArguments[1]))
              .dropout(std::stof(subModuleArguments[2]))
              .complete(std::stoi(subModuleArguments[3]));

            // Groups 5 and 6: embedding size and sub-module output size.
            inSize = std::stoi(sm.str(5));
            int outSize = std::stoi(sm.str(6));

            if (subModuleType == "LSTM")
              myModule = register_module("myModule", LSTM(columns.size()*inSize, outSize, options));
            else if (subModuleType == "GRU")
              myModule = register_module("myModule", GRU(columns.size()*inSize, outSize, options));
            else if (subModuleType == "Concat")
              myModule = register_module("myModule", Concat(inSize));
            else if (subModuleType == "Transformer")
              myModule = register_module("myModule", Transformer(columns.size()*inSize, outSize, options));
            else
              util::myThrow(fmt::format("unknown sumodule type '{}'", subModuleType));

            // Group 7: optional pretrained embeddings.
            w2vFiles = sm.str(7);

            if (!w2vFiles.empty())
            {
              auto pathes = util::split(w2vFiles.string(), ' ');
              for (auto & p : pathes)
              {
                auto splited = util::split(p, ',');
                if (splited.size() != 2)
                  util::myThrow("expected 'prefix,pretrained.w2v'");
                getDict().loadWord2Vec(this->path / splited[1], splited[0]);
                getDict().setState(Dict::State::Closed);
                dictSetPretrained(true);
              }
            }
          } catch (const std::exception & e) {util::myThrow(fmt::format("{} in '{}'",e.what(),definition));}
        }))
    util::myThrow(fmt::format("invalid definition '{}'", definition));
}

std::size_t ContextModuleImpl::getOutputSize()
{
74
  return myModule->getOutputSize(targets.size());
75
76
77
78
}

std::size_t ContextModuleImpl::getInputSize()
{
79
  return columns.size()*(targets.size());
80
81
}

Franck Dary's avatar
Franck Dary committed
82
void ContextModuleImpl::addToContext(std::vector<std::vector<long>> & context, const Config & config)
83
{
Franck Dary's avatar
Franck Dary committed
84
  auto & dict = getDict();
85
86
  std::vector<long> contextIndexes;

87
88
89
90
91
92
93
94
95
96
  for (auto & target : targets)
    if (config.hasRelativeWordIndex(std::get<0>(target), std::get<1>(target)))
    {
      int baseIndex = config.getRelativeWordIndex(std::get<0>(target), std::get<1>(target));
      if (!std::get<2>(target))
        contextIndexes.emplace_back(baseIndex);
      else
      {
        int childIndex = *std::get<2>(target);
        auto childs = util::split(config.getAsFeature(Config::childsColName, baseIndex).get(), '|');
97
98
        int candidate = -1;

99
        if (childIndex >= 0 and childIndex < (int)childs.size())
100
101
102
103
104
        {
          candidate = std::stoi(childs[childIndex]);
          if (candidate > baseIndex)
            candidate = -1;
        }
105
        else if (childIndex < 0 and ((int)childs.size())+childIndex >= 0)
106
107
108
109
110
111
112
        {
          candidate = std::stoi(childs[childs.size()+childIndex]);
          if (candidate < baseIndex)
            candidate = -1;
        }

        contextIndexes.emplace_back(candidate);
113
114
      }
    }
115
116
117
118
    else
      contextIndexes.emplace_back(-1);

  for (auto index : contextIndexes)
119
120
121
    for (unsigned int colIndex = 0; colIndex < columns.size(); colIndex++)
    {
      auto & col = columns[colIndex];
122
123
124
      if (index == -1)
      {
        for (auto & contextElement : context)
125
          contextElement.push_back(dict.getIndexOrInsert(Dict::nullValueStr, col));
126
127
128
      }
      else
      {
129
130
131
132
133
        int dictIndex;
        if (col == Config::idColName)
        {
          std::string value;
          if (config.isCommentPredicted(index))
134
            value = "comment";
135
          else if (config.isMultiwordPredicted(index))
136
            value = "multiword";
137
          else if (config.isTokenPredicted(index))
138
139
            value = "token";
          dictIndex = dict.getIndexOrInsert(value, col);
140
141
        }
        else
142
143
        {
          std::string featureValue = functions[colIndex](config.getAsFeature(col, index));
144
          dictIndex = dict.getIndexOrInsert(featureValue, col);
145
        }
146
147
148
149

        for (auto & contextElement : context)
          contextElement.push_back(dictIndex);
      }
150
    }
151
152
153
154
155
156
157
158
}

/// Embeds this module's slice of `input`, merges the embeddings of all columns
/// of a same target into a single vector, runs the sub-module on the result and
/// flattens its output to one row per element of the first dimension of `input`.
torch::Tensor ContextModuleImpl::forward(torch::Tensor input)
{
  const int nbColumns = (int)columns.size();

  // Slice of dimension 1 owned by this module: getInputSize() entries from firstInputIndex.
  auto slice = input.narrow(1, firstInputIndex, getInputSize());
  auto context = wordEmbeddings(slice);

  // Group the nbColumns consecutive embeddings into one concatenated vector.
  context = context.view({context.size(0), context.size(1)/nbColumns, nbColumns*context.size(2)});

  auto output = myModule->forward(context);

  return output.reshape({input.size(0), -1});
}

162
void ContextModuleImpl::registerEmbeddings()
163
{
164
  wordEmbeddings = register_module("embeddings", WordEmbeddings(getDict().size(), inSize));
165
166
  auto pathes = util::split(w2vFiles.string(), ' ');
  for (auto & p : pathes)
167
168
  {
    auto splited = util::split(p, ',');
169
    loadPretrainedW2vEmbeddings(wordEmbeddings->get(), path / splited[1], splited[0]);
170
  }
171
172
}