Skip to content
Snippets Groups Projects
BaseConfig.cpp 6.35 KiB
#include "BaseConfig.hpp"
#include "util.hpp"

// Rebuild the column tables from a comma separated column description (mcd).
// The columns named in the mcd come first, in order, followed by every entry
// of extraColumns. Reports an error through util::myThrow when the mcd already
// contains one of the extra columns.
void BaseConfig::createColumns(std::string mcd)
{
  this->mcd = mcd;

  colIndex2Name.clear();
  colName2Index.clear();

  // Columns explicitly listed in the mcd.
  const auto mcdColumns = util::split(mcd, ',');
  for (const auto & name : mcdColumns)
  {
    const auto index = colIndex2Name.size();
    colIndex2Name.emplace_back(name);
    colName2Index.emplace(name, index);
  }

  // Extra columns are appended after the mcd ones and must not collide with them.
  for (const auto & extra : extraColumns)
  {
    const bool alreadyDeclared = colName2Index.find(extra) != colName2Index.end();
    if (alreadyDeclared)
      util::myThrow(fmt::format("mcd '{}' must not contain column '{}'", mcd, extra));

    const auto index = colIndex2Name.size();
    colIndex2Name.emplace_back(extra);
    colName2Index.emplace(extra, index);
  }
}

// Load the whole content of a raw text file into rawInput.
// The bytes of the file are split into utf8 characters with util::splitAsUtf8,
// then every newline and tabulation is replaced by a single space.
// rawFilename : path of the file to read.
// Reports an error through util::myThrow when the file cannot be opened.
void BaseConfig::readRawInput(std::string_view rawFilename)
{
  // A string_view is not guaranteed to be null-terminated, while std::fopen
  // expects a C string : go through an owning std::string.
  const std::string filename(rawFilename);
  std::FILE * file = std::fopen(filename.c_str(), "r");

  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", rawFilename));

  std::string rawInputTemp;

  // Test the value returned by fgetc rather than feof : feof only becomes true
  // after a read has failed, so looping on it would append the EOF sentinel
  // as a spurious trailing byte.
  for (int c = std::fgetc(file); c != EOF; c = std::fgetc(file))
    rawInputTemp.push_back(static_cast<char>(c));

  std::fclose(file);

  rawInput = util::splitAsUtf8(rawInputTemp);
  rawInput.replace(util::utf8char("\n"), util::utf8char(" "));
  rawInput.replace(util::utf8char("\t"), util::utf8char(" "));
}

// Fill the configuration from a tab separated file, adding one line to the
// configuration for each data line that is kept.
// tsvFilename       : path of the file to read.
// _sentencesIndexes : indexes (starting at 0) of the sentences to keep, an
//                     empty vector meaning that every sentence is kept.
// A sentence ends at a line shorter than 3 characters (newline included).
// Lines starting with '#' are comments : they are either a column declaration
// ("# global.columns = ...") or stored with the next data line that is kept.
// Reports errors through util::myThrow (file that cannot be opened, data line
// whose number of fields differs from the first data line).
// NOTE(review): the file is opened and scanned once per requested sentence index.
// NOTE(review): the file does not seem to be closed when util::myThrow is
// called inside the reading loop (presumably it throws) -- confirm.
// NOTE(review): the end-of-sentence processing only happens on a short line, so
// a last sentence that is not followed by one looks like it is left without an
// EOS mark and with unconverted heads -- confirm.
void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<int> & _sentencesIndexes)
{
  // Local copy so that a sentinel can be added : -1 means "keep every sentence".
  auto sentencesIndexes = _sentencesIndexes;
  if (sentencesIndexes.empty())
    sentencesIndexes.emplace_back(-1);

  for (int targetSentenceIndex : sentencesIndexes)
  {
    // NOTE(review): string_view::data() is not guaranteed to be null-terminated.
    std::FILE * file = std::fopen(tsvFilename.data(), "r");
  
    if (not file)
      util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));
  
    // fgets reads at most 99999 characters at a time : a longer input line
    // would be seen as several lines.
    char lineBuffer[100000];
    // Number of lines read so far in the file.
    int inputLineIndex = 0;
    // Becomes true once a first data line has been met.
    bool inputHasBeenRead = false;
    // Number of fields of the first data line, -1 until it is known.
    int usualNbCol = -1;
    // Number of upcoming kept lines that still have to be flagged in isMultiColName.
    int nbMultiwords = 0;
    // Index of the sentence currently being read.
    int curSentenceIndex = 0;
    // Comment lines waiting to be attached to the next kept data line.
    std::vector<std::string> pendingComments;
  
    while (!std::feof(file))
    {
      // fgets returns its buffer on success, anything else means end of file or error.
      if (lineBuffer != std::fgets(lineBuffer, 100000, file))
        break;
  
      std::string_view line(lineBuffer);
      inputLineIndex++;
  
      // Short line (the trailing newline is still counted here) : sentence separator.
      if (line.size() < 3)
      {
        // Separators met before any data line are ignored.
        if (!inputHasBeenRead)
          continue;

        if (targetSentenceIndex == -1 or targetSentenceIndex == curSentenceIndex)
        {
          // Mark the last line added as the end of its sentence.
          get(EOSColName, getNbLines()-1, 0) = EOSSymbol1;
  
          try
          {
            // Map each id of the sentence that just ended to its line index,
            // walking backward from the last line until the EOS mark of the
            // previous sentence (or until has(0, i, 0) is false).
            std::map<std::string, int> id2index;
            int firstIndexOfSequence = getNbLines()-1;
            for (int i = (int)getNbLines()-1; has(0, i, 0); --i)
            {
              if (!isToken(i))
                continue;
  
              if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1)
                break;
  
              firstIndexOfSequence = i;
              id2index[getConst(idColName, i, 0)] = i;
            }
            // Rewrite the heads of the sentence : "0" becomes "-1", any other
            // value is replaced by the line index associated with that id.
            // NOTE(review): operator[] inserts 0 for a head that is not a known
            // id, which then silently points to line 0 -- confirm this is wanted.
            if (hasColIndex(headColName))
              for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i)
              {
                if (!isToken(i))
                  continue;
                auto & head = get(headColName, i, 0);
                if (head == "0")
                  head = "-1";
                else
                  head = std::to_string(id2index[head]);
              }
          } catch(std::exception & e) {util::myThrow(e.what());}
        }

        curSentenceIndex += 1;
  
        continue;
      }
  
      // Drop the newline kept by fgets.
      if (line.back() == '\n')
        line.remove_suffix(1);
  
      if (line[0] == '#')
      {
        // A "# global.columns = A B C" comment redefines the columns, the space
        // separated names being turned into a comma separated mcd.
        if (util::doIfNameMatch(std::regex("(?:(?:\\s|\\t)*)#(?:(?:\\s|\\t)*)global.columns(?:(?:\\s|\\t)*)=(?:(?:\\s|\\t)*)(.+)"), line, [this](const auto & sm)
              {
                createColumns(util::join(",", util::split(util::strip(sm.str(1)), ' '))); 
              }))
          continue;
  
        // Any other comment is kept only when it belongs to a wanted sentence.
        if (targetSentenceIndex == -1 or targetSentenceIndex == curSentenceIndex)
          pendingComments.emplace_back(line);
        continue;
      }
  
      inputHasBeenRead = true;
  
      // Every data line must have as many fields as the first one, this is
      // checked even for sentences that are not kept.
      auto splited = util::split(line, '\t');
      if (usualNbCol == -1)
        usualNbCol = splited.size();
      // NOTE(review): the second placeholder receives the content of the line,
      // not inputLineIndex -- confirm which one is intended.
      if ((int)splited.size() != usualNbCol)
        util::myThrow(fmt::format("in file {} line {} is invalid, it shoud have {} columns", tsvFilename, line, usualNbCol));

      // Skip the data lines of the sentences that are not wanted.
      if (targetSentenceIndex != -1 and targetSentenceIndex != curSentenceIndex)
        continue;

      // Ignore empty nodes, recognized by a '.' in their id
      if (hasColIndex(idColName) && splited[getColIndex(idColName)].find('.') != std::string::npos)
        continue;
  
      // New line, not an end of sentence until a separator says otherwise.
      addLines(1);
      get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;
      // Flag the line while the countdown set by a previous multiword is running.
      if (nbMultiwords > 0)
      {
        get(isMultiColName, getNbLines()-1, 0) = EOSSymbol1;
        nbMultiwords--;
      }
      else
        get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;
  
      // Attach the comments met since the previous kept data line.
      get(commentsColName, getNbLines()-1, 0) = util::join("\n", pendingComments);
      pendingComments.clear();
  
      // Copy the fields into the columns, the extra columns (last ones of
      // colIndex2Name) are never filled from the file.
      for (unsigned int i = 0; i < splited.size(); i++)
        if (i < colIndex2Name.size() - extraColumns.size())
        {
          std::string value = std::string(splited[i]);
          get(i, getNbLines()-1, 0) = value;
        }
  
      // A multiword line starts the countdown used above for the next kept lines.
      if (isMultiword(getNbLines()-1))
        nbMultiwords = getMultiwordSize(getNbLines()-1)+1;
    }
  
    std::fclose(file);
  } // End for targetSentenceIndex
}

// Copy constructor : copies the Config base part, then both column tables
// (index -> name and name -> index) of the other configuration.
BaseConfig::BaseConfig(const BaseConfig & other)
  : Config(other),
    colIndex2Name(other.colIndex2Name),
    colName2Index(other.colName2Index)
{
}

// Build a configuration from a tsv file and/or an already decoded raw input.
// mcd              : comma separated description of the columns.
// tsvFilename      : tsv file to read, ignored when empty.
// rawInput         : raw text, copied into the configuration when not empty.
// sentencesIndexes : indexes of the sentences to keep, empty == keep all sentences.
// Reports an error through util::myThrow when both tsvFilename and rawInput are empty.
BaseConfig::BaseConfig(std::string mcd, std::string_view tsvFilename, const util::utf8string & rawInput, const std::vector<int> & sentencesIndexes)
{
  const bool hasTsv = !tsvFilename.empty();
  const bool hasRaw = !rawInput.empty();

  if (!hasTsv && !hasRaw)
    util::myThrow("tsvFilename and rawInput can't be both empty");

  createColumns(mcd);

  if (hasRaw)
    this->rawInput = rawInput;

  if (hasTsv)
    readTSVInput(tsvFilename, sentencesIndexes);

  // One line is added when has(0, wordIndex, 0) is false.
  const bool hasCurrentLine = has(0, wordIndex, 0);
  if (!hasCurrentLine)
    addLines(1);
}

// Total number of columns currently registered in colIndex2Name.
std::size_t BaseConfig::getNbColumns() const
{
  const std::size_t nbColumns = colIndex2Name.size();
  return nbColumns;
}

// Index of the column named colName.
// Reports an error through util::myThrow when no column has this name.
std::size_t BaseConfig::getColIndex(const std::string & colName) const
{
  const auto entry = colName2Index.find(colName);
  const bool isKnown = entry != colName2Index.end();

  if (!isKnown)
    util::myThrow(fmt::format("unknown column name '{}', mcd = '{}'", colName, mcd));

  return entry->second;
}

// Tell whether a column named colName is registered in colName2Index.
bool BaseConfig::hasColIndex(const std::string & colName) const
{
  const auto nbMatches = colName2Index.count(colName);
  return nbMatches != 0;
}

const std::string & BaseConfig::getColName(int colIndex) const
{
  return colIndex2Name[colIndex];
}

// Index of the first line of the configuration, which is always 0 here.
std::size_t BaseConfig::getFirstLineIndex() const
{
  constexpr std::size_t firstLineIndex = 0;
  return firstLineIndex;
}