Skip to content
Snippets Groups Projects
BaseConfig.cpp 5.48 KiB
Newer Older
  • Learn to ignore specific revisions
  • #include "BaseConfig.hpp"
    #include "util.hpp"
    
    /// Read a column description (mcd) file and register its column names.
    ///
    /// Each line of the file names one column; columns are indexed in file
    /// order. After the file has been read, every name in extraColumns is
    /// appended as an additional column.
    ///
    /// @param mcdFilename Path of the mcd file to read.
    /// Reports an error through util::myThrow if a mcd was already read for
    /// this object, if the file cannot be opened, or if the file declares a
    /// column whose name is also in extraColumns.
    void BaseConfig::readMCD(std::string_view mcdFilename)
    {
      if (!colIndex2Name.empty())
        util::myThrow("a mcd has already been read for this BaseConfig");

      // string_view::data() is not guaranteed to be null-terminated, while
      // fopen requires a C string: go through an owning std::string.
      const std::string path(mcdFilename);
      std::FILE * file = std::fopen(path.c_str(), "r");

      if (not file)
        util::myThrow(fmt::format("Cannot open file '{}'", mcdFilename));

      // One column name per line, at most 1023 characters per read. The
      // trailing "\n" directive of the format consumes the whitespace that
      // follows the name.
      char lineBuffer[1024];
      while (std::fscanf(file, "%1023[^\n]\n", lineBuffer) == 1)
      {
        colIndex2Name.emplace_back(lineBuffer);
        colName2Index.emplace(lineBuffer, colIndex2Name.size()-1);
      }

      std::fclose(file);

      // Extra columns come after the ones of the file and must not clash with them.
      for (auto & column : extraColumns)
      {
        if (colName2Index.count(column))
          util::myThrow(fmt::format("mcd '{}' must not contain column '{}'", mcdFilename, column));
        colIndex2Name.emplace_back(column);
        colName2Index.emplace(column, colIndex2Name.size()-1);
      }
    }
    
    /// Read a raw text file into rawInputUtf8.
    ///
    /// The whole file content is split with util::splitAsUtf8, then every
    /// newline and tab is replaced by a space.
    ///
    /// @param rawFilename Path of the raw text file to read.
    /// Reports an error through util::myThrow if the file cannot be opened.
    void BaseConfig::readRawInput(std::string_view rawFilename)
    {
      // string_view::data() is not guaranteed to be null-terminated, while
      // fopen requires a C string: go through an owning std::string.
      const std::string path(rawFilename);
      std::FILE * file = std::fopen(path.c_str(), "r");

      if (not file)
        util::myThrow(fmt::format("Cannot open file '{}'", rawFilename));

      std::string rawInputTemp;

      // Keep the result of fgetc as an int so that EOF is detected before the
      // value is narrowed to char, and is never stored in the buffer.
      for (int c = std::fgetc(file); c != EOF; c = std::fgetc(file))
        rawInputTemp.push_back(static_cast<char>(c));

      std::fclose(file);

      rawInputUtf8 = util::splitAsUtf8(rawInputTemp);

      rawInputUtf8.replace(util::utf8char("\n"), util::utf8char(" "));
      rawInputUtf8.replace(util::utf8char("\t"), util::utf8char(" "));
    }
    
    /// Read a tab-separated input file, adding one line to this object per
    /// comment line and per regular input line.
    ///
    /// - A line shorter than 3 characters (newline included) ends the current
    ///   sequence: the last added line gets EOSSymbol1 in column EOSColName and,
    ///   when the head column exists, the heads of the sequence are rewritten
    ///   from ids to line indexes.
    /// - A line starting with '#' is stored whole in column 0.
    /// - Any other line is split on tabs; all such lines must have the same
    ///   number of fields. Lines whose id contains a '.' are skipped.
    ///
    /// @param tsvFilename Path of the tab-separated file to read.
    /// Reports an error through util::myThrow if the file cannot be opened or
    /// if a line does not have the expected number of fields.
    void BaseConfig::readTSVInput(std::string_view tsvFilename)
    {
      // string_view::data() is not guaranteed to be null-terminated, while
      // fopen requires a C string: go through an owning std::string.
      const std::string path(tsvFilename);
      std::FILE * file = std::fopen(path.c_str(), "r");

      if (not file)
        util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));

      // Closes the file on every exit path, including errors raised while parsing.
      struct FileCloser
      {
        std::FILE * handle;
        ~FileCloser() { std::fclose(handle); }
      } fileCloser{file};

      char lineBuffer[100000];
      int inputLineIndex = 0;
      bool inputHasBeenRead = false;
      int usualNbCol = -1;

      // Number of upcoming lines that belong to the multiword read last.
      int nbMultiwords = 0;

      while (!std::feof(file))
      {
        if (lineBuffer != std::fgets(lineBuffer, 100000, file))
          break;

        std::string_view line(lineBuffer);
        inputLineIndex++;

        // Short line: end of the current sequence.
        if (line.size() < 3)
        {
          // Nothing to close if no regular line has been read so far.
          if (!inputHasBeenRead)
            continue;

          get(EOSColName, getNbLines()-1, 0) = EOSSymbol1;

          try
          {
            // Walk backwards from the last line to collect, for each token of
            // the sequence that just ended, the line index of its id.
            std::map<std::string, int> id2index;
            int firstIndexOfSequence = getNbLines()-1;
            for (int i = (int)getNbLines()-1; has(0, i, 0); --i)
            {
              if (!isToken(i))
                continue;

              // A previous line already marked with EOSSymbol1 belongs to the
              // preceding sequence: stop there.
              if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1)
                break;

              firstIndexOfSequence = i;
              id2index[getConst(idColName, i, 0)] = i;
            }

            // Replace each head id by the line index of the token it refers to.
            // "0" is left untouched; an id that was not collected above maps
            // to 0 because std::map::operator[] value-initializes it.
            if (hasColIndex(headColName))
              for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i)
              {
                if (!isToken(i))
                  continue;
                auto & head = get(headColName, i, 0);
                if (head == "0")
                  continue;
                head = std::to_string(id2index[head]);
              }

          } catch(const std::exception & e) {util::myThrow(e.what());}

          continue;
        }

        if (line.back() == '\n')
          line.remove_suffix(1);

        // Comment line: stored whole in column 0.
        if (line[0] == '#')
        {
          addLines(1);
          get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;
          get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;
          get(0, getNbLines()-1, 0) = std::string(line);
          continue;
        }

        inputHasBeenRead = true;

        // The first regular line fixes the number of fields expected on all others.
        auto splited = util::split(line, '\t');
        if (usualNbCol == -1)
          usualNbCol = splited.size();
        if ((int)splited.size() != usualNbCol)
          util::myThrow(fmt::format("in file {} line {} is invalid, it shoud have {} columns", tsvFilename, line, usualNbCol));

        // Ignore empty nodes
        if (hasColIndex(idColName) && splited[getColIndex(idColName)].find('.') != std::string::npos)
          continue;

        addLines(1);
        get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;

        // Flag the lines that follow a multiword as being part of it.
        if (nbMultiwords > 0)
        {
          get(isMultiColName, getNbLines()-1, 0) = EOSSymbol1;
          nbMultiwords--;
        }
        else
          get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;

        // Fields beyond the known columns are dropped.
        for (unsigned int i = 0; i < splited.size(); i++)
          if (i < colIndex2Name.size())
          {
            std::string value = std::string(splited[i]);
            get(i, getNbLines()-1, 0) = value;
          }

        if (isMultiword(getNbLines()-1))
          nbMultiwords = getMultiwordSize(getNbLines()-1)+1;
      }
    }

    /// Copy constructor.
    ///
    /// Copies the column tables and the raw input of `other`. The Config base
    /// is built from `other` together with this object's own rawInputUtf8
    /// member, not the one of `other`.
    BaseConfig::BaseConfig(const BaseConfig & other)
      : Config(rawInputUtf8, other),
        colIndex2Name(other.colIndex2Name),
        colName2Index(other.colName2Index),
        rawInputUtf8(other.rawInputUtf8)
    {
    }
    
    
    /// Build a configuration from a mcd file plus a tsv file, a raw text file, or both.
    ///
    /// The mcd is read first, then the raw input (if given), then the tsv
    /// input (if given). A line is added when nothing exists at wordIndex, and
    /// the word index is moved forward by one when it sits on a comment.
    ///
    /// @param mcdFilename Path of the column description file; must not be empty.
    /// @param tsvFilename Path of the tab-separated input; may be empty if rawFilename is not.
    /// @param rawFilename Path of the raw text input; may be empty if tsvFilename is not.
    BaseConfig::BaseConfig(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename)
      : Config(rawInputUtf8)
    {
      const bool noTsv = tsvFilename.empty();
      const bool noRaw = rawFilename.empty();

      if (noTsv && noRaw)
      {
        util::myThrow("tsvFilename and rawFilenames can't be both empty");
      }
      if (mcdFilename.empty())
      {
        util::myThrow("mcdFilename can't be empty");
      }

      readMCD(mcdFilename);

      if (!noRaw)
      {
        readRawInput(rawFilename);
      }

      if (!noTsv)
      {
        readTSVInput(tsvFilename);
      }

      const bool currentWordExists = has(0, wordIndex, 0);
      if (!currentWordExists)
      {
        addLines(1);
      }

      if (isComment(wordIndex))
      {
        moveWordIndex(1);
      }
    }
    
    /// @return The number of registered columns, i.e. the size of colIndex2Name.
    std::size_t BaseConfig::getNbColumns() const
    {
      const std::size_t nbColumns = colIndex2Name.size();
      return nbColumns;
    }
    
    /// Look up the index of a column from its name.
    ///
    /// @param colName Name of the column.
    /// @return The index stored for colName in colName2Index.
    /// Reports an error through util::myThrow if the name is unknown.
    std::size_t BaseConfig::getColIndex(const std::string & colName) const
    {
      auto it = colName2Index.find(colName);
      // NOTE(review): the dereference below relies on util::myThrow not returning — confirm.
      if (it == colName2Index.end())
        util::myThrow(fmt::format("unknown column name '{}'", colName));
      return it->second;
    }

    /// @param colName Name of the column to look for.
    /// @return true if colName is a key of colName2Index, false otherwise.
    bool BaseConfig::hasColIndex(const std::string & colName) const
    {
      const auto found = colName2Index.find(colName);
      return found != colName2Index.end();
    }
    
    
    const std::string & BaseConfig::getColName(int colIndex) const
    {
      return colIndex2Name[colIndex];
    }
    
    /// @return The index of the first line, which is always 0 for this class.
    std::size_t BaseConfig::getFirstLineIndex() const
    {
      constexpr std::size_t firstLineIndex = 0;
      return firstLineIndex;
    }