#include "BaseConfig.hpp"

#include <cstdio>
#include <exception>
#include <map>
#include <memory>
#include <regex>
#include <string>
#include <string_view>

#include "util.hpp"

namespace
{

/// Deleter that lets std::unique_ptr own a C stdio stream.
struct CFileCloser
{
  void operator()(std::FILE * file) const noexcept
  {
    std::fclose(file);
  }
};

/// Owning handle on a C stdio stream; the stream is closed when the handle
/// goes out of scope, including when an exception propagates.
using CFilePtr = std::unique_ptr<std::FILE, CFileCloser>;

} // namespace

/// @brief Rebuild the column tables from a column description.
///
/// Stores `mcd`, clears both lookup tables, registers every comma-separated
/// name of `mcd` in order, then appends every entry of `extraColumns`.
///
/// @param mcd Comma-separated list of column names.
/// @throws (through util::myThrow) if `mcd` already names one of the extra columns.
void BaseConfig::createColumns(std::string mcd)
{
  this->mcd = mcd;

  colIndex2Name.clear();
  colName2Index.clear();

  auto splited = util::split(mcd, ',');
  for (auto & colName : splited)
  {
    colIndex2Name.emplace_back(colName);
    colName2Index.emplace(colName, colIndex2Name.size()-1);
  }

  for (auto & column : extraColumns)
  {
    if (colName2Index.count(column))
      util::myThrow(fmt::format("mcd '{}' must not contain column '{}'", mcd, column));

    colIndex2Name.emplace_back(column);
    colName2Index.emplace(column, colIndex2Name.size()-1);
  }
}

/// @brief Load a raw text file into `rawInput`.
///
/// The whole file is read, split with util::splitAsUtf8, and every newline
/// and tabulation is replaced by a space.
///
/// @param rawFilename Path of the file to read.
/// @throws (through util::myThrow) if the file cannot be opened.
void BaseConfig::readRawInput(std::string_view rawFilename)
{
  // A string_view is not guaranteed to be NUL-terminated, std::fopen needs a C string.
  const std::string path(rawFilename);
  CFilePtr file(std::fopen(path.c_str(), "r"));

  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", rawFilename));

  std::string rawInputTemp;

  // Test the value returned by fgetc rather than feof() beforehand, otherwise
  // the EOF sentinel itself would be appended as a spurious trailing byte.
  for (int c = std::fgetc(file.get()); c != EOF; c = std::fgetc(file.get()))
    rawInputTemp.push_back(static_cast<char>(c));

  file.reset();

  rawInput = util::splitAsUtf8(rawInputTemp);
  rawInput.replace(util::utf8char("\n"), util::utf8char(" "));
  rawInput.replace(util::utf8char("\t"), util::utf8char(" "));
}

/// @brief Load a tab-separated file, one configuration line per input line.
///
/// Line kinds, in the order they are tested:
///  - shorter than 3 characters: ignored until a data line has been read;
///    afterwards it closes the current sequence (see below);
///  - starting with '#': either a `global.columns = ...` declaration, which
///    re-creates the columns, or a comment stored as its own line;
///  - anything else: a data line split on tabulations.
///
/// @param tsvFilename Path of the file to read.
/// @throws (through util::myThrow) if the file cannot be opened, or if a data
///         line does not have the same number of columns as the first one.
void BaseConfig::readTSVInput(std::string_view tsvFilename)
{
  // A string_view is not guaranteed to be NUL-terminated, std::fopen needs a C string.
  const std::string path(tsvFilename);
  // RAII handle : the stream is closed even when util::myThrow fires below.
  CFilePtr file(std::fopen(path.c_str(), "r"));

  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));

  // Built once instead of once per comment line.
  static const std::regex globalColumnsRegex("(?:(?:\\s|\\t)*)#(?:(?:\\s|\\t)*)global.columns(?:(?:\\s|\\t)*)=(?:(?:\\s|\\t)*)(.+)");

  constexpr int lineBufferSize = 100000;
  char lineBuffer[lineBufferSize];
  int inputLineIndex = 0;
  bool inputHasBeenRead = false;
  int usualNbCol = -1;
  int nbMultiwords = 0;

  while (!std::feof(file.get()))
  {
    if (lineBuffer != std::fgets(lineBuffer, lineBufferSize, file.get()))
      break;

    std::string_view line(lineBuffer);
    inputLineIndex++;

    if (line.size() < 3)
    {
      if (!inputHasBeenRead)
        continue;

      // Flag the last stored line as the end of its sequence.
      get(EOSColName, getNbLines()-1, 0) = EOSSymbol1;

      try
      {
        std::map<std::string, int> id2index;
        int firstIndexOfSequence = getNbLines()-1;

        // Walk backwards over the sequence that just ended, stopping at the
        // previous line flagged EOSSymbol1, and record id -> line index.
        for (int i = (int)getNbLines()-1; has(0, i, 0); --i)
        {
          if (!isToken(i))
            continue;

          if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1)
            break;

          firstIndexOfSequence = i;
          id2index[getConst(idColName, i, 0)] = i;
        }

        // Replace every head other than "0" by the line index of the id it refers to.
        if (hasColIndex(headColName))
          for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i)
          {
            if (!isToken(i))
              continue;

            auto & head = get(headColName, i, 0);
            if (head == "0")
              continue;

            // NOTE(review): operator[] yields 0 for an id that was not seen in
            // this sequence, so a dangling head is silently mapped to line 0.
            head = std::to_string(id2index[head]);
          }
      }
      catch (const std::exception & e)
      {
        util::myThrow(e.what());
      }

      continue;
    }

    if (line.back() == '\n')
      line.remove_suffix(1);

    if (line[0] == '#')
    {
      if (util::doIfNameMatch(globalColumnsRegex, line, [this](const auto & sm)
        {
          createColumns(util::join(",", util::split(util::strip(sm.str(1)), ' ')));
        }))
        continue;

      // Any other comment is stored as a line of its own, in column 0.
      addLines(1);
      get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;
      get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;
      get(0, getNbLines()-1, 0) = std::string(line);
      getLastNotEmptyHyp(0, getNbLines()-1) = std::string(line);

      continue;
    }

    inputHasBeenRead = true;

    auto splited = util::split(line, '\t');

    // The first data line sets the number of columns expected from all the others.
    if (usualNbCol == -1)
      usualNbCol = splited.size();

    if ((int)splited.size() != usualNbCol)
      util::myThrow(fmt::format("in file {} line {} ('{}') is invalid, it should have {} columns", tsvFilename, inputLineIndex, line, usualNbCol));

    // Ignore empty nodes
    if (hasColIndex(idColName) && splited[getColIndex(idColName)].find('.') != std::string::npos)
      continue;

    addLines(1);
    get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;

    // Lines that follow a multiword line are flagged while the countdown is running.
    if (nbMultiwords > 0)
    {
      get(isMultiColName, getNbLines()-1, 0) = EOSSymbol1;
      nbMultiwords--;
    }
    else
      get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;

    // Only the columns that are not extra columns are filled from the file.
    for (unsigned int i = 0; i < splited.size(); i++)
      if (i < colIndex2Name.size() - extraColumns.size())
      {
        std::string value = std::string(splited[i]);
        get(i, getNbLines()-1, 0) = value;
      }

    if (isMultiword(getNbLines()-1))
      nbMultiwords = getMultiwordSize(getNbLines()-1)+1;
  }
}

/// @brief Copy the Config base and both column lookup tables of `other`.
BaseConfig::BaseConfig(const BaseConfig & other) : Config(other), colIndex2Name(other.colIndex2Name), colName2Index(other.colName2Index)
{
}

/// @brief Build a configuration from a column description and input files.
///
/// The raw file, if given, is read before the TSV file. When no line exists
/// at `wordIndex` afterwards, two comments and one line are added. If
/// `wordIndex` is on a comment, it is moved forward by one.
///
/// @param mcd         Comma-separated list of column names.
/// @param tsvFilename Path of the TSV input, may be empty.
/// @param rawFilename Path of the raw text input, may be empty.
/// @throws (through util::myThrow) if both filenames are empty.
BaseConfig::BaseConfig(std::string mcd, std::string_view tsvFilename, std::string_view rawFilename)
{
  if (tsvFilename.empty() and rawFilename.empty())
    util::myThrow("tsvFilename and rawFilenames can't be both empty");

  createColumns(mcd);

  if (not rawFilename.empty())
    readRawInput(rawFilename);

  if (not tsvFilename.empty())
    readTSVInput(tsvFilename);

  if (!has(0,wordIndex,0))
  {
    addComment();
    addComment();
    addLines(1);
  }

  if (isComment(wordIndex))
    moveWordIndex(1);
}

/// @return The number of registered columns, extra columns included.
std::size_t BaseConfig::getNbColumns() const
{
  return colIndex2Name.size();
}

/// @return The index of the column named `colName`.
/// @throws (through util::myThrow) if no column has that name.
std::size_t BaseConfig::getColIndex(const std::string & colName) const
{
  auto it = colName2Index.find(colName);
  if (it == colName2Index.end())
    util::myThrow(fmt::format("unknown column name '{}', mcd = '{}'", colName, mcd));

  return it->second;
}

/// @return true if a column named `colName` is registered.
bool BaseConfig::hasColIndex(const std::string & colName) const
{
  return colName2Index.count(colName);
}

/// @return The name of the column at `colIndex`; the index is not bounds-checked.
const std::string & BaseConfig::getColName(int colIndex) const
{
  return colIndex2Name[colIndex];
}

/// @return The index of the first line, always 0.
std::size_t BaseConfig::getFirstLineIndex() const
{
  return 0;
}