#include "BaseConfig.hpp"
#include "util.hpp"

namespace
{

/// @brief Owns a C stream opened for reading and closes it when leaving scope.
///
/// The parsing functions below report errors through util::myThrow while a
/// file is open; assuming it raises an exception, owning the handle here
/// guarantees the stream is released on every exit path.
class ScopedInputFile
{
  public :

  /// @brief Open `filename` in text read mode.
  ///
  /// The name is copied into a std::string first because a std::string_view
  /// is not guaranteed to be null-terminated, which std::fopen requires.
  /// Calls util::myThrow if the file cannot be opened.
  explicit ScopedInputFile(std::string_view filename) : handle(std::fopen(std::string(filename).c_str(), "r"))
  {
    if (not handle)
      util::myThrow(fmt::format("Cannot open file '{}'", filename));
  }

  ~ScopedInputFile()
  {
    if (handle)
      std::fclose(handle);
  }

  ScopedInputFile(const ScopedInputFile &) = delete;
  ScopedInputFile & operator=(const ScopedInputFile &) = delete;

  /// @brief Borrow the underlying stream (ownership stays with this object).
  std::FILE * get() const
  {
    return handle;
  }

  private :

  std::FILE * handle;
};

} // namespace

/// @brief Load the column names of this configuration from a mcd file.
///
/// Each line of the file gives one column name; columns are indexed in reading
/// order. Every entry of extraColumns is then appended after them.
/// Calls util::myThrow if a mcd was already read, if the file cannot be
/// opened, or if the file already declares one of the extra columns.
///
/// @param mcdFilename Path of the mcd file to read.
void BaseConfig::readMCD(std::string_view mcdFilename)
{
  if (!colIndex2Name.empty())
    util::myThrow("a mcd has already been read for this BaseConfig");

  ScopedInputFile file(mcdFilename);

  // At most 1023 non-newline characters are stored per scan; the trailing
  // whitespace directive then consumes the newline and any following whitespace.
  char lineBuffer[1024];
  while (std::fscanf(file.get(), "%1023[^\n]\n", lineBuffer) == 1)
  {
    colIndex2Name.emplace_back(lineBuffer);
    colName2Index.emplace(lineBuffer, colIndex2Name.size()-1);
  }

  for (auto & column : extraColumns)
  {
    if (colName2Index.count(column))
      util::myThrow(fmt::format("mcd '{}' must not contain column '{}'", mcdFilename, column));

    colIndex2Name.emplace_back(column);
    colName2Index.emplace(column, colIndex2Name.size()-1);
  }
}

/// @brief Load the raw text input into rawInputUtf8.
///
/// The whole file is read, split with util::splitAsUtf8, and every newline and
/// tabulation is replaced by a space.
/// Calls util::myThrow if the file cannot be opened.
///
/// @param rawFilename Path of the raw text file to read.
void BaseConfig::readRawInput(std::string_view rawFilename)
{
  ScopedInputFile file(rawFilename);

  std::string rawInputTemp;
  // The result of fgetc is tested before being stored: looping on feof would
  // append the EOF sentinel itself as a spurious last byte.
  for (int character = std::fgetc(file.get()); character != EOF; character = std::fgetc(file.get()))
    rawInputTemp.push_back(static_cast<char>(character));

  rawInputUtf8 = util::splitAsUtf8(rawInputTemp);
  rawInputUtf8.replace(util::utf8char("\n"), util::utf8char(" "));
  rawInputUtf8.replace(util::utf8char("\t"), util::utf8char(" "));
}

/// @brief Load tab-separated input, one configuration line per input line.
///
/// - Lines shorter than 3 characters (newline included) close the current
///   sequence; they are skipped while no data line has been read yet.
/// - Lines starting with '#' are stored verbatim in column 0.
/// - Other lines are split on tabulations, and must all have the same number
///   of fields as the first such line; fields beyond the known columns are
///   dropped.
///
/// Calls util::myThrow if the file cannot be opened or if a line has an
/// unexpected number of fields.
///
/// @param tsvFilename Path of the tsv file to read.
void BaseConfig::readTSVInput(std::string_view tsvFilename)
{
  ScopedInputFile file(tsvFilename);

  char lineBuffer[100000];
  int inputLineIndex = 0;
  bool inputHasBeenRead = false;
  int usualNbCol = -1;
  int nbMultiwords = 0;

  while (std::fgets(lineBuffer, sizeof(lineBuffer), file.get()) != nullptr)
  {
    std::string_view line(lineBuffer);
    inputLineIndex++;

    if (line.size() < 3)
    {
      if (!inputHasBeenRead)
        continue;

      // Flag the last stored line as closing the sequence.
      get(EOSColName, getNbLines()-1, 0) = EOSSymbol1;

      try
      {
        // Walk backward over the sequence that was just closed, recording at
        // which line index each token id was stored.
        std::map<std::string, int> id2index;
        int firstIndexOfSequence = static_cast<int>(getNbLines())-1;
        for (int i = static_cast<int>(getNbLines())-1; has(0, i, 0); --i)
        {
          if (!isToken(i))
            continue;
          // Reaching the closing flag of the previous sequence ends the walk.
          if (i != static_cast<int>(getNbLines())-1 && getConst(EOSColName, i, 0) == EOSSymbol1)
            break;

          firstIndexOfSequence = i;
          id2index[getConst(idColName, i, 0)] = i;
        }

        // Rewrite every head other than "0" from a token id to a line index.
        // NOTE(review): a head that matches no id of the sequence is silently
        // mapped to 0 by operator[] -- confirm this is intended.
        if (hasColIndex(headColName))
          for (int i = firstIndexOfSequence; i < static_cast<int>(getNbLines()); ++i)
          {
            if (!isToken(i))
              continue;

            auto & head = get(headColName, i, 0);
            if (head == "0")
              continue;

            head = std::to_string(id2index[head]);
          }
      } catch(std::exception & e) {util::myThrow(e.what());}

      continue;
    }

    if (line.back() == '\n')
      line.remove_suffix(1);

    if (line[0] == '#')
    {
      addLines(1);
      get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;
      get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;
      get(0, getNbLines()-1, 0) = std::string(line);
      continue;
    }

    inputHasBeenRead = true;

    auto splited = util::split(line, '\t');
    if (usualNbCol == -1)
      usualNbCol = splited.size();
    if (static_cast<int>(splited.size()) != usualNbCol)
      util::myThrow(fmt::format("in file {} line {} is invalid, it should have {} columns", tsvFilename, inputLineIndex, usualNbCol));

    // Ignore empty nodes
    if (hasColIndex(idColName) && splited[getColIndex(idColName)].find('.') != std::string::npos)
      continue;

    addLines(1);
    get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;

    // Lines counted by a preceding multiword line get the EOSSymbol1 flag in
    // the isMultiColName column.
    if (nbMultiwords > 0)
    {
      get(isMultiColName, getNbLines()-1, 0) = EOSSymbol1;
      nbMultiwords--;
    }
    else
      get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;

    for (unsigned int i = 0; i < splited.size(); i++)
      if (i < colIndex2Name.size())
      {
        std::string value = std::string(splited[i]);
        get(i, getNbLines()-1, 0) = value;
      }

    if (isMultiword(getNbLines()-1))
      nbMultiwords = getMultiwordSize(getNbLines()-1)+1;
  }
}

/// @brief Build a configuration from a mcd file and a tsv and/or raw input.
///
/// Calls util::myThrow if mcdFilename is empty or if both tsvFilename and
/// rawFilename are empty.
///
/// @param mcdFilename Path of the mcd file (mandatory).
/// @param tsvFilename Path of the tsv input, or empty to skip it.
/// @param rawFilename Path of the raw text input, or empty to skip it.
BaseConfig::BaseConfig(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename) : Config(rawInputUtf8)
{
  if (tsvFilename.empty() and rawFilename.empty())
    util::myThrow("tsvFilename and rawFilenames can't be both empty");
  if (mcdFilename.empty())
    util::myThrow("mcdFilename can't be empty");

  readMCD(mcdFilename);

  if (not rawFilename.empty())
    readRawInput(rawFilename);

  if (not tsvFilename.empty())
    readTSVInput(tsvFilename);

  // When nothing is stored at wordIndex, add a comment and one line.
  if (!has(0,wordIndex,0))
  {
    addComment();
    addLines(1);
  }

  // Do not start on a comment line.
  if (isComment(wordIndex))
    moveWordIndex(1);
}

/// @brief Number of columns known to this configuration.
std::size_t BaseConfig::getNbColumns() const
{
  return colIndex2Name.size();
}

/// @brief Index of the column named `colName`.
///
/// Calls util::myThrow if no column has that name.
std::size_t BaseConfig::getColIndex(const std::string & colName) const
{
  auto it = colName2Index.find(colName);
  if (it == colName2Index.end())
    util::myThrow(fmt::format("unknown column name '{}'", colName));

  return it->second;
}

/// @brief Whether a column named `colName` exists.
bool BaseConfig::hasColIndex(const std::string & colName) const
{
  return colName2Index.count(colName) != 0;
}

/// @brief Name of the column at `colIndex`; the index is not range-checked.
const std::string & BaseConfig::getColName(int colIndex) const
{
  return colIndex2Name[colIndex];
}

/// @brief Index of the first line of this configuration, always 0.
std::size_t BaseConfig::getFirstLineIndex() const
{
  return 0;
}