From c0f915b2766aaeeb01fec1361fb47d6ac1cbe9bc Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Thu, 12 Dec 2019 18:38:58 +0100 Subject: [PATCH] Can read Config --- dev/src/dev.cpp | 2 + reading_machine/include/Config.hpp | 17 +++- reading_machine/src/Config.cpp | 130 +++++++++++++++++++++++------ 3 files changed, 121 insertions(+), 28 deletions(-) diff --git a/dev/src/dev.cpp b/dev/src/dev.cpp index 26d0908..9dc76c0 100644 --- a/dev/src/dev.cpp +++ b/dev/src/dev.cpp @@ -10,6 +10,8 @@ int main(int argc, char * argv[]) Config config(argv[3], argv[1], argv[2]); + config.print(stdout); + return 0; } diff --git a/reading_machine/include/Config.hpp b/reading_machine/include/Config.hpp index ca74bb8..c20859e 100644 --- a/reading_machine/include/Config.hpp +++ b/reading_machine/include/Config.hpp @@ -21,10 +21,16 @@ class Config { + public : + + static constexpr const char * EOSColName = "EOS"; + static constexpr const char * EOSSymbol1 = "1"; + static constexpr const char * EOSSymbol0 = "0"; + private : - std::unordered_map<int, std::string> colIndex2Name; - std::unordered_map<int, std::string> colName2Index; + std::vector<std::string> colIndex2Name; + std::unordered_map<std::string, int> colName2Index; std::string rawInput; std::vector<std::string_view> rawInputUtf8; @@ -33,9 +39,16 @@ class Config using Line = std::vector<ReferenceAndHypotheses>; std::vector<Line> lines; + private : + + void readMCD(std::string_view mcdFilename); + void readRawInput(std::string_view rawFilename); + void readTSVInput(std::string_view tsvFilename); + public : Config(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename); + void print(FILE * dest) const; }; #endif diff --git a/reading_machine/src/Config.cpp b/reading_machine/src/Config.cpp index 581d72f..e3ee2f4 100644 --- a/reading_machine/src/Config.cpp +++ b/reading_machine/src/Config.cpp @@ -9,49 +9,127 @@ #include "Config.hpp" #include "util.hpp" -Config::Config(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename) +void Config::readMCD(std::string_view mcdFilename) { - if (tsvFilename.empty() and rawFilename.empty()) - util::myThrow("tsvFilename and rawFilenames can't be both empty"); - if (mcdFilename.empty()) - util::myThrow("mcdFilename can't be empty"); + if (!colIndex2Name.empty()) + util::myThrow("a mcd has already been read for this Config"); + + std::FILE * file = std::fopen(mcdFilename.data(), "r"); + if (not file) + util::myThrow(fmt::format("Cannot open file '{}'", mcdFilename)); + + char lineBuffer[1024]; + while (std::fscanf(file, "%1023[^\n]\n", lineBuffer) == 1) { - std::FILE * file = std::fopen(mcdFilename.data(), "r"); + colIndex2Name.emplace_back(lineBuffer); + colName2Index.emplace(lineBuffer, colIndex2Name.size()-1); + } - if (not file) - util::myThrow(fmt::format("Cannot open file '{}'", rawFilename)); + std::fclose(file); - char lineBuffer[1024]; - while (std::fscanf(file, "%1023[^\n]\n", lineBuffer) == 1) - { - auto splited = util::split(lineBuffer, ' '); + if (colName2Index.count(EOSColName)) + util::myThrow(fmt::format("mcd '{}' must not contain column '{}'", mcdFilename, EOSColName)); + colIndex2Name.emplace_back(EOSColName); + colName2Index.emplace(EOSColName, colIndex2Name.size()-1); +} - if (splited.size() != 2) - util::myThrow(fmt::format("Invalid line '{}' for mcd '{}'", lineBuffer, mcdFilename)); - } +void Config::readRawInput(std::string_view rawFilename) +{ + std::FILE * file = std::fopen(rawFilename.data(), "r"); - std::fclose(file); - } + if (not file) + util::myThrow(fmt::format("Cannot open file '{}'", rawFilename)); - if (not rawFilename.empty()) + while (not std::feof(file)) + rawInput.push_back(std::fgetc(file)); + + std::fclose(file); + + rawInputUtf8 = util::splitAsUtf8(rawInput); +} + +void Config::readTSVInput(std::string_view tsvFilename) +{ + std::FILE * file = std::fopen(tsvFilename.data(), "r"); + + if (not file) + util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename)); + + char lineBuffer[100000]; + int inputLineIndex = 0; + bool inputHasBeenRead = false; + int usualNbCol = -1; + while (!std::feof(file)) { - std::FILE * file = std::fopen(rawFilename.data(), "r"); + if (lineBuffer != std::fgets(lineBuffer, 100000, file)) + break; + + std::string_view line(lineBuffer); + inputLineIndex++; - if (not file) - util::myThrow(fmt::format("Cannot open file '{}'", rawFilename)); + if (line.size() < 3) + { + if (!inputHasBeenRead) + continue; + + lines.back()[colName2Index[EOSColName]][0] = EOSSymbol1; + + continue; + } + else if (line[0] == '#') + continue; + + if (line.back() == '\n') + line.remove_suffix(1); - while (not std::feof(file)) - rawInput.push_back(std::fgetc(file)); + inputHasBeenRead = true; - std::fclose(file); + auto splited = util::split(line, '\t'); + if (usualNbCol == -1) + usualNbCol = splited.size(); + if ((int)splited.size() != usualNbCol) + util::myThrow(fmt::format("in file {} line {} is invalid, it shoud have {} columns", tsvFilename, line, usualNbCol)); - rawInputUtf8 = util::splitAsUtf8(rawInput); + lines.emplace_back(); + for (unsigned int i = 0; i < colIndex2Name.size(); i++) + { + lines.back().emplace_back(); + lines.back().back().emplace_back(""); + } + lines.back()[colName2Index[EOSColName]][0] = EOSSymbol0; + for (unsigned int i = 0; i < splited.size(); i++) + if (i < colIndex2Name.size()) + lines.back()[i][0] = splited[i]; } + std::fclose(file); +} + +Config::Config(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename) +{ + if (tsvFilename.empty() and rawFilename.empty()) + util::myThrow("tsvFilename and rawFilenames can't be both empty"); + if (mcdFilename.empty()) + util::myThrow("mcdFilename can't be empty"); + + readMCD(mcdFilename); + + if (not rawFilename.empty()) + readRawInput(rawFilename); + if (not tsvFilename.empty()) + readTSVInput(tsvFilename); +} + +void Config::print(FILE * dest) const +{ + for (auto & line : lines) { - + for (unsigned int i = 0; i < line.size()-1; i++) + fmt::print(dest, "{}{}", line[i].back(), i < line.size()-2 ? "\t" : "\n"); + if (line[colName2Index.at(EOSColName)].back() == EOSSymbol1) + fmt::print(dest, "\n"); } } -- GitLab