diff --git a/reading_machine/src/BaseConfig.cpp b/reading_machine/src/BaseConfig.cpp index c9956d4886037ce308840293f6d800c0b7a3272d..3b511e6e17f16c7351c4780ed153f5048435a074 100644 --- a/reading_machine/src/BaseConfig.cpp +++ b/reading_machine/src/BaseConfig.cpp @@ -43,80 +43,58 @@ void BaseConfig::readRawInput(std::string_view rawFilename) rawInput.replace(util::utf8char("\t"), util::utf8char(" ")); } -void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<int> & _sentencesIndexes) +void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<int> & sentencesIndexes) { - auto sentencesIndexes = _sentencesIndexes; - if (sentencesIndexes.empty()) - sentencesIndexes.emplace_back(-1); + std::vector<std::vector<std::string>> sentences; - for (int targetSentenceIndex : sentencesIndexes) + std::FILE * file = std::fopen(tsvFilename.data(), "r"); + + if (not file) + util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename)); + + char lineBuffer[100000]; + bool inputHasBeenRead = false; + + sentences.emplace_back(); + while (!std::feof(file)) { - std::FILE * file = std::fopen(tsvFilename.data(), "r"); - - if (not file) - util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename)); - - char lineBuffer[100000]; - int inputLineIndex = 0; - bool inputHasBeenRead = false; + if (lineBuffer != std::fgets(lineBuffer, 100000, file)) + break; + + std::string_view line(lineBuffer); + sentences.back().emplace_back(line); + + if (line.size() < 3) + { + if (!inputHasBeenRead) + continue; + + sentences.emplace_back(); + continue; + } + + inputHasBeenRead = true; + } + + std::fclose(file); + + for (unsigned int i = 0; i < (sentencesIndexes.size() ? sentencesIndexes.size() : sentences.size()); i++) + { + int targetSentenceIndex = sentencesIndexes.size() ? sentencesIndexes[i] : i; + if (targetSentenceIndex >= (int)sentences.size()) + util::myThrow(fmt::format("asking for sentence index {} while we detected only {} sentences", targetSentenceIndex, sentences.size())); + auto & sentence = sentences[targetSentenceIndex]; int usualNbCol = -1; int nbMultiwords = 0; - int curSentenceIndex = 0; std::vector<std::string> pendingComments; - while (!std::feof(file)) + for (std::string line : sentence) { - if (lineBuffer != std::fgets(lineBuffer, 100000, file)) - break; - - std::string_view line(lineBuffer); - inputLineIndex++; - if (line.size() < 3) - { - if (!inputHasBeenRead) - continue; - - if (targetSentenceIndex == -1 or targetSentenceIndex == curSentenceIndex) - { - get(EOSColName, getNbLines()-1, 0) = EOSSymbol1; - - try - { - std::map<std::string, int> id2index; - int firstIndexOfSequence = getNbLines()-1; - for (int i = (int)getNbLines()-1; has(0, i, 0); --i) - { - if (!isToken(i)) - continue; - - if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1) - break; - - firstIndexOfSequence = i; - id2index[getConst(idColName, i, 0)] = i; - } - if (hasColIndex(headColName)) - for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i) - { - if (!isToken(i)) - continue; - auto & head = get(headColName, i, 0); - if (head == "0") - head = "-1"; - else - head = std::to_string(id2index[head]); - } - } catch(std::exception & e) {util::myThrow(e.what());} - } - - curSentenceIndex += 1; - continue; - } - + if (line.back() == '\n') - line.remove_suffix(1); + line.pop_back(); if (line[0] == '#') { @@ -126,22 +104,16 @@ void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<in })) continue; - if (targetSentenceIndex == -1 or targetSentenceIndex == curSentenceIndex) - pendingComments.emplace_back(line); + pendingComments.emplace_back(line); continue; } - inputHasBeenRead = true; - auto splited = util::split(line, '\t'); if (usualNbCol == -1) usualNbCol = splited.size(); if ((int)splited.size() != usualNbCol) util::myThrow(fmt::format("in file {} line {} is invalid, it shoud have {} columns", tsvFilename, line, usualNbCol)); - if (targetSentenceIndex != -1 and targetSentenceIndex != curSentenceIndex) - continue; - // Ignore empty nodes if (hasColIndex(idColName) && splited[getColIndex(idColName)].find('.') != std::string::npos) continue; @@ -168,9 +140,38 @@ void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<in if (isMultiword(getNbLines()-1)) nbMultiwords = getMultiwordSize(getNbLines()-1)+1; - } - - std::fclose(file); + } // End for line in sentence + + try + { + std::map<std::string, int> id2index; + int firstIndexOfSequence = getNbLines()-1; + for (int i = (int)getNbLines()-1; has(0, i, 0); --i) + { + if (!isToken(i)) + continue; + + if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1) + break; + + firstIndexOfSequence = i; + id2index[getConst(idColName, i, 0)] = i; + } + if (hasColIndex(headColName)) + for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i) + { + if (!isToken(i)) + continue; + auto & head = get(headColName, i, 0); + if (head == "0") + head = "-1"; + else + head = std::to_string(id2index[head]); + } + + get(EOSColName, getNbLines()-1, 0) = EOSSymbol1; + } catch(std::exception & e) {util::myThrow(e.what());} + } // End for targetSentenceIndex }