Skip to content
Snippets Groups Projects
Commit c10d0cbf authored by Franck Dary
Browse files

Improved speed of readTSV

parent d848584f
No related branches found
No related tags found
No related merge requests found
......@@ -43,80 +43,58 @@ void BaseConfig::readRawInput(std::string_view rawFilename)
rawInput.replace(util::utf8char("\t"), util::utf8char(" "));
}
void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<int> & _sentencesIndexes)
void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<int> & sentencesIndexes)
{
auto sentencesIndexes = _sentencesIndexes;
if (sentencesIndexes.empty())
sentencesIndexes.emplace_back(-1);
std::vector<std::vector<std::string>> sentences;
for (int targetSentenceIndex : sentencesIndexes)
std::FILE * file = std::fopen(tsvFilename.data(), "r");
if (not file)
util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));
char lineBuffer[100000];
bool inputHasBeenRead = false;
sentences.emplace_back();
while (!std::feof(file))
{
std::FILE * file = std::fopen(tsvFilename.data(), "r");
if (not file)
util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));
char lineBuffer[100000];
int inputLineIndex = 0;
bool inputHasBeenRead = false;
if (lineBuffer != std::fgets(lineBuffer, 100000, file))
break;
std::string_view line(lineBuffer);
sentences.back().emplace_back(line);
if (line.size() < 3)
{
if (!inputHasBeenRead)
continue;
sentences.emplace_back();
continue;
}
inputHasBeenRead = true;
}
std::fclose(file);
for (unsigned int i = 0; i < (sentencesIndexes.size() ? sentencesIndexes.size() : sentences.size()); i++)
{
int targetSentenceIndex = sentencesIndexes.size() ? sentencesIndexes[i] : i;
if (targetSentenceIndex >= (int)sentences.size())
util::myThrow(fmt::format("asking for sentence index {} while we detected only {} sentences", targetSentenceIndex, sentences.size()));
auto & sentence = sentences[targetSentenceIndex];
int usualNbCol = -1;
int nbMultiwords = 0;
int curSentenceIndex = 0;
std::vector<std::string> pendingComments;
while (!std::feof(file))
for (std::string line : sentence)
{
if (lineBuffer != std::fgets(lineBuffer, 100000, file))
break;
std::string_view line(lineBuffer);
inputLineIndex++;
if (line.size() < 3)
{
if (!inputHasBeenRead)
continue;
if (targetSentenceIndex == -1 or targetSentenceIndex == curSentenceIndex)
{
get(EOSColName, getNbLines()-1, 0) = EOSSymbol1;
try
{
std::map<std::string, int> id2index;
int firstIndexOfSequence = getNbLines()-1;
for (int i = (int)getNbLines()-1; has(0, i, 0); --i)
{
if (!isToken(i))
continue;
if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1)
break;
firstIndexOfSequence = i;
id2index[getConst(idColName, i, 0)] = i;
}
if (hasColIndex(headColName))
for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i)
{
if (!isToken(i))
continue;
auto & head = get(headColName, i, 0);
if (head == "0")
head = "-1";
else
head = std::to_string(id2index[head]);
}
} catch(std::exception & e) {util::myThrow(e.what());}
}
curSentenceIndex += 1;
continue;
}
if (line.back() == '\n')
line.remove_suffix(1);
line.pop_back();
if (line[0] == '#')
{
......@@ -126,22 +104,16 @@ void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<in
}))
continue;
if (targetSentenceIndex == -1 or targetSentenceIndex == curSentenceIndex)
pendingComments.emplace_back(line);
pendingComments.emplace_back(line);
continue;
}
inputHasBeenRead = true;
auto splited = util::split(line, '\t');
if (usualNbCol == -1)
usualNbCol = splited.size();
if ((int)splited.size() != usualNbCol)
util::myThrow(fmt::format("in file {} line {} is invalid, it shoud have {} columns", tsvFilename, line, usualNbCol));
if (targetSentenceIndex != -1 and targetSentenceIndex != curSentenceIndex)
continue;
// Ignore empty nodes
if (hasColIndex(idColName) && splited[getColIndex(idColName)].find('.') != std::string::npos)
continue;
......@@ -168,9 +140,38 @@ void BaseConfig::readTSVInput(std::string_view tsvFilename, const std::vector<in
if (isMultiword(getNbLines()-1))
nbMultiwords = getMultiwordSize(getNbLines()-1)+1;
}
std::fclose(file);
} // End for line in sentence
try
{
std::map<std::string, int> id2index;
int firstIndexOfSequence = getNbLines()-1;
for (int i = (int)getNbLines()-1; has(0, i, 0); --i)
{
if (!isToken(i))
continue;
if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1)
break;
firstIndexOfSequence = i;
id2index[getConst(idColName, i, 0)] = i;
}
if (hasColIndex(headColName))
for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i)
{
if (!isToken(i))
continue;
auto & head = get(headColName, i, 0);
if (head == "0")
head = "-1";
else
head = std::to_string(id2index[head]);
}
get(EOSColName, getNbLines()-1, 0) = EOSSymbol1;
} catch(std::exception & e) {util::myThrow(e.what());}
} // End for targetSentenceIndex
}
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment