Newer
Older
#include "BaseConfig.hpp"
#include "util.hpp"
/// Read the column names listed in a MCD file, one name per line, and
/// register them in colIndex2Name / colName2Index in file order. The names
/// found in extraColumns are appended after the ones read from the file.
///
/// @param mcdFilename Path of the MCD file to read.
///
/// util::myThrow is called when a MCD has already been read for this
/// BaseConfig, when the file cannot be opened, or when the file already
/// declares one of the extra columns.
void BaseConfig::readMCD(std::string_view mcdFilename)
{
  if (!colIndex2Name.empty())
    util::myThrow("a mcd has already been read for this BaseConfig");

  // A string_view is not guaranteed to be null-terminated while fopen expects
  // a C string, so the path goes through an owning std::string first.
  const std::string path(mcdFilename);
  std::FILE * file = std::fopen(path.c_str(), "r");

  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", mcdFilename));

  char lineBuffer[1024];

  // The scanset reads at most 1023 non-newline characters; the trailing
  // whitespace directive then consumes the newline together with any
  // whitespace that follows it (blank lines, leading blanks of the next line).
  // A line longer than 1023 characters is therefore read as several names,
  // and reading stops at the first conversion failure (e.g. a file that
  // starts with an empty line).
  while (std::fscanf(file, "%1023[^\n]\n", lineBuffer) == 1)
  {
    colIndex2Name.emplace_back(lineBuffer);
    colName2Index.emplace(lineBuffer, colIndex2Name.size()-1);
  }

  std::fclose(file);

  // Extra columns must not clash with the ones declared by the file.
  for (auto & column : extraColumns)
  {
    if (colName2Index.count(column))
      util::myThrow(fmt::format("mcd '{}' must not contain column '{}'", mcdFilename, column));

    colIndex2Name.emplace_back(column);
    colName2Index.emplace(column, colIndex2Name.size()-1);
  }
}
/// Load a raw text file into rawInputUtf8, replacing every newline and
/// tabulation by a space.
///
/// @param rawFilename Path of the raw text file to read.
///
/// util::myThrow is called when the file cannot be opened.
void BaseConfig::readRawInput(std::string_view rawFilename)
{
  // A string_view is not guaranteed to be null-terminated while fopen expects
  // a C string, so the path goes through an owning std::string first.
  const std::string path(rawFilename);
  std::FILE * file = std::fopen(path.c_str(), "r");

  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", rawFilename));

  std::string rawInputTemp;

  // Test the value returned by fgetc instead of feof: feof only becomes true
  // after a read has already failed, so looping on it would store the EOF
  // sentinel as a spurious last character, and would never terminate on a
  // read error (fgetc keeps returning EOF without setting the EOF flag).
  for (int character = std::fgetc(file); character != EOF; character = std::fgetc(file))
    rawInputTemp.push_back(static_cast<char>(character));

  std::fclose(file);

  rawInputUtf8 = util::splitAsUtf8(rawInputTemp);
  rawInputUtf8.replace(util::utf8char("\n"), util::utf8char(" "));
  rawInputUtf8.replace(util::utf8char("\t"), util::utf8char(" "));
}
void BaseConfig::readTSVInput(std::string_view tsvFilename)
{
std::FILE * file = std::fopen(tsvFilename.data(), "r");
if (not file)
util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));
char lineBuffer[100000];
int inputLineIndex = 0;
bool inputHasBeenRead = false;
int usualNbCol = -1;
while (!std::feof(file))
{
if (lineBuffer != std::fgets(lineBuffer, 100000, file))
break;
std::string_view line(lineBuffer);
inputLineIndex++;
if (line.size() < 3)
{
if (!inputHasBeenRead)
continue;
get(EOSColName, getNbLines()-1, 0) = EOSSymbol1;
try
{
std::map<std::string, int> id2index;
int firstIndexOfSequence = getNbLines()-1;
for (int i = (int)getNbLines()-1; has(0, i, 0); --i)
{
if (!isToken(i))
continue;
if (i != (int)getNbLines()-1 && getConst(EOSColName, i, 0) == EOSSymbol1)
break;
firstIndexOfSequence = i;
id2index[getConst(idColName, i, 0)] = i;
}
if (hasColIndex(headColName))
for (int i = firstIndexOfSequence; i < (int)getNbLines(); ++i)
{
if (!isToken(i))
continue;
auto & head = get(headColName, i, 0);
if (head == "0")
continue;
head = std::to_string(id2index[head]);
}
} catch(std::exception & e) {util::myThrow(e.what());}
continue;
}
if (line.back() == '\n')
line.remove_suffix(1);
if (line[0] == '#')
{
addLines(1);
get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;
get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;
get(0, getNbLines()-1, 0) = std::string(line);
continue;
}
inputHasBeenRead = true;
auto splited = util::split(line, '\t');
if (usualNbCol == -1)
usualNbCol = splited.size();
if ((int)splited.size() != usualNbCol)
util::myThrow(fmt::format("in file {} line {} is invalid, it shoud have {} columns", tsvFilename, line, usualNbCol));
// Ignore empty nodes
if (hasColIndex(idColName) && splited[getColIndex(idColName)].find('.') != std::string::npos)
continue;
addLines(1);
get(EOSColName, getNbLines()-1, 0) = EOSSymbol0;
if (nbMultiwords > 0)
{
get(isMultiColName, getNbLines()-1, 0) = EOSSymbol1;
nbMultiwords--;
}
else
get(isMultiColName, getNbLines()-1, 0) = EOSSymbol0;
for (unsigned int i = 0; i < splited.size(); i++)
if (i < colIndex2Name.size())
{
std::string value = std::string(splited[i]);
get(i, getNbLines()-1, 0) = value;
}
if (isMultiword(getNbLines()-1))
nbMultiwords = getMultiwordSize(getNbLines()-1)+1;
}
std::fclose(file);
}
// Build a configuration from a MCD file and at least one input file.
//
// mcdFilename : path of the MCD file, must not be empty.
// tsvFilename : path of the TSV input, read when not empty.
// rawFilename : path of the raw text input, read when not empty.
//
// util::myThrow is called when mcdFilename is empty or when both tsvFilename
// and rawFilename are empty.
//
// NOTE(review): rawInputUtf8 is handed to the Config base before this body
// fills it through readRawInput; presumably Config only keeps a reference to
// it, confirm against the Config declaration.
BaseConfig::BaseConfig(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename) : Config(rawInputUtf8)
{
// At least one input source is required.
if (tsvFilename.empty() and rawFilename.empty())
util::myThrow("tsvFilename and rawFilenames can't be both empty");
if (mcdFilename.empty())
util::myThrow("mcdFilename can't be empty");
// The columns must be known before any input is loaded.
readMCD(mcdFilename);
if (not rawFilename.empty())
readRawInput(rawFilename);
if (not tsvFilename.empty())
readTSVInput(tsvFilename);
// NOTE(review): the "Franck Dary" / "committed" lines below are not C++; they
// look like residue from a web blame view. The statement that originally
// guarded the braced block around addComment() (likely a loop or a condition)
// is not visible here; restore this section from version control.
Franck Dary
committed
{
addComment();
Franck Dary
committed
}
// Step over a comment line when wordIndex currently designates one.
if (isComment(wordIndex))
moveWordIndex(1);
}
/// @return The number of columns currently registered in colIndex2Name.
std::size_t BaseConfig::getNbColumns() const
{
  const std::size_t columnCount = colIndex2Name.size();

  return columnCount;
}
/// Look up the index registered for a column name.
///
/// @param colName Name of the column to look up.
/// @return The index stored in colName2Index for this name.
///
/// util::myThrow is called when the name is unknown.
std::size_t BaseConfig::getColIndex(const std::string & colName) const
{
  const auto entry = colName2Index.find(colName);
  const bool isKnown = entry != colName2Index.end();

  if (!isKnown)
    util::myThrow(fmt::format("unknown column name '{}'", colName));

  return entry->second;
}
/// @param colName Name of the column to test.
/// @return true when colName2Index holds an entry for this name.
bool BaseConfig::hasColIndex(const std::string & colName) const
{
  return colName2Index.find(colName) != colName2Index.end();
}
/// @param colIndex Index of the column; it is not range-checked here.
/// @return The name stored at this position of colIndex2Name.
const std::string & BaseConfig::getColName(int colIndex) const
{
  const std::string & columnName = colIndex2Name[colIndex];

  return columnName;
}
/// @return The index of the first line, which is always 0 for a BaseConfig.
std::size_t BaseConfig::getFirstLineIndex() const
{
  constexpr std::size_t firstLineIndex = 0;

  return firstLineIndex;
}