// util.cpp -- authored by Franck Dary
#include "util.hpp"

#include <algorithm>
#include <cstdio>
#include <ctime>
#include <memory>
#include <string>

#include "utf8.hpp"
#include "upper2lower"
/// Divides `l` by util::float2longScale and returns the quotient as a float.
float util::long2float(long l)
{
  const float restored = l / util::float2longScale;

  return restored;
}
/// Multiplies `f` by util::float2longScale and returns the product converted
/// to a long (the fractional part is dropped by the conversion).
long util::float2long(float f)
{
  const long scaled = f * util::float2longScale;

  return scaled;
}
/// Returns the number of utf8 characters `s` is split into by splitAsUtf8,
/// i.e. its length in characters rather than in bytes.
int util::printedLength(std::string_view s)
{
  const auto characters = splitAsUtf8(s);

  return characters.size();
}
/// Returns the part of `s` that follows its last '/'.
///
/// When `s` contains no '/', the whole view is returned. When `s` ends with
/// a '/', the returned view is empty. The result refers to the same
/// underlying characters as `s`; it does not own them.
std::string_view util::getFilenameFromPath(std::string_view s)
{
  // Keep the index as size_t: narrowing it to int would truncate on very
  // long inputs and would rely on npos happening to convert to -1.
  const std::string_view::size_type indexOfSlash = s.find_last_of('/');

  if (indexOfSlash == std::string_view::npos)
    return s;

  return s.substr(indexOfSlash+1);
}
/// Tells whether `c` separates tokens: either a plain space or any character
/// rejected by isIllegal.
bool util::isSeparator(utf8char c)
{
  if (c == ' ')
    return true;

  return isIllegal(c);
}
/// Tells whether `c` is a newline, a tabulation or a carriage return.
bool util::isIllegal(utf8char c)
{
  if (c == '\n')
    return true;
  if (c == '\t')
    return true;

  return c == '\r';
}
bool util::isNumber(const std::string & s)
{
static std::map<utf8char, bool> digits{{"0",true},{"1",true},{"2",true},{"3",true},{"4",true},{"5",true},{"6",true},{"7",true},{"8",true},{"9",true},};
utf8string asUtf8 = splitAsUtf8(s);
bool hasDigit = false;
for (auto & c : asUtf8)
if (digits.count(c))
hasDigit = true;
else if (lower2upper.count(c) or upper2lower.count(c))
return false;
return hasDigit;
}
/// Tells whether `s` starts with the four characters "http".
bool util::isUrl(const std::string & s)
{
  // compare() clamps the compared range to the string length, so a string
  // shorter than four characters simply compares unequal.
  return s.compare(0, 4, "http") == 0;
}
/// Cuts `remaining` at every occurrence of `delimiter` and returns the
/// non-empty pieces in order. Consecutive, leading and trailing delimiters
/// therefore produce no empty element.
std::vector<std::string> util::split(std::string_view remaining, char delimiter)
{
  std::vector<std::string> pieces;

  while (true)
  {
    const auto cut = remaining.find(delimiter);
    if (cut == std::string_view::npos)
      break;

    // cut == 0 means the view starts with the delimiter: nothing to keep.
    if (cut != 0)
      pieces.emplace_back(remaining.substr(0, cut));

    remaining.remove_prefix(cut+1);
  }

  // Whatever follows the last delimiter is the final piece.
  if (not remaining.empty())
    pieces.emplace_back(remaining);

  return pieces;
}
/// Splits `s` into its successive utf8 characters.
///
/// Decoding relies on utf8::next. When it raises an exception, the loop stops
/// and the characters decoded so far are returned. A character whose encoded
/// length is 0 or more than 4 bytes triggers myThrow.
util::utf8string util::splitAsUtf8(std::string_view s)
{
  utf8string characters;

  const char * const endOfInput = s.data()+s.size();
  const char * charStart = s.data();
  const char * cursor = charStart;

  while (cursor < endOfInput)
  {
    // Advances cursor past exactly one encoded character.
    try
    {
      utf8::next(cursor, endOfInput);
    }
    catch (std::exception &)
    {
      break;
    }

    const auto nbBytes = cursor - charStart;
    if (nbBytes > 4 || nbBytes == 0)
      myThrow(fmt::format("Invalid utf8 character at index {}", charStart-s.data()));

    utf8char current;
    for (int i = 0; i < nbBytes; i++)
      current[i] = charStart[i];

    charStart = cursor;
    characters.push_back(current);
  }

  return characters;
}
/// Shortens `s` so that it occupies at most `printedSize` printed characters.
///
/// - If `s` already fits, it is returned unchanged.
/// - If `printedSize` is not positive, nothing can be shown and an empty
///   string is returned.
/// - If std::stof accepts `s` (it starts with a number), the leading
///   `printedSize-1` characters are kept and followed by "…".
/// - Otherwise characters are taken alternately from the start and from the
///   end of `s`, with "…" in the middle, for as long as the result fits.
std::string util::shrink(std::string s, int printedSize)
{
  static const std::string filler = "…";

  if (printedLength(s) <= printedSize)
    return s;

  // From here on s is longer than printedSize. With no room at all, return
  // early instead of computing an iterator located before s.begin().
  if (printedSize <= 0)
    return std::string();

  auto splited = splitAsUtf8(s);

  // std::stof is only used as a "does it start with a number" probe.
  bool startsWithNumber = true;
  try
  {
    std::stof(s);
  }
  catch (const std::exception &)
  {
    startsWithNumber = false;
  }

  if (startsWithNumber)
  {
    // Cut on character boundaries rather than on bytes so that a multi-byte
    // character is never split. splited holds more than printedSize
    // characters here, so every index below is valid.
    std::string head;
    for (int i = 0; i < printedSize-1; i++)
      head = fmt::format("{}{}", head, splited[i]);

    return fmt::format("{}{}", head, filler);
  }

  std::string result;
  std::string begin, end;
  int nbLoop = 0;

  // result always lags one step behind begin/end, so it is the last
  // combination that was checked to fit.
  while (printedLength(begin)+printedLength(end)+printedLength(filler) <= printedSize)
  {
    result = begin + filler + end;

    if (nbLoop % 2)
      end = fmt::format("{}{}", splited[splited.size()-1-(nbLoop/2)], end);
    else
      begin = fmt::format("{}{}", begin, splited[nbLoop/2]);

    ++nbLoop;
  }

  return result;
}
/// Prints `message` on stderr, prefixed by "WARNING" and the source location
/// it was raised from. Execution continues normally afterwards.
void util::warning(std::string_view message, const std::experimental::source_location & location)
{
  fmt::print(
    stderr,
    "WARNING ({}) : {}\n",
    location,
    message
  );
}
/// Prints `message` on stderr, prefixed by "ERROR" and the source location it
/// was raised from, then terminates the program with exit status 1.
void util::error(std::string_view message, const std::experimental::source_location & location)
{
  fmt::print(
    stderr,
    "ERROR ({}) : {}\n",
    location,
    message
  );

  exit(1);
}
/// Reports the description carried by exception `e` through the
/// string_view overload of error, which prints it and exits.
void util::error(const std::exception & e, const std::experimental::source_location & location)
{
  const std::string_view description(e.what());

  error(description, location);
}
/// Throws a std::invalid_argument whose description is `message` prefixed by
/// the source location the call originates from.
void util::myThrow(std::string_view message, const std::experimental::source_location & location)
{
  const std::string description = fmt::format("from ({}) {}", location, message);

  throw std::invalid_argument(description);
}
/// Formats `number` in decimal with a space between each group of three
/// digits, counted from the right (1234567 -> "1 234 567").
///
/// A leading minus sign is kept attached to the first digit group
/// (-123 -> "-123", -1234 -> "-1 234").
std::string util::int2HumanStr(int number)
{
  const std::string nb = std::to_string(number);

  // The sign is not a digit: no separator may be emitted right after it.
  const std::size_t firstDigit = (not nb.empty() and nb[0] == '-') ? 1 : 0;

  std::string result;
  result.reserve(nb.size() + nb.size()/3);

  for (std::size_t i = 0; i < nb.size(); i++)
  {
    result.push_back(nb[i]);

    // Number of characters still to be written after this one.
    const std::size_t remaining = nb.size()-i-1;
    if (i >= firstDigit and remaining > 0 and remaining % 3 == 0)
      result.push_back(' ');
  }

  return result;
}
/// Calls `f` with the match results when `name` matches `reg` entirely.
///
/// @return true when the regex matched (and `f` was called), false otherwise.
bool util::doIfNameMatch(const std::regex & reg, std::string_view name, const std::function<void(const std::smatch &)> & f)
{
  // std::smatch works on std::string iterators, hence the copy. The copy
  // outlives the call to f, so the sub-matches stay valid inside it.
  const std::string subject(name);
  std::smatch captures;

  if (not std::regex_match(subject, captures, reg))
    return false;

  f(captures);

  return true;
}
/// Returns a copy of `s` without its leading spaces/tabulations and without
/// its trailing spaces/tabulations/newlines.
///
/// A string made only of such characters yields an empty string. Newlines at
/// the start of the string are only removed when nothing but trailing
/// whitespace follows them.
std::string util::strip(const std::string & s)
{
  const auto isLeadingBlank = [](char c)
  {
    return c == ' ' or c == '\t';
  };
  const auto isTrailingBlank = [](char c)
  {
    return c == ' ' or c == '\t' or c == '\n';
  };

  std::size_t first = 0;
  while (first < s.size() and isLeadingBlank(s[first]))
    ++first;

  // `end` is one past the last kept character. Using an exclusive bound lets
  // the loop examine the character at `first` too, so that inputs such as
  // "\n" or "  \n" are fully stripped.
  std::size_t end = s.size();
  while (end > first and isTrailingBlank(s[end-1]))
    --end;

  return s.substr(first, end-first);
}
/// Lists the regular files located directly inside `directory` (no
/// recursion) whose extension equals `extension`.
///
/// `extension` is compared against std::filesystem::path::extension(), so it
/// is expected to include the leading dot (e.g. ".txt").
/// Errors raised by the directory iteration propagate to the caller.
std::vector<std::filesystem::path> util::findFilesByExtension(std::filesystem::path directory, std::string extension)
{
  std::vector<std::filesystem::path> files;

  // Iterate by const reference: copying every directory_entry is needless.
  for (const auto & entry : std::filesystem::directory_iterator(directory))
  {
    if (not entry.is_regular_file())
      continue;

    const auto & path = entry.path();
    if (path.extension() == extension)
      files.push_back(path);
  }

  return files;
}
/// Returns the current local time of day formatted as "HH:MM:SS".
std::string util::getTime()
{
  char formatted[80];

  const std::time_t now = std::time(nullptr);
  std::strftime(formatted, sizeof(formatted), "%H:%M:%S", std::localtime(&now));

  return std::string(formatted);
}
/// Draws a pseudo-random number with std::rand and returns true with
/// (approximately) the given `probability`.
///
/// The draw is reduced modulo 100000 and compared with
/// `probability` * 100000 truncated to an integer.
// NOTE(review): the standard only guarantees RAND_MAX >= 32767; on a platform
// where RAND_MAX < 100000 the draw cannot cover the whole range -- confirm
// the targeted platforms.
bool util::choiceWithProbability(float probability)
{
  const int resolution = 100000;
  const int threshold = resolution * probability;
  const int draw = std::rand() % resolution;

  return draw < threshold;
}
/// Tells whether `c` is an uppercase character, i.e. a key of the
/// upper2lower table.
bool util::isUppercase(utf8char c)
{
  return upper2lower.count(c) != 0;
}
/// Returns a lowercased copy of `s`: the string is split into utf8
/// characters, lowercased in place, then formatted back into a std::string.
std::string util::lower(const std::string & s)
{
  auto characters = util::splitAsUtf8(s);

  lowerInPlace(characters);

  return fmt::format("{}", characters);
}
/// Replaces every character of `s` that is a key of upper2lower by its
/// mapped value; other characters are left untouched.
void util::lowerInPlace(utf8string & s)
{
  for (auto & symbol : s)
    if (auto found = upper2lower.find(symbol); found != upper2lower.end())
      symbol = found->second;
}
/// Returns a lowercased copy of `s`; the argument itself is not modified.
util::utf8string util::lower(const utf8string & s)
{
  utf8string lowered = s;

  lowerInPlace(lowered);

  return lowered;
}
/// Replaces `c` by its upper2lower mapping when it has one; otherwise leaves
/// it untouched.
void util::lowerInPlace(utf8char & c)
{
  if (auto found = upper2lower.find(c); found != upper2lower.end())
    c = found->second;
}
/// Returns the lowercase counterpart of `c`, or `c` itself when it has none.
util::utf8char util::lower(const utf8char & c)
{
  auto lowered = c;

  lowerInPlace(lowered);

  return lowered;
}
/// Returns an uppercased copy of `s`: the string is split into utf8
/// characters, uppercased in place, then formatted back into a std::string.
std::string util::upper(const std::string & s)
{
  // Must stay a non-const lvalue so that the in-place overload is selected.
  auto characters = util::splitAsUtf8(s);

  upper(characters);

  return fmt::format("{}", characters);
}
/// Replaces, in place, every character of `s` that is a key of lower2upper
/// by its mapped value; other characters are left untouched.
void util::upper(utf8string & s)
{
  for (auto & symbol : s)
    if (auto found = lower2upper.find(symbol); found != lower2upper.end())
      symbol = found->second;
}
/// Returns an uppercased copy of `s`; the argument itself is not modified.
util::utf8string util::upper(const utf8string & s)
{
  // Non-const copy, so the call below resolves to the in-place overload.
  auto uppered = s;

  upper(uppered);

  return uppered;
}
/// Replaces `c`, in place, by its lower2upper mapping when it has one;
/// otherwise leaves it untouched.
void util::upper(utf8char & c)
{
  if (auto found = lower2upper.find(c); found != lower2upper.end())
    c = found->second;
}
/// Reads a text file and returns its content as utf8 strings in which every
/// newline and tabulation has been replaced by a space.
///
/// @param filename   Path of the file to read.
/// @param lineByLine When false, the whole file becomes a single element
///                   (present even if the file is empty). When true, one
///                   element is produced per line, and lines that decode to
///                   an empty utf8 string are skipped.
/// @throws std::invalid_argument (through myThrow) when the file cannot be
///         opened; exceptions raised by splitAsUtf8 also propagate.
std::vector<util::utf8string> util::readFileAsUtf8(std::string_view filename, bool lineByLine)
{
  // Closes the file on every exit path, including exceptions.
  struct FileCloser
  {
    void operator()(std::FILE * f) const
    {
      std::fclose(f);
    }
  };

  // A string_view is not guaranteed to be NUL-terminated, fopen needs it.
  const std::string path(filename);
  std::unique_ptr<std::FILE, FileCloser> file(std::fopen(path.c_str(), "r"));
  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", filename));

  const auto toUtf8 = [](const std::string & raw)
  {
    auto converted = util::splitAsUtf8(raw);
    converted.replace(util::utf8char("\n"), util::utf8char(" "));
    converted.replace(util::utf8char("\t"), util::utf8char(" "));
    return converted;
  };

  std::vector<utf8string> res;
  std::string lineTemp;

  if (!lineByLine)
  {
    // Test the value returned by fgetc rather than feof: this neither stores
    // the EOF sentinel as a character nor spins forever on a read error.
    for (int c = std::fgetc(file.get()); c != EOF; c = std::fgetc(file.get()))
      lineTemp.push_back(static_cast<char>(c));

    res.emplace_back(toUtf8(lineTemp));

    return res;
  }

  bool endReached = false;
  while (not endReached)
  {
    lineTemp.clear();

    // Accumulate one line, terminating newline included.
    while (true)
    {
      const int c = std::fgetc(file.get());
      if (c == EOF)
      {
        endReached = true;
        break;
      }

      lineTemp.push_back(static_cast<char>(c));
      if (c == '\n')
        break;
    }

    auto line = toUtf8(lineTemp);
    if (!line.empty())
      res.emplace_back(line);
  }

  return res;
}
/// Reads a TSV file and groups its raw lines into sentences.
///
/// Every line read is stored as-is (trailing newline included) in the current
/// sentence. A line shorter than 3 characters (newline excluded) closes the
/// current sentence, unless no content line has been read yet. A
/// "# global.columns = ..." line is remembered, without its newline, and is
/// additionally inserted at the front of every returned sentence.
///
/// @param tsvFilename Path of the file to read.
/// @return The sentences, each one being the list of its lines. A trailing
///         empty sentence is dropped.
/// @throws std::invalid_argument (through myThrow) when the file cannot be
///         opened.
std::vector<std::vector<std::string>> util::readTSV(std::string_view tsvFilename)
{
  // Closes the file on every exit path, including exceptions.
  struct FileCloser
  {
    void operator()(std::FILE * f) const
    {
      std::fclose(f);
    }
  };

  // Compiling a std::regex is costly: build it once, not once per line.
  static const std::regex globalColumnsRegex("(?:(?:\\s|\\t)*)#(?:(?:\\s|\\t)*)global.columns(?:(?:\\s|\\t)*)=(?:(?:\\s|\\t)*)(.+)");

  // A string_view is not guaranteed to be NUL-terminated, fopen needs it.
  const std::string path(tsvFilename);
  std::unique_ptr<std::FILE, FileCloser> file(std::fopen(path.c_str(), "r"));
  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));

  std::vector<std::vector<std::string>> sentences;
  sentences.emplace_back();

  constexpr int lineBufferSize = 100000;
  char lineBuffer[lineBufferSize];
  bool inputHasBeenRead = false;
  std::string mcdLine;

  // fgets returns a null pointer on end of file or on error.
  while (std::fgets(lineBuffer, lineBufferSize, file.get()) != nullptr)
  {
    std::string line(lineBuffer);

    // The raw line is stored before any trimming or filtering.
    sentences.back().emplace_back(line);

    if (not line.empty() and line.back() == '\n')
      line.pop_back();

    if (line.size() < 3)
    {
      // Short lines seen before any content do not open a new sentence.
      if (inputHasBeenRead)
        sentences.emplace_back();
      continue;
    }

    const bool isGlobalColumns = util::doIfNameMatch(globalColumnsRegex, line, [&mcdLine, &line](const auto &)
    {
      mcdLine = line;
    });
    if (isGlobalColumns)
      continue;

    inputHasBeenRead = true;
  }

  if (sentences.back().empty())
    sentences.pop_back();

  if (not mcdLine.empty())
    for (auto & sentence : sentences)
      sentence.insert(sentence.begin(), mcdLine);

  return sentences;
}