#include "util.hpp"
#include "utf8.hpp"
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <memory>
#include "upper2lower"

namespace
{

/// Deleter that lets a std::unique_ptr own a C stdio stream.
struct FileCloser
{
  void operator()(std::FILE * file) const
  {
    if (file)
      std::fclose(file);
  }
};

using FilePtr = std::unique_ptr<std::FILE, FileCloser>;

} // namespace

/// @brief Converts a scaled long back to a float by dividing by util::float2longScale.
float util::long2float(long l)
{
  return l / util::float2longScale;
}

/// @brief Converts a float to a long by multiplying by util::float2longScale.
///
/// Calls util::error when dividing the scaled value by `f` does not give back
/// the scale (after rounding).
long util::float2long(float f)
{
  const float res = f * util::float2longScale;

  if (f != 0 and std::round(res / f) != std::round(util::float2longScale))
    util::error(fmt::format("Float '{}' is too big to be converted to long ({}!={})", f, res/f, util::float2longScale));

  return static_cast<long>(res);
}

/// @brief Number of elements splitAsUtf8 produces for `s`.
int util::printedLength(std::string_view s)
{
  return splitAsUtf8(s).size();
}

/// @brief Returns the part of `s` after its last '/', or all of `s` if it has none.
///
/// The returned view points into the storage referenced by `s`.
std::string_view util::getFilenameFromPath(std::string_view s)
{
  const auto lastSlash = s.find_last_of('/');
  if (lastSlash == std::string_view::npos)
    return s;

  return s.substr(lastSlash + 1);
}

/// @brief True for a space or any character rejected by isIllegal.
bool util::isSeparator(utf8char c)
{
  return c == ' ' || isIllegal(c);
}

/// @brief True for newline, tab and carriage return.
bool util::isIllegal(utf8char c)
{
  return c == '\n' || c == '\t' || c == '\r';
}

/// @brief True when `s` contains at least one ASCII digit and no character
/// that is a key of lower2upper or upper2lower.
bool util::isNumber(const std::string & s)
{
  static const std::map<utf8char, bool> digits{{"0",true},{"1",true},{"2",true},{"3",true},{"4",true},{"5",true},{"6",true},{"7",true},{"8",true},{"9",true},};

  const utf8string asUtf8 = splitAsUtf8(s);
  bool hasDigit = false;

  for (const auto & c : asUtf8)
  {
    if (digits.count(c))
      hasDigit = true;
    else if (lower2upper.count(c) or upper2lower.count(c))
      return false;
  }

  return hasDigit;
}

/// @brief True when `s` starts with the four characters "http".
bool util::isUrl(const std::string & s)
{
  return s.compare(0, 4, "http") == 0;
}

/// @brief Splits `remaining` on `delimiter`, dropping empty fields.
std::vector<std::string> util::split(std::string_view remaining, char delimiter)
{
  std::vector<std::string> result;

  for (auto pos = remaining.find(delimiter); pos != std::string_view::npos; pos = remaining.find(delimiter))
  {
    // A delimiter at position 0 means an empty field: skip it.
    if (pos != 0)
      result.emplace_back(remaining.substr(0, pos));
    remaining.remove_prefix(pos + 1);
  }

  if (!remaining.empty())
    result.emplace_back(remaining);

  return result;
}

/// @brief Splits `s` into a sequence of utf8char, one per decoded code point.
///
/// Decoding stops silently at the first position where utf8::next throws; the
/// characters decoded so far are returned.
/// Throws (through myThrow) if a decoded character spans 0 or more than 4 bytes.
util::utf8string util::splitAsUtf8(std::string_view s)
{
  utf8string result;
  const char * beginPtr = s.data();
  const char * currentPtr = beginPtr;
  const char * const endPtr = s.data() + s.size();

  while (currentPtr < endPtr)
  {
    try
    {
      utf8::next(currentPtr, endPtr);
    }
    catch (const std::exception &)
    {
      break;
    }

    const auto nbBytes = currentPtr - beginPtr;
    if (nbBytes > 4 || nbBytes == 0)
      myThrow(fmt::format("Invalid utf8 character at index {}", beginPtr-s.data()));

    utf8char c;
    for (int i = 0; i < nbBytes; i++)
      c[i] = beginPtr[i];

    beginPtr = currentPtr;
    result.push_back(c);
  }

  return result;
}

/// @brief Shortens `s` so that it prints in at most `printedSize` characters.
///
/// Strings that already fit are returned unchanged. A string that std::stof
/// accepts keeps its first `printedSize-1` bytes followed by "…". Any other
/// string keeps characters from both ends with "…" in the middle.
/// A `printedSize` of zero or less yields an empty string.
std::string util::shrink(std::string s, int printedSize)
{
  static const std::string filler = "…";

  if (printedLength(s) <= printedSize)
    return s;

  // Nothing can be printed; also keeps the byte arithmetic below in range.
  if (printedSize <= 0)
    return std::string();

  try
  {
    // Only used as a "does this look like a number" probe; the value is irrelevant.
    static_cast<void>(std::stof(s));
    return fmt::format("{}{}", s.substr(0, printedSize - 1), filler);
  }
  catch (const std::exception &) {}

  const auto splited = splitAsUtf8(s);
  std::string result;
  std::string begin, end;

  // Alternately grow the prefix and the suffix while everything still fits.
  for (int nbLoop = 0; printedLength(begin)+printedLength(end)+printedLength(filler) <= printedSize; ++nbLoop)
  {
    result = begin + filler + end;

    if (nbLoop % 2)
      end = fmt::format("{}{}", splited[splited.size()-1-(nbLoop/2)], end);
    else
      begin = fmt::format("{}{}", begin, splited[nbLoop/2]);
  }

  return result;
}

/// @brief Prints a warning with its source location on stderr.
void util::warning(std::string_view message, const std::experimental::source_location & location)
{
  fmt::print(stderr, "WARNING ({}) : {}\n", location, message);
}

/// @brief Prints an error with its source location on stderr, then exits with status 1.
void util::error(std::string_view message, const std::experimental::source_location & location)
{
  fmt::print(stderr, "ERROR ({}) : {}\n", location, message);
  std::exit(1);
}

/// @brief Same as error(message, location), using the exception's what() as message.
void util::error(const std::exception & e, const std::experimental::source_location & location)
{
  error(e.what(), location);
}

/// @brief Throws std::invalid_argument carrying the source location and `message`.
void util::myThrow(std::string_view message, const std::experimental::source_location & location)
{
  throw std::invalid_argument(fmt::format("from ({}) {}", location, message));
}

/// @brief Formats `number` with a space between groups of three digits (e.g. "1 234 567").
///
/// A leading minus sign is kept attached to the first digit group.
std::string util::int2HumanStr(int number)
{
  const std::string nb = std::to_string(number);
  // The sign character must not be counted as a digit when grouping.
  const std::size_t signLength = number < 0 ? 1 : 0;

  std::string result;
  for (std::size_t i = 0; i < nb.size(); i++)
  {
    result.push_back(nb[i]);

    const std::size_t remaining = nb.size() - i - 1;
    if (i >= signLength && remaining > 0 && remaining % 3 == 0)
      result.push_back(' ');
  }

  return result;
}

/// @brief Calls `f` with the match results when `name` entirely matches `reg`.
/// @return true when there was a match (and `f` was called), false otherwise.
bool util::doIfNameMatch(const std::regex & reg, std::string_view name, const std::function<void(const std::smatch &)> & f)
{
  std::smatch sm;
  const std::string sname(name);

  if (!std::regex_match(sname, sm, reg))
    return false;

  f(sm);
  return true;
}

/// @brief Removes leading spaces/tabs and trailing spaces/tabs/newlines from `s`.
std::string util::strip(const std::string & s)
{
  std::size_t first = 0;
  while (first < s.size() and (s[first] == ' ' or s[first] == '\t'))
    ++first;

  // `end` is one past the last kept character, so a string made only of
  // strippable characters collapses to an empty result.
  std::size_t end = s.size();
  while (end > first and (s[end-1] == ' ' or s[end-1] == '\t' or s[end-1] == '\n'))
    --end;

  return s.substr(first, end - first);
}

/// @brief Lists the regular files directly inside `directory` whose extension equals `extension`.
std::vector<std::filesystem::path> util::findFilesByExtension(std::filesystem::path directory, std::string extension)
{
  std::vector<std::filesystem::path> files;

  for (const auto & entry : std::filesystem::directory_iterator(directory))
  {
    if (!entry.is_regular_file())
      continue;

    const auto & path = entry.path();
    if (path.extension() == extension)
      files.push_back(path);
  }

  return files;
}

/// @brief Current local time formatted as "HH:MM:SS".
std::string util::getTime()
{
  std::time_t rawtime;
  char buffer[80];

  std::time(&rawtime);
  std::strftime(buffer, sizeof(buffer), "%H:%M:%S", std::localtime(&rawtime));

  return std::string(buffer);
}

/// @brief Draws with std::rand and returns true when the draw (modulo 100000)
/// is below `probability` * 100000.
bool util::choiceWithProbability(float probability)
{
  const int maxVal = 100000;
  const int threshold = static_cast<int>(maxVal * probability);

  return (std::rand() % maxVal) < threshold;
}

/// @brief True when `c` is a key of upper2lower.
bool util::isUppercase(utf8char c)
{
  return upper2lower.count(c);
}

/// @brief Returns `s` with every character found in upper2lower replaced by its mapping.
std::string util::lower(const std::string & s)
{
  auto splited = util::splitAsUtf8(s);
  lowerInPlace(splited);

  return fmt::format("{}", splited);
}

/// @brief Replaces, in place, every character of `s` found in upper2lower by its mapping.
void util::lowerInPlace(utf8string & s)
{
  for (auto & c : s)
  {
    auto it = upper2lower.find(c);
    if (it != upper2lower.end())
      c = it->second;
  }
}

/// @brief Copying variant of lowerInPlace(utf8string &).
util::utf8string util::lower(const utf8string & s)
{
  utf8string result = s;
  lowerInPlace(result);

  return result;
}

/// @brief Replaces `c` by its upper2lower mapping when it has one.
void util::lowerInPlace(utf8char & c)
{
  auto it = upper2lower.find(c);
  if (it != upper2lower.end())
    c = it->second;
}

/// @brief Copying variant of lowerInPlace(utf8char &).
util::utf8char util::lower(const utf8char & c)
{
  auto res = c;
  lowerInPlace(res);

  return res;
}

/// @brief Returns `s` with every character found in lower2upper replaced by its mapping.
std::string util::upper(const std::string & s)
{
  auto splited = util::splitAsUtf8(s);
  upper(splited);

  return fmt::format("{}", splited);
}

/// @brief Replaces, in place, every character of `s` found in lower2upper by its mapping.
void util::upper(utf8string & s)
{
  for (auto & c : s)
  {
    auto it = lower2upper.find(c);
    if (it != lower2upper.end())
      c = it->second;
  }
}

/// @brief Copying variant of upper(utf8string &).
util::utf8string util::upper(const utf8string & s)
{
  auto result = s;
  upper(result);

  return result;
}

/// @brief Replaces `c` by its lower2upper mapping when it has one.
void util::upper(utf8char & c)
{
  auto it = lower2upper.find(c);
  if (it != lower2upper.end())
    c = it->second;
}

/// @brief Reads a text file and returns its content as utf8 strings, with
/// newlines and tabs replaced by spaces.
///
/// @param filename   Path of the file to read.
/// @param lineByLine When false, the whole file becomes a single element.
///                   When true, one element per line; lines that end up empty
///                   are skipped.
/// Throws (through myThrow) when the file cannot be opened.
std::vector<util::utf8string> util::readFileAsUtf8(std::string_view filename, bool lineByLine)
{
  // A string_view is not guaranteed to be NUL-terminated, fopen needs a C string.
  const std::string path(filename);
  FilePtr file(std::fopen(path.c_str(), "r"));
  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", filename));

  const auto toLine = [](const std::string & raw)
  {
    auto line = util::splitAsUtf8(raw);
    line.replace(util::utf8char("\n"), util::utf8char(" "));
    line.replace(util::utf8char("\t"), util::utf8char(" "));
    return line;
  };

  std::vector<utf8string> res;
  std::string lineTemp;

  if (!lineByLine)
  {
    // Loop on fgetc's result so that neither EOF nor a read error is stored as a byte.
    for (int c = std::fgetc(file.get()); c != EOF; c = std::fgetc(file.get()))
      lineTemp.push_back(static_cast<char>(c));

    res.emplace_back(toLine(lineTemp));
    return res;
  }

  bool reachedEnd = false;
  while (!reachedEnd)
  {
    lineTemp.clear();

    int c;
    while ((c = std::fgetc(file.get())) != EOF)
    {
      lineTemp.push_back(static_cast<char>(c));
      if (c == '\n')
        break;
    }
    reachedEnd = (c == EOF);

    auto line = toLine(lineTemp);
    if (!line.empty())
      res.emplace_back(line);
  }

  return res;
}

/// @brief Reads a file into blocks of lines ("sentences") separated by blank lines.
///
/// Every line read is stored, newline included, in the current sentence. A
/// blank line met after some input has been read starts a new sentence. Lines
/// matching the "# global.columns = ..." pattern are remembered (the last one
/// wins) and do not count as input; when one was found it is inserted, without
/// its newline, at the front of every returned sentence. A trailing empty
/// sentence is dropped.
/// Throws (through myThrow) when the file cannot be opened.
std::vector<std::vector<std::string>> util::readTSV(std::string_view tsvFilename)
{
  // Built once: constructing a std::regex for every line is needlessly expensive.
  static const std::regex globalColumnsRegex("(?:(?:\\s|\\t)*)#(?:(?:\\s|\\t)*)global.columns(?:(?:\\s|\\t)*)=(?:(?:\\s|\\t)*)(.+)");

  // A string_view is not guaranteed to be NUL-terminated, fopen needs a C string.
  const std::string path(tsvFilename);
  FilePtr file(std::fopen(path.c_str(), "r"));
  if (not file)
    util::myThrow(fmt::format("Cannot open file '{}'", tsvFilename));

  std::vector<std::vector<std::string>> sentences;
  char lineBuffer[100000];
  bool inputHasBeenRead = false;
  std::string mcdLine;

  sentences.emplace_back();

  while (std::fgets(lineBuffer, sizeof(lineBuffer), file.get()) != nullptr)
  {
    std::string line(lineBuffer);
    sentences.back().emplace_back(line);

    if (!line.empty() && line.back() == '\n')
      line.pop_back();

    if (line.empty())
    {
      // Blank lines before any input do not open a new sentence.
      if (inputHasBeenRead)
        sentences.emplace_back();
      continue;
    }

    if (util::doIfNameMatch(globalColumnsRegex, line, [&mcdLine, &line](const auto &) { mcdLine = line; }))
      continue;

    inputHasBeenRead = true;
  }

  if (sentences.back().empty())
    sentences.pop_back();

  if (not mcdLine.empty())
    for (auto & sentence : sentences)
      sentence.insert(sentence.begin(), mcdLine);

  return sentences;
}