From 5646953f729ab1405d27b88dd3c252db248e7502 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 12 Dec 2019 22:28:27 +0100
Subject: [PATCH] Optimization of Config size and added new types utf8char and utf8string

---
 common/include/util.hpp            | 35 ++++++++++++++-
 common/src/util.cpp                | 68 ++++++++++++++++++++++--------
 dev/src/dev.cpp                    |  4 ++-
 reading_machine/include/Config.hpp |  4 ++-
 reading_machine/src/Config.cpp     | 22 ++++++++++
 5 files changed, 112 insertions(+), 21 deletions(-)

diff --git a/common/include/util.hpp b/common/include/util.hpp
index c6a67c1..75b7b74 100644
--- a/common/include/util.hpp
+++ b/common/include/util.hpp
@@ -16,22 +16,34 @@
 
 #include <string>
 #include <vector>
+#include <array>
 #include <fmt/core.h>
 #include <experimental/source_location>
 #include "utf8.hpp"
 
+
 namespace util
 {
+// One UTF-8 encoded character: up to 4 bytes, unused trailing bytes are 0.
+typedef std::array<char, 4> utf8char;
+// A sequence of fixed-width utf8char elements.
+typedef std::basic_string<utf8char> utf8string;
+
 std::string_view getFilenameFromPath(std::string_view s);
 
-std::vector<std::string_view> splitAsUtf8(std::string_view s);
 std::vector<std::string_view> split(std::string_view s, char delimiter);
 
+// Splits s into its UTF-8 characters; throws std::invalid_argument on invalid UTF-8.
+utf8string splitAsUtf8(std::string_view s);
+
 void warning(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 void error(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 void error(const std::exception & e, const std::experimental::source_location & location = std::experimental::source_location::current());
 void myThrow(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 
+// Decimal representation of number with digits grouped by three (e.g. "1 234 567").
+std::string int2HumanStr(int number);
+
 };
 
 template <>
@@ -46,6 +58,27 @@ struct fmt::formatter<std::experimental::source_location>
   }
 };
 
+// Formats a utf8char as the bytes of the character it holds.
+template <>
+struct fmt::formatter<util::utf8char>
+{
+  constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); }
+
+  template <typename FormatContext>
+  auto format(const util::utf8char & c, FormatContext & ctx) const
+  {
+    // Unused trailing bytes are zero padding, not part of the character.
+    auto out = ctx.out();
+    for (char byte : c)
+    {
+      if (byte == '\0')
+        break;
+      *out++ = byte;
+    }
+    return out;
+  }
+};
+
 std::string_view operator+(std::string_view a, std::string_view b);
 void operator+=(std::string_view & a, std::string_view b);
 
diff --git a/common/src/util.cpp b/common/src/util.cpp
index fe640bc..eed75ff 100644
--- a/common/src/util.cpp
+++ b/common/src/util.cpp
@@ -20,24 +20,6 @@ std::string_view getFilenameFromPath(std::string_view s)
   return {s.data()+indexOfSlash+1, s.size()-1-indexOfSlash};
 }
 
-std::vector<std::string_view> splitAsUtf8(std::string_view s)
-{
-  std::vector<std::string_view> result;
-  const char * beginPtr = s.data();
-  const char * currentPtr = beginPtr;
-  const char * endPtr = s.data()+s.size()-1;
-
-  while (true)
-    try
-    {
-      utf8::next(currentPtr, endPtr);
-      result.emplace_back(beginPtr, currentPtr-beginPtr);
-      beginPtr = currentPtr;
-    } catch (std::exception &) {break;}
-
-  return result;
-}
-
 std::vector<std::string_view> split(std::string_view remaining, char delimiter)
 {
   std::vector<std::string_view> result;
@@ -55,6 +37,35 @@ std::vector<std::string_view> split(std::string_view remaining, char delimiter)
   return result;
 }
 
+// Splits s into its UTF-8 characters, each stored in a fixed-width utf8char
+// (unused trailing bytes are zero). Throws std::invalid_argument, through
+// myThrow, when s is not valid UTF-8.
+utf8string splitAsUtf8(std::string_view s)
+{
+  utf8string result;
+  const char * beginPtr = s.data();
+  const char * currentPtr = beginPtr;
+  // One past the last byte, as expected by the utf8:: iterator functions.
+  const char * endPtr = s.data()+s.size();
+
+  if (!utf8::is_valid(beginPtr, endPtr))
+    myThrow("Not a valid utf8 input");
+
+  while (currentPtr < endPtr)
+  {
+    utf8::next(currentPtr, endPtr);
+    if (currentPtr - beginPtr > 4 || currentPtr - beginPtr == 0)
+      myThrow(fmt::format("Invalid utf8 character at index {}", beginPtr-s.data()));
+    utf8char c = {};
+    for (int i = 0; i < currentPtr - beginPtr; i++)
+      c[i] = beginPtr[i];
+    beginPtr = currentPtr;
+    result.push_back(c);
+  }
+
+  return result;
+}
+
 void warning(std::string_view message, const std::experimental::source_location & location)
 {
   fmt::print(stderr, "WARNING ({}) : {}\n", location, message);
@@ -76,6 +87,27 @@ void myThrow(std::string_view message, const std::experimental::source_location
   throw std::invalid_argument(fmt::format("from ({}) {}", location, message));
 }
 
+// Returns the decimal representation of number with its digits grouped by
+// three, groups being separated by a space (1234567 -> "1 234 567").
+std::string int2HumanStr(int number)
+{
+  std::string nb = std::to_string(number);
+  std::string result;
+
+  // A leading '-' is copied as is and must not count as a digit of a group.
+  std::size_t firstDigit = (!nb.empty() && nb[0] == '-') ? 1 : 0;
+  result.append(nb, 0, firstDigit);
+
+  for (std::size_t i = firstDigit; i < nb.size(); i++)
+  {
+    result.push_back(nb[i]);
+    if (((nb.size()-i-1) % 3 == 0) && i < nb.size()-1)
+      result.push_back(' ');
+  }
+
+  return result;
+}
+
 };
 
 std::string_view operator+(std::string_view a, std::string_view b)
diff --git a/dev/src/dev.cpp b/dev/src/dev.cpp
index 9dc76c0..4863c31 100644
--- a/dev/src/dev.cpp
+++ b/dev/src/dev.cpp
@@ -10,7 +10,9 @@ int main(int argc, char * argv[])
 
   Config config(argv[3], argv[1], argv[2]);
 
-  config.print(stdout);
+  config.printSize(stderr);
+
+  std::scanf("%*c");
 
   return 0;
 }
diff --git a/reading_machine/include/Config.hpp b/reading_machine/include/Config.hpp
index c20859e..8895ef6 100644
--- a/reading_machine/include/Config.hpp
+++ b/reading_machine/include/Config.hpp
@@ -33,7 +33,7 @@ class Config
   std::unordered_map<std::string, int> colName2Index;
 
   std::string rawInput;
-  std::vector<std::string_view> rawInputUtf8;
+  util::utf8string rawInputUtf8;
 
   using ReferenceAndHypotheses = std::vector<std::string>;
   using Line = std::vector<ReferenceAndHypotheses>;
@@ -49,6 +49,8 @@ class Config
 
   Config(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename);
   void print(FILE * dest) const;
+  // Prints to dest an estimate of the memory used by the raw input buffers.
+  void printSize(FILE * dest) const;
 };
 
 #endif
diff --git a/reading_machine/src/Config.cpp b/reading_machine/src/Config.cpp
index e3ee2f4..4c59237 100644
--- a/reading_machine/src/Config.cpp
+++ b/reading_machine/src/Config.cpp
@@ -106,6 +106,28 @@ void Config::readTSVInput(std::string_view tsvFilename)
   std::fclose(file);
 }
 
+// Prints to dest the element counts and an estimate of the memory footprint
+// (in Mo) of rawInput and rawInputUtf8, followed by their total.
+void Config::printSize(FILE * dest) const
+{
+  int rawInputNbElements = rawInput.size();
+  int rawInputSize = sizeof rawInput + rawInput.capacity() * sizeof rawInput[0];
+
+  int rawInputUtf8NbElements = 4*rawInputUtf8.size();
+  int rawInputUtf8Size = sizeof rawInputUtf8 + rawInputUtf8.capacity()* sizeof rawInputUtf8[0];
+
+  int totalSize = rawInputSize + rawInputUtf8Size;
+
+  std::string unit = "Mo";
+  int unitPower = 6;
+  // Builds the literal "0.000001" (10^-unitPower) to convert bytes to the unit.
+  float unitMultiplier = std::stof(fmt::format("0.{:0^{}}1","",unitPower-1));
+
+  fmt::print(dest, "{:<43} : {:<{}.2f} {}\n", fmt::format("{:<20} {:>12} elements", "rawInput", util::int2HumanStr(rawInputNbElements)), unitMultiplier*rawInputSize, 2+11-unitPower, unit);
+  fmt::print(dest, "{:<43} : {:<{}.2f} {}\n", fmt::format("{:<20} {:>12} elements", "rawInputUtf8", util::int2HumanStr(rawInputUtf8NbElements)), unitMultiplier*rawInputUtf8Size, 2+11-unitPower, unit);
+  fmt::print(dest, "{:<43} : {:<{}.2f} {}\n", "Total", unitMultiplier*totalSize, 2+11-unitPower, unit);
+}
+
 Config::Config(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename)
 {
   if (tsvFilename.empty() and rawFilename.empty())
-- 
GitLab