From 5646953f729ab1405d27b88dd3c252db248e7502 Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Thu, 12 Dec 2019 22:28:27 +0100
Subject: [PATCH] Optimization of Config size and added new types utf8char and
 utf8string

---
 common/include/util.hpp            | 22 +++++++++++-
 common/src/util.cpp                | 58 ++++++++++++++++++++----------
 dev/src/dev.cpp                    |  4 ++-
 reading_machine/include/Config.hpp |  3 +-
 reading_machine/src/Config.cpp     | 19 ++++++++++
 5 files changed, 85 insertions(+), 21 deletions(-)

diff --git a/common/include/util.hpp b/common/include/util.hpp
index c6a67c1..75b7b74 100644
--- a/common/include/util.hpp
+++ b/common/include/util.hpp
@@ -16,22 +16,29 @@
 
 #include <string>
 #include <vector>
+#include <array>
 #include <fmt/core.h>
 #include <experimental/source_location>
 #include "utf8.hpp"
 
+
 namespace util
 {
+typedef std::array<char, 4> utf8char; // Bytes of one UTF-8 encoded character, zero-padded to 4 bytes.
+typedef std::basic_string<utf8char> utf8string; // Sequence of utf8char. NOTE(review): needs a std::char_traits<utf8char>, which the standard does not provide — confirm the target toolchains do.
+
 std::string_view getFilenameFromPath(std::string_view s);
 
-std::vector<std::string_view> splitAsUtf8(std::string_view s);
 std::vector<std::string_view> split(std::string_view s, char delimiter);
+utf8string splitAsUtf8(std::string_view s);
 
 void warning(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 void error(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 void error(const std::exception & e, const std::experimental::source_location & location = std::experimental::source_location::current());
 void myThrow(std::string_view message, const std::experimental::source_location & location = std::experimental::source_location::current());
 
+std::string int2HumanStr(int number);
+
 };
 
 template <>
@@ -46,6 +53,19 @@ struct fmt::formatter<std::experimental::source_location>
   }
 };
 
+// Lets fmt print a util::utf8char, i.e. the bytes of one UTF-8 character.
+template <> struct fmt::formatter<util::utf8char>
+{
+  constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); }
+  template <typename FormatContext> // Writes c without its '\0' padding bytes.
+  auto format(const util::utf8char & c, FormatContext & ctx) const
+  {
+    auto out = ctx.out();
+    for (std::size_t i = 0; i < c.size() && c[i] != '\0'; ++i) *out++ = c[i];
+    return out;
+  }
+};
+
 std::string_view operator+(std::string_view a, std::string_view b);
 void operator+=(std::string_view & a, std::string_view b);
 
diff --git a/common/src/util.cpp b/common/src/util.cpp
index fe640bc..eed75ff 100644
--- a/common/src/util.cpp
+++ b/common/src/util.cpp
@@ -20,24 +20,6 @@ std::string_view getFilenameFromPath(std::string_view s)
   return {s.data()+indexOfSlash+1, s.size()-1-indexOfSlash};
 }
 
-std::vector<std::string_view> splitAsUtf8(std::string_view s)
-{
-  std::vector<std::string_view> result;
-  const char * beginPtr = s.data();
-  const char * currentPtr = beginPtr;
-  const char * endPtr = s.data()+s.size()-1;
-
-  while (true)
-    try
-    {
-      utf8::next(currentPtr, endPtr);
-      result.emplace_back(beginPtr, currentPtr-beginPtr);
-      beginPtr = currentPtr;
-    } catch (std::exception &) {break;}
-
-  return result;
-}
-
 std::vector<std::string_view> split(std::string_view remaining, char delimiter)
 {
   std::vector<std::string_view> result;
@@ -55,6 +37,31 @@ std::vector<std::string_view> split(std::string_view remaining, char delimiter)
   return result;
 }
 
+// Splits valid UTF-8 text into one utf8char per character: the 1 to 4 bytes
+// encoding it, zero-padded. Invalid input is reported through myThrow.
+utf8string splitAsUtf8(std::string_view s)
+{
+  utf8string result;
+  const char * beginPtr = s.data();
+  const char * currentPtr = beginPtr;
+  const char * endPtr = s.data()+s.size(); // One past the last byte of s.
+  if (!utf8::is_valid(beginPtr, endPtr))
+    myThrow("Not a valid utf8 input");
+
+  while (currentPtr < endPtr)
+  {
+    utf8::next(currentPtr, endPtr); // Moves currentPtr past one character.
+    if (currentPtr - beginPtr > 4 || currentPtr - beginPtr == 0)
+      myThrow(fmt::format("Invalid utf8 character at index {}", beginPtr-s.data()));
+    utf8char c = {};
+    for (int i = 0; i < currentPtr - beginPtr; i++)
+      c[i] = beginPtr[i];
+    beginPtr = currentPtr;
+    result.push_back(c);
+  }
+  return result;
+}
+
 void warning(std::string_view message, const std::experimental::source_location & location)
 {
   fmt::print(stderr, "WARNING ({}) : {}\n", location, message);
@@ -76,6 +83,21 @@ void myThrow(std::string_view message, const std::experimental::source_location
   throw std::invalid_argument(fmt::format("from ({}) {}", location, message));
 }
 
+// Formats number in decimal with a space between groups of three digits
+// (e.g. 1234567 -> "1 234 567"); a leading '-' is never split from the digits.
+std::string int2HumanStr(int number)
+{
+  const std::string nb = std::to_string(number);
+  std::string result;
+  for (std::size_t i = 0; i < nb.size(); i++)
+  {
+    result.push_back(nb[i]);
+    if (nb[i] != '-' && (nb.size()-i-1) % 3 == 0 && i+1 < nb.size())
+      result.push_back(' ');
+  }
+  return result;
+}
+
 };
 
 std::string_view operator+(std::string_view a, std::string_view b)
diff --git a/dev/src/dev.cpp b/dev/src/dev.cpp
index 9dc76c0..4863c31 100644
--- a/dev/src/dev.cpp
+++ b/dev/src/dev.cpp
@@ -10,7 +10,9 @@ int main(int argc, char * argv[])
 
   Config config(argv[3], argv[1], argv[2]);
 
-  config.print(stdout);
+  config.printSize(stderr);
+
+  std::scanf("%*c");
 
   return 0;
 }
diff --git a/reading_machine/include/Config.hpp b/reading_machine/include/Config.hpp
index c20859e..8895ef6 100644
--- a/reading_machine/include/Config.hpp
+++ b/reading_machine/include/Config.hpp
@@ -33,7 +33,7 @@ class Config
   std::unordered_map<std::string, int> colName2Index;
 
   std::string rawInput;
-  std::vector<std::string_view> rawInputUtf8;
+  util::utf8string rawInputUtf8;
 
   using ReferenceAndHypotheses = std::vector<std::string>;
   using Line = std::vector<ReferenceAndHypotheses>;
@@ -49,6 +49,7 @@ class Config
 
   Config(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename);
   void print(FILE * dest) const;
+  void printSize(FILE * dest);
 };
 
 #endif
diff --git a/reading_machine/src/Config.cpp b/reading_machine/src/Config.cpp
index e3ee2f4..4c59237 100644
--- a/reading_machine/src/Config.cpp
+++ b/reading_machine/src/Config.cpp
@@ -106,6 +106,25 @@ void Config::readTSVInput(std::string_view tsvFilename)
   std::fclose(file);
 }
 
+// Prints to dest the element count and approximate memory footprint (object
+// plus allocated capacity) of rawInput and rawInputUtf8, then the total.
+void Config::printSize(FILE * dest)
+{
+  const int rawInputNbElements = rawInput.size();
+  const int rawInputUtf8NbElements = 4*rawInputUtf8.size();
+  // Byte counts are kept as std::size_t so that large inputs cannot overflow an int.
+  const std::size_t rawInputSize = sizeof rawInput + rawInput.capacity() * sizeof rawInput[0];
+  const std::size_t rawInputUtf8Size = sizeof rawInputUtf8 + rawInputUtf8.capacity() * sizeof rawInputUtf8[0];
+  const std::size_t totalSize = rawInputSize + rawInputUtf8Size;
+  const std::string unit = "Mo";
+  const int unitPower = 6;
+  double bytesPerUnit = 1.0; // 10^unitPower, computed arithmetically instead of parsed from a string.
+  for (int i = 0; i < unitPower; i++) bytesPerUnit *= 10.0;
+  fmt::print(dest, "{:<43} : {:<{}.2f} {}\n", fmt::format("{:<20} {:>12} elements", "rawInput", util::int2HumanStr(rawInputNbElements)), rawInputSize/bytesPerUnit, 2+11-unitPower, unit);
+  fmt::print(dest, "{:<43} : {:<{}.2f} {}\n", fmt::format("{:<20} {:>12} elements", "rawInputUtf8", util::int2HumanStr(rawInputUtf8NbElements)), rawInputUtf8Size/bytesPerUnit, 2+11-unitPower, unit);
+  fmt::print(dest, "{:<43} : {:<{}.2f} {}\n", "Total", totalSize/bytesPerUnit, 2+11-unitPower, unit);
+}
+
 Config::Config(std::string_view mcdFilename, std::string_view tsvFilename, std::string_view rawFilename)
 {
   if (tsvFilename.empty() and rawFilename.empty())
-- 
GitLab