From 3ac38354bcc9e1c0ef1e809da39a2881e3d9f09f Mon Sep 17 00:00:00 2001
From: Franck Dary <franck.dary@lis-lab.fr>
Date: Sun, 14 Jun 2020 17:12:57 +0200
Subject: [PATCH] Added special tokens for number and url. Dict tries lowercase
 before outputting unknownValue

---
 common/include/Dict.hpp |  2 ++
 common/include/util.hpp |  3 +++
 common/src/Dict.cpp     | 17 +++++++++++++++++
 common/src/util.cpp     | 20 ++++++++++++++++++++
 4 files changed, 42 insertions(+)

diff --git a/common/include/Dict.hpp b/common/include/Dict.hpp
index 6d3f27a..32571b3 100644
--- a/common/include/Dict.hpp
+++ b/common/include/Dict.hpp
@@ -19,6 +19,8 @@ class Dict
   static constexpr char const * nullValueStr = "__nullValue__";
   static constexpr char const * emptyValueStr = "__emptyValue__";
   static constexpr char const * separatorValueStr = "__separatorValue__";
+  static constexpr char const * numberValueStr = "__numberValue__";
+  static constexpr char const * urlValueStr = "__urlValue__";
   static constexpr std::size_t maxEntrySize = 5000;
 
   private :
diff --git a/common/include/util.hpp b/common/include/util.hpp
index e0fcf2a..90056c7 100644
--- a/common/include/util.hpp
+++ b/common/include/util.hpp
@@ -43,6 +43,9 @@ bool isIllegal(utf8char c);
 
 bool isUppercase(utf8char c);
 
+bool isUrl(const std::string & s);
+bool isNumber(const std::string & s);
+
 std::string getTime();
 
 template <typename T>
diff --git a/common/src/Dict.cpp b/common/src/Dict.cpp
index 12d7bff..cdf09df 100644
--- a/common/src/Dict.cpp
+++ b/common/src/Dict.cpp
@@ -7,6 +7,8 @@ Dict::Dict(State state)
   insert(unknownValueStr);
   insert(nullValueStr);
   insert(emptyValueStr);
+  insert(numberValueStr);
+  insert(urlValueStr);
 }
 
 Dict::Dict(const char * filename, State state)
@@ -76,6 +78,12 @@ int Dict::getIndexOrInsert(const std::string & element)
   if (element.size() == 1 and util::isSeparator(util::utf8char(element)))
     return getIndexOrInsert(separatorValueStr);
 
+  if (util::isNumber(element))
+    return getIndexOrInsert(numberValueStr);
+
+  if (util::isUrl(element))
+    return getIndexOrInsert(urlValueStr);
+
   const auto & found = elementsToIndexes.find(element);
 
   if (found == elementsToIndexes.end())
@@ -87,6 +95,15 @@ int Dict::getIndexOrInsert(const std::string & element)
         nbOccs[elementsToIndexes[element]]++;
       return elementsToIndexes[element];
     }
+
+    const auto & found2 = elementsToIndexes.find(util::lower(element));
+    if (found2 != elementsToIndexes.end())
+    {
+      if (isCountingOccs)
+        nbOccs[found2->second]++;
+      return found2->second;   
+    }
+
     if (isCountingOccs)
       nbOccs[elementsToIndexes[unknownValueStr]]++;
     return elementsToIndexes[unknownValueStr];
diff --git a/common/src/util.cpp b/common/src/util.cpp
index 5d2be74..dd9a982 100644
--- a/common/src/util.cpp
+++ b/common/src/util.cpp
@@ -25,6 +25,26 @@ bool util::isIllegal(utf8char c)
   return c == '\n' || c == '\t';
 }
 
+bool util::isNumber(const std::string & s) // true iff s has an ASCII digit and no symbol listed in lower2upper/upper2lower
+{
+  static std::map<utf8char, bool> asciiDigits{{"0",true},{"1",true},{"2",true},{"3",true},{"4",true},{"5",true},{"6",true},{"7",true},{"8",true},{"9",true},}; // only the keys are consulted
+  utf8string symbols = splitAsUtf8(s);
+  bool sawDigit = false; // becomes true once any ASCII digit has been met
+  for (auto & symbol : symbols)
+  {
+    const bool isDigit = asciiDigits.find(symbol) != asciiDigits.end();
+    if (not isDigit and (lower2upper.count(symbol) or upper2lower.count(symbol)))
+      return false; // a non-digit symbol with a case mapping disqualifies the whole token
+    sawDigit = sawDigit or isDigit;
+  }
+  return sawDigit; // an empty or digit-free token is not a number
+}
+
+bool util::isUrl(const std::string & s) // true iff s starts with the four bytes "http"
+{
+  return s.compare(0, 4, "http") == 0; // compares at most the first 4 bytes, so a shorter s never matches
+}
+
 std::vector<std::string> util::split(std::string_view remaining, char delimiter)
 {
   std::vector<std::string> result;
-- 
GitLab