From 3ac38354bcc9e1c0ef1e809da39a2881e3d9f09f Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Sun, 14 Jun 2020 17:12:57 +0200 Subject: [PATCH] Added special tokens for number and url. Dict tries lowercase before outputting unknownValue --- common/include/Dict.hpp | 2 ++ common/include/util.hpp | 3 +++ common/src/Dict.cpp | 17 +++++++++++++++++ common/src/util.cpp | 20 ++++++++++++++++++++ 4 files changed, 42 insertions(+) diff --git a/common/include/Dict.hpp b/common/include/Dict.hpp index 6d3f27a..32571b3 100644 --- a/common/include/Dict.hpp +++ b/common/include/Dict.hpp @@ -19,6 +19,8 @@ class Dict static constexpr char const * nullValueStr = "__nullValue__"; static constexpr char const * emptyValueStr = "__emptyValue__"; static constexpr char const * separatorValueStr = "__separatorValue__"; + static constexpr char const * numberValueStr = "__numberValue__"; + static constexpr char const * urlValueStr = "__urlValue__"; static constexpr std::size_t maxEntrySize = 5000; private : diff --git a/common/include/util.hpp b/common/include/util.hpp index e0fcf2a..90056c7 100644 --- a/common/include/util.hpp +++ b/common/include/util.hpp @@ -43,6 +43,9 @@ bool isIllegal(utf8char c); bool isUppercase(utf8char c); +bool isUrl(const std::string & s); +bool isNumber(const std::string & s); + std::string getTime(); template <typename T> diff --git a/common/src/Dict.cpp b/common/src/Dict.cpp index 12d7bff..cdf09df 100644 --- a/common/src/Dict.cpp +++ b/common/src/Dict.cpp @@ -7,6 +7,8 @@ Dict::Dict(State state) insert(unknownValueStr); insert(nullValueStr); insert(emptyValueStr); + insert(numberValueStr); + insert(urlValueStr); } Dict::Dict(const char * filename, State state) @@ -76,6 +78,12 @@ int Dict::getIndexOrInsert(const std::string & element) if (element.size() == 1 and util::isSeparator(util::utf8char(element))) return getIndexOrInsert(separatorValueStr); + if (util::isNumber(element)) + return getIndexOrInsert(numberValueStr); + + if
(util::isUrl(element)) + return getIndexOrInsert(urlValueStr); + const auto & found = elementsToIndexes.find(element); if (found == elementsToIndexes.end()) @@ -87,6 +95,15 @@ int Dict::getIndexOrInsert(const std::string & element) nbOccs[elementsToIndexes[element]]++; return elementsToIndexes[element]; } + + const auto & found2 = elementsToIndexes.find(util::lower(element)); + if (found2 != elementsToIndexes.end()) + { + if (isCountingOccs) + nbOccs[found2->second]++; + return found2->second; + } + if (isCountingOccs) nbOccs[elementsToIndexes[unknownValueStr]]++; return elementsToIndexes[unknownValueStr]; diff --git a/common/src/util.cpp b/common/src/util.cpp index 5d2be74..dd9a982 100644 --- a/common/src/util.cpp +++ b/common/src/util.cpp @@ -25,6 +25,26 @@ bool util::isIllegal(utf8char c) return c == '\n' || c == '\t'; } /* util::isNumber: splits s with splitAsUtf8 and returns true only when at least one character is one of the digits 0-9 and no character is a key of lower2upper or upper2lower (presumably the letter case tables declared elsewhere in util - TODO confirm). Any other character, such as punctuation, is ignored. A string containing no digit, including the empty string, yields false. */+bool util::isNumber(const std::string & s) +{ + static std::map<utf8char, bool> digits{{"0",true},{"1",true},{"2",true},{"3",true},{"4",true},{"5",true},{"6",true},{"7",true},{"8",true},{"9",true},}; + utf8string asUtf8 = splitAsUtf8(s); + bool hasDigit = false; + + for (auto & c : asUtf8) + if (digits.count(c)) + hasDigit = true; + else if (lower2upper.count(c) or upper2lower.count(c)) /* a character found in either case table rejects the whole string immediately */+ return false; + + return hasDigit; +} + /* util::isUrl: returns true when s holds at least four bytes and its first four bytes are h, t, t, p. Nothing after that prefix is inspected, so every string starting with that prefix is reported as a URL. */+bool util::isUrl(const std::string & s) +{ + return s.size() >= 4 and s[0] == 'h' and s[1] == 't' and s[2] == 't' and s[3] == 'p'; +} + std::vector<std::string> util::split(std::string_view remaining, char delimiter) { std::vector<std::string> result; -- GitLab