Skip to content
Snippets Groups Projects
Commit 3ac38354 authored by Franck Dary's avatar Franck Dary
Browse files

Added special tokens for number and url. Dict tries lowercase before outputting unknownValue

parent ad8e0187
No related branches found
No related tags found
No related merge requests found
...@@ -19,6 +19,8 @@ class Dict ...@@ -19,6 +19,8 @@ class Dict
static constexpr char const * nullValueStr = "__nullValue__"; static constexpr char const * nullValueStr = "__nullValue__";
static constexpr char const * emptyValueStr = "__emptyValue__"; static constexpr char const * emptyValueStr = "__emptyValue__";
static constexpr char const * separatorValueStr = "__separatorValue__"; static constexpr char const * separatorValueStr = "__separatorValue__";
static constexpr char const * numberValueStr = "__numberValue__";
static constexpr char const * urlValueStr = "__urlValue__";
static constexpr std::size_t maxEntrySize = 5000; static constexpr std::size_t maxEntrySize = 5000;
private : private :
......
...@@ -43,6 +43,9 @@ bool isIllegal(utf8char c); ...@@ -43,6 +43,9 @@ bool isIllegal(utf8char c);
bool isUppercase(utf8char c); bool isUppercase(utf8char c);
// Returns true when s is at least 4 bytes long and begins with the bytes "http".
bool isUrl(const std::string & s);
// Returns true when s, split into UTF-8 characters, contains at least one of
// the characters '0'..'9' and no character that is a key of the lower2upper
// or upper2lower tables.
bool isNumber(const std::string & s);
std::string getTime(); std::string getTime();
template <typename T> template <typename T>
......
...@@ -7,6 +7,8 @@ Dict::Dict(State state) ...@@ -7,6 +7,8 @@ Dict::Dict(State state)
insert(unknownValueStr); insert(unknownValueStr);
insert(nullValueStr); insert(nullValueStr);
insert(emptyValueStr); insert(emptyValueStr);
insert(numberValueStr);
insert(urlValueStr);
} }
Dict::Dict(const char * filename, State state) Dict::Dict(const char * filename, State state)
...@@ -76,6 +78,12 @@ int Dict::getIndexOrInsert(const std::string & element) ...@@ -76,6 +78,12 @@ int Dict::getIndexOrInsert(const std::string & element)
if (element.size() == 1 and util::isSeparator(util::utf8char(element))) if (element.size() == 1 and util::isSeparator(util::utf8char(element)))
return getIndexOrInsert(separatorValueStr); return getIndexOrInsert(separatorValueStr);
if (util::isNumber(element))
return getIndexOrInsert(numberValueStr);
if (util::isUrl(element))
return getIndexOrInsert(urlValueStr);
const auto & found = elementsToIndexes.find(element); const auto & found = elementsToIndexes.find(element);
if (found == elementsToIndexes.end()) if (found == elementsToIndexes.end())
...@@ -87,6 +95,15 @@ int Dict::getIndexOrInsert(const std::string & element) ...@@ -87,6 +95,15 @@ int Dict::getIndexOrInsert(const std::string & element)
nbOccs[elementsToIndexes[element]]++; nbOccs[elementsToIndexes[element]]++;
return elementsToIndexes[element]; return elementsToIndexes[element];
} }
const auto & found2 = elementsToIndexes.find(util::lower(element));
if (found2 != elementsToIndexes.end())
{
if (isCountingOccs)
nbOccs[found2->second]++;
return found2->second;
}
if (isCountingOccs) if (isCountingOccs)
nbOccs[elementsToIndexes[unknownValueStr]]++; nbOccs[elementsToIndexes[unknownValueStr]]++;
return elementsToIndexes[unknownValueStr]; return elementsToIndexes[unknownValueStr];
......
...@@ -25,6 +25,26 @@ bool util::isIllegal(utf8char c) ...@@ -25,6 +25,26 @@ bool util::isIllegal(utf8char c)
return c == '\n' || c == '\t'; return c == '\n' || c == '\t';
} }
bool util::isNumber(const std::string & s)
{
static std::map<utf8char, bool> digits{{"0",true},{"1",true},{"2",true},{"3",true},{"4",true},{"5",true},{"6",true},{"7",true},{"8",true},{"9",true},};
utf8string asUtf8 = splitAsUtf8(s);
bool hasDigit = false;
for (auto & c : asUtf8)
if (digits.count(c))
hasDigit = true;
else if (lower2upper.count(c) or upper2lower.count(c))
return false;
return hasDigit;
}
// Tells whether a token should be treated as a URL.
//
// The check is a plain prefix test: the result is true exactly when s is at
// least 4 bytes long and its first 4 bytes are "http". Nothing after the
// prefix is inspected.
bool util::isUrl(const std::string & s)
{
  constexpr std::size_t prefixLength = 4;

  // compare() clamps the substring to the string's length, so a string shorter
  // than the prefix compares unequal instead of reading out of bounds.
  return s.compare(0, prefixLength, "http") == 0;
}
std::vector<std::string> util::split(std::string_view remaining, char delimiter) std::vector<std::string> util::split(std::string_view remaining, char delimiter)
{ {
std::vector<std::string> result; std::vector<std::string> result;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment