Skip to content
Snippets Groups Projects
Commit 3ac38354 authored by Franck Dary's avatar Franck Dary
Browse files

Added special tokens for numbers and URLs. Dict tries the lowercase form before outputting unknownValue

parent ad8e0187
Branches
Tags
No related merge requests found
......@@ -19,6 +19,8 @@ class Dict
static constexpr char const * nullValueStr = "__nullValue__";
static constexpr char const * emptyValueStr = "__emptyValue__";
static constexpr char const * separatorValueStr = "__separatorValue__";
static constexpr char const * numberValueStr = "__numberValue__";
static constexpr char const * urlValueStr = "__urlValue__";
static constexpr std::size_t maxEntrySize = 5000;
private :
......
......@@ -43,6 +43,9 @@ bool isIllegal(utf8char c);
bool isUppercase(utf8char c);
bool isUrl(const std::string & s);
bool isNumber(const std::string & s);
std::string getTime();
template <typename T>
......
......@@ -7,6 +7,8 @@ Dict::Dict(State state)
insert(unknownValueStr);
insert(nullValueStr);
insert(emptyValueStr);
insert(numberValueStr);
insert(urlValueStr);
}
Dict::Dict(const char * filename, State state)
......@@ -76,6 +78,12 @@ int Dict::getIndexOrInsert(const std::string & element)
if (element.size() == 1 and util::isSeparator(util::utf8char(element)))
return getIndexOrInsert(separatorValueStr);
if (util::isNumber(element))
return getIndexOrInsert(numberValueStr);
if (util::isUrl(element))
return getIndexOrInsert(urlValueStr);
const auto & found = elementsToIndexes.find(element);
if (found == elementsToIndexes.end())
......@@ -87,6 +95,15 @@ int Dict::getIndexOrInsert(const std::string & element)
nbOccs[elementsToIndexes[element]]++;
return elementsToIndexes[element];
}
const auto & found2 = elementsToIndexes.find(util::lower(element));
if (found2 != elementsToIndexes.end())
{
if (isCountingOccs)
nbOccs[found2->second]++;
return found2->second;
}
if (isCountingOccs)
nbOccs[elementsToIndexes[unknownValueStr]]++;
return elementsToIndexes[unknownValueStr];
......
......@@ -25,6 +25,26 @@ bool util::isIllegal(utf8char c)
return c == '\n' || c == '\t';
}
bool util::isNumber(const std::string & s)
{
static std::map<utf8char, bool> digits{{"0",true},{"1",true},{"2",true},{"3",true},{"4",true},{"5",true},{"6",true},{"7",true},{"8",true},{"9",true},};
utf8string asUtf8 = splitAsUtf8(s);
bool hasDigit = false;
for (auto & c : asUtf8)
if (digits.count(c))
hasDigit = true;
else if (lower2upper.count(c) or upper2lower.count(c))
return false;
return hasDigit;
}
// Tells whether `s` should be treated as a URL: true exactly when `s` begins
// with the four characters "http" (case-sensitive). Strings shorter than four
// characters never match.
bool util::isUrl(const std::string & s)
{
  static constexpr char const * prefix = "http";
  static constexpr std::size_t prefixSize = 4;

  // compare() clamps the inspected range to the string length, so a shorter
  // string compares unequal to the prefix instead of reading out of bounds.
  return s.compare(0, prefixSize, prefix) == 0;
}
std::vector<std::string> util::split(std::string_view remaining, char delimiter)
{
std::vector<std::string> result;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment