Select Git revision
-
Franck Dary authoredFranck Dary authored
Dict.cpp 5.11 KiB
#include "Dict.hpp"
#include "util.hpp"
Dict::Dict(State state)
{
setState(state);
insert(unknownValueStr);
insert(nullValueStr);
insert(emptyValueStr);
}
Dict::Dict(const char * filename, State state)
{
readFromFile(filename);
setState(state);
}
void Dict::readFromFile(const char * filename)
{
std::FILE * file = std::fopen(filename, "r");
if (!file)
util::myThrow(fmt::format("could not open file '{}'", filename));
char buffer[1048];
if (std::fscanf(file, "Encoding : %1047s\n", buffer) != 1)
util::myThrow(fmt::format("file '{}' bad format", filename));
Encoding encoding{Encoding::Ascii};
if (std::string(buffer) == "Ascii")
encoding = Encoding::Ascii;
else if (std::string(buffer) == "Binary")
encoding = Encoding::Binary;
else
util::myThrow(fmt::format("file '{}' bad format", filename));
int nbEntries;
if (std::fscanf(file, "Nb entries : %d\n", &nbEntries) != 1)
util::myThrow(fmt::format("file '{}' bad format", filename));
elementsToIndexes.reserve(nbEntries);
int entryIndex;
int nbOccsEntry;
char entryString[maxEntrySize+1];
for (int i = 0; i < nbEntries; i++)
{
if (!readEntry(file, &entryIndex, &nbOccsEntry, entryString, encoding))
util::myThrow(fmt::format("file '{}' line {} bad format", filename, i));
elementsToIndexes[entryString] = entryIndex;
while ((int)nbOccs.size() <= entryIndex)
nbOccs.emplace_back(0);
nbOccs[entryIndex] = nbOccsEntry;
}
std::fclose(file);
}
void Dict::insert(const std::string & element)
{
if (element.size() > maxEntrySize)
util::myThrow(fmt::format("inserting element of size={} > maxElementSize={}", element.size(), maxEntrySize));
elementsToIndexes.emplace(element, elementsToIndexes.size());
while (nbOccs.size() < elementsToIndexes.size())
nbOccs.emplace_back(0);
}
int Dict::getIndexOrInsert(const std::string & element)
{
if (element.empty())
return getIndexOrInsert(emptyValueStr);
if (element.size() == 1 and util::isSeparator(util::utf8char(element)))
return getIndexOrInsert(separatorValueStr);
const auto & found = elementsToIndexes.find(element);
if (found == elementsToIndexes.end())
{
if (state == State::Open)
{
insert(element);
if (isCountingOccs)
nbOccs[elementsToIndexes[element]]++;
return elementsToIndexes[element];
}
if (isCountingOccs)
nbOccs[elementsToIndexes[unknownValueStr]]++;
return elementsToIndexes[unknownValueStr];
}
if (isCountingOccs)
nbOccs[found->second]++;
return found->second;
}
void Dict::setState(State state)
{
this->state = state;
}
Dict::State Dict::getState() const
{
return state;
}
void Dict::save(std::filesystem::path path, Encoding encoding) const
{
std::FILE * destination = std::fopen(path.c_str(), "w");
if (!destination)
util::myThrow(fmt::format("could not write file '{}'", path.string()));
fprintf(destination, "Encoding : %s\n", encoding == Encoding::Ascii ? "Ascii" : "Binary");
fprintf(destination, "Nb entries : %lu\n", elementsToIndexes.size());
for (auto & it : elementsToIndexes)
printEntry(destination, it.second, it.first, encoding);
std::fclose(destination);
}
bool Dict::readEntry(std::FILE * file, int * index, int * nbOccsEntry, char * entry, Encoding encoding)
{
if (encoding == Encoding::Ascii)
{
static std::string readFormat = "%d\t%d\t%"+std::to_string(maxEntrySize)+"[^\n]\n";
return fscanf(file, readFormat.c_str(), index, nbOccsEntry, entry) == 3;
}
else
{
if (std::fread(index, sizeof *index, 1, file) != 1)
return false;
if (std::fread(nbOccsEntry, sizeof *nbOccsEntry, 1, file) != 1)
return false;
for (unsigned int i = 0; i < maxEntrySize; i++)
{
if (std::fread(entry+i, 1, 1, file) != 1)
return false;
if (!entry[i])
return true;
}
return false;
}
}
void Dict::printEntry(std::FILE * file, int index, const std::string & entry, Encoding encoding) const
{
auto entryNbOccs = getNbOccs(index);
if (encoding == Encoding::Ascii)
{
static std::string printFormat = "%d\t%d\t%s\n";
fprintf(file, printFormat.c_str(), index, entryNbOccs, entry.c_str());
}
else
{
std::fwrite(&index, sizeof index, 1, file);
std::fwrite(&entryNbOccs, sizeof entryNbOccs, 1, file);
std::fwrite(entry.c_str(), 1, entry.size()+1, file);
}
}
void Dict::countOcc(bool isCountingOccs)
{
this->isCountingOccs = isCountingOccs;
}
std::size_t Dict::size() const
{
return elementsToIndexes.size();
}
int Dict::getNbOccs(int index) const
{
if (index < 0 || index >= (int)nbOccs.size())
return 0;
return nbOccs[index];
}
void Dict::removeRareElements()
{
int minNbOcc = std::numeric_limits<int>::max();
for (int nbOcc : nbOccs)
if (nbOcc < minNbOcc)
minNbOcc = nbOcc;
std::unordered_map<std::string, int> newElementsToIndexes;
std::vector<int> newNbOccs;
for (auto & it : elementsToIndexes)
if (nbOccs[it.second] > minNbOcc)
{
newElementsToIndexes.emplace(it.first, newElementsToIndexes.size());
newNbOccs.emplace_back(nbOccs[it.second]);
}
elementsToIndexes = newElementsToIndexes;
nbOccs = newNbOccs;
}