// Config.cpp (14.03 KiB) - authored by Franck Dary.
#include "Config.hpp"
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <random>
#include <utility>
#include "File.hpp"
#include "ProgramParameters.hpp"
#include "Action.hpp"
#include "ProgramOutput.hpp"
#include "utf8.hpp"
/// @brief Build an empty configuration for the given description and input file.
///
/// One tape is created per line reported by bd, using bd.getNameOfLine(i) as
/// its name and bd.lineIsKnown(i) as its known flag. No input is read here.
///
/// @param bd Object used to initialise the bd member and to describe the tapes.
/// @param inputFilename Name of the input file; only stored by this constructor.
Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(HISTORY_SIZE), pastActions(HISTORY_SIZE)
{
  // Scalar state : nothing printed, nothing popped, nothing read yet.
  outputFile = nullptr;
  lastIndexPrinted = -1;
  stackHistory = -1;
  head = 0;
  totalEntropy = 0;
  inputAllRead = false;
  this->inputFilename = inputFilename;

  for (int line = 0; line < bd.getNbLines(); ++line)
    tapes.emplace_back(bd.getNameOfLine(line), bd.lineIsKnown(line));
}
/// @brief Copy constructor.
///
/// Copies the state name, the action and entropy histories, the stack, the
/// head, the output file pointer, the tapes, the entropy and the input
/// bookkeeping of other. The File object is duplicated (not shared) when
/// other has one.
///
/// NOTE(review): actionsHistory is not copied here - confirm this is intended.
///
/// @param other Configuration to copy from.
Config::Config(const Config & other) : bd(other.bd), hashHistory(other.hashHistory), pastActions(other.pastActions)
{
  this->currentStateName = other.currentStateName;
  this->actionHistory = other.actionHistory;
  this->entropyHistory = other.entropyHistory;
  this->stack = other.stack;
  this->stackHistory = other.stackHistory;
  this->head = other.head;
  this->outputFile = other.outputFile;
  this->lastIndexPrinted = other.lastIndexPrinted;
  this->tapes = other.tapes;
  this->totalEntropy = other.totalEntropy;
  this->inputFilename = other.inputFilename;
  this->inputAllRead = other.inputAllRead;

  // other.file stays empty until readInput() opens the input (and is emptied
  // again by reset()), so it must not be dereferenced unconditionally.
  if (other.file.get())
    this->file.reset(new File(*other.file.get()));
}
/// @brief Build an empty tape.
///
/// Both buffers are constructed with ProgramParameters::readSize*4+1 as first
/// argument; ref receives Dict::unknownValueStr and hyp receives the pair
/// (Dict::unknownValueStr, 0.0) as second argument.
///
/// @param name Name of the tape, as returned by getName().
/// @param isKnown Stored flag; operator[] reads ref when it is true, hyp otherwise.
Config::Tape::Tape(const std::string & name, bool isKnown) : ref(ProgramParameters::readSize*4+1, Dict::unknownValueStr), hyp(ProgramParameters::readSize*4+1, std::make_pair(Dict::unknownValueStr, 0.0))
{
  this->name = name;
  this->isKnown = isKnown;
  head = 0;
  totalEntropy = 0.0;
}
/// @brief Tell whether a tape of the given name exists.
/// @param name Name to look up.
/// @return The answer of bd.hasLineOfName(name).
bool Config::hasTape(const std::string & name)
{
  const bool found = bd.hasLineOfName(name);
  return found;
}
/// @brief Access a tape by name.
/// @param name Name of the tape; translated to an index by bd.getLineOfName().
/// @return Reference to the matching element of tapes (no bounds check is done here).
Config::Tape & Config::getTape(const std::string & name)
{
  const auto lineIndex = bd.getLineOfName(name);
  return tapes[lineIndex];
}
/// @brief Access the tape fed by a given input column.
/// @param col Input column; translated to an index by bd.getLineOfInputCol().
/// @return Reference to the matching element of tapes (no bounds check is done here).
Config::Tape & Config::getTapeByInputCol(int col)
{
  const auto lineIndex = bd.getLineOfInputCol(col);
  return tapes[lineIndex];
}
void Config::readInput()
{
if (inputAllRead)
return;
if (!file.get())
file.reset(new File(inputFilename, "r"));
FILE * fd = file->getDescriptor();
char buffer[100000];
std::vector<std::string> cols;
unsigned int usualColsSize = 0;
int toRead = ProgramParameters::readSize;
int haveRead = 0;
while(haveRead < toRead && fscanf(fd, "%[^\n]\n", buffer) == 1)
{
if (!utf8::is_valid(buffer, buffer+std::strlen(buffer)))
{
fprintf(stderr, "ERROR (%s) : input (%s) line %d is not toally utf-8 formated. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size());
exit(1);
}
cols = split(buffer, '\t');
if (!usualColsSize)
usualColsSize = cols.size();
if (cols.size() != usualColsSize)
{
fprintf(stderr, "ERROR (%s) : input (%s) line %d has %lu columns instead of %u. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), cols.size(), usualColsSize);
exit(1);
}
printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex());
for(unsigned int i = 0; i < cols.size(); i++)
if(bd.hasLineOfInputCol(i))
{
auto & tape = getTapeByInputCol(i);
tape.addToRef(cols[i]);
tape.addToHyp("");
if (tape.getName() == ProgramParameters::tapeToMask)
if (choiceWithProbability(ProgramParameters::maskRate))
tape.maskIndex(tape.refSize()-1);
}
haveRead++;
}
// Making all tapes the same size
int maxTapeSize = 0;
for(auto & tape : tapes)
maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.refSize());
if (haveRead < toRead || tapes[0].size() == ProgramParameters::tapeSize)
{
printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex());
inputAllRead = true;
}
for(auto & tape : tapes)
{
while(tape.refSize() < maxTapeSize)
tape.addToRef("");
while(tape.hypSize() < maxTapeSize)
tape.addToHyp("");
if (inputAllRead)
{
tape.addToRef("0");
tape.addToHyp("");
}
}
}
/// @brief Print a human readable view of the configuration around the head.
///
/// The view is a table with one row per tape, covering the cells from
/// head-5 (clamped to 0) up to, but excluding, min(hypSize, head+5). Two
/// header rows (cell index, or "head", and a head marker) are produced while
/// visiting the first tape. The total entropy and the stack follow, framed by
/// two 80-dash rulers.
///
/// @param output Stream to print to.
void Config::printForDebug(FILE * output)
{
  const int window = 5;
  const int firstIndex = std::max(0, head-window);
  const std::string ruler(80, '-');

  // cols[0] holds the row titles; the two header rows have an empty title.
  std::vector< std::vector<std::string> > cols(1);
  cols[0].emplace_back();
  cols[0].emplace_back();

  for (auto & tape : tapes)
  {
    cols[0].emplace_back(tape.getName());
    const bool isFirstTape = (&tape == &tapes[0]);

    for (int i = firstIndex; i < std::min(tape.hypSize(), head+window); i++)
    {
      const unsigned int colIndex = i - firstIndex + 1;
      if (cols.size() <= colIndex)
        cols.resize(colIndex+1);

      auto & column = cols[colIndex];

      if (isFirstTape)
      {
        if (i == head)
        {
          column.emplace_back("head");
          column.emplace_back(" || ");
        }
        else
        {
          column.emplace_back(std::to_string(i));
          column.emplace_back("");
        }
      }

      column.emplace_back(shrinkString(tape[i-head], 10, ".."));
    }
  }

  fprintf(output, "Configuration : %.2f entropy\n", totalEntropy);
  fprintf(output, "%s\n", ruler.c_str());

  printColumns(output, cols, 3);

  fprintf(output, "Stack : ");
  for (int element : stack)
    fprintf(output, "%d ", element);
  fprintf(output, "\n");

  fprintf(output, "%s\n", ruler.c_str());
}
/// @brief Unsupported operation : always reports an error on stderr and exits with status 1.
void Config::printAsExample(FILE *)
{
  fprintf(stderr, "ERROR (%s) : not supported. Aborting.\n", ERRINFO);

  exit(1);
}
/// @brief Send one cell of every printable tape to the program output.
///
/// Nothing happens when dataIndex is -1 or when output is null. Otherwise the
/// index is remembered in lastIndexPrinted and, for every tape that
/// bd.mustPrintLine() selects, the cell content (or "0" when it is empty)
/// and its entropy are collected and handed to ProgramOutput::instance.
///
/// @param output Only tested against null in this function.
/// @param dataIndex Index of the cell to print; -1 means nothing to print.
/// @param realIndex Forwarded as is to ProgramOutput::instance.addLine().
void Config::printAsOutput(FILE * output, int dataIndex, int realIndex)
{
  if (!output || dataIndex == -1)
    return;

  lastIndexPrinted = dataIndex;

  // Tapes are addressed relatively to the head.
  const int relativeIndex = dataIndex-head;

  std::vector< std::pair<std::string, float> > toPrint;
  for (unsigned int j = 0; j < tapes.size(); j++)
  {
    if (!bd.mustPrintLine(j))
      continue;

    const std::string & value = tapes[j][relativeIndex];
    toPrint.emplace_back(value.empty() ? "0" : value.c_str(), tapes[j].getEntropy(relativeIndex));
  }

  ProgramOutput::instance.addLine(toPrint, realIndex);
}
/// @brief Move the head of the configuration, and of every tape, by mvt cells.
///
/// The move is only applied when head + mvt stays below tapes[0].size().
/// When the move is refused and endOfTapes() is false, the program aborts;
/// when it is refused and endOfTapes() is true, nothing happens.
///
/// @param mvt Number of cells to move by.
void Config::moveHead(int mvt)
{
if (head + mvt < tapes[0].size())
{
head += mvt;
// Keep each tape's own head in step with the configuration head.
for (auto & tape : tapes)
tape.moveHead(mvt);
// After a forward move, read more input each time the head lands on a
// multiple of readSize, but not before it has reached 3*readSize.
if (mvt > 0 && head % ProgramParameters::readSize == 0 && head >= (3*ProgramParameters::readSize))
readInput();
}
else if (!endOfTapes())
{
fprintf(stderr, "ERROR (%s) : Input has not been read completely, yet the head is already at the end of tapes. Aborting.\n", ERRINFO);
exit(1);
}
}
/// @brief Tell whether the configuration is terminal.
/// @return true when endOfTapes() holds and the stack is empty.
bool Config::isFinal()
{
  if (!endOfTapes())
    return false;

  return stack.empty();
}
/// @brief Put the configuration back at the start of the input.
///
/// Clears the tapes, the action and hash histories and the stack, rewinds the
/// head, then reads input again until tapes[0] holds at least
/// ProgramParameters::readSize*4 cells or the input is exhausted.
///
/// NOTE(review): entropyHistory, totalEntropy, lastIndexPrinted and
/// currentStateName are not touched here - confirm this is intended.
void Config::reset()
{
for (auto & tape : tapes)
tape.clear();
actionHistory.clear();
pastActions.clear();
hashHistory.clear();
actionsHistory.clear();
stack.clear();
stackHistory = -1;
inputAllRead = false;
head = 0;
// Dropping the File makes the next readInput() create a new one for inputFilename.
file.reset();
while (tapes[0].size() < ProgramParameters::readSize*4 && !inputAllRead)
readInput();
}
/// @brief Read a cell of the tape, relatively to its head.
/// @param relativeIndex Offset from the head of the tape.
/// @return The ref value when the tape is known, the hyp value otherwise.
const std::string & Config::Tape::operator[](int relativeIndex)
{
  return isKnown ? getRef(relativeIndex) : getHyp(relativeIndex);
}
/// @brief Entropy stored along with a hyp cell.
/// @param relativeIndex Offset from the head of the tape.
/// @return 0.0 for a known tape, otherwise the second member of the hyp cell.
float Config::Tape::getEntropy(int relativeIndex)
{
  if (!isKnown)
    return hyp.get(head + relativeIndex).second;

  return 0.0;
}
/// @brief Read a ref cell, relatively to the head of the tape.
/// @param relativeIndex Offset from the head of the tape.
/// @return The element ref.get() yields for head + relativeIndex.
const std::string & Config::Tape::getRef(int relativeIndex)
{
  const auto absoluteIndex = head + relativeIndex;
  return ref.get(absoluteIndex);
}
/// @brief Read a hyp cell, relatively to the head of the tape.
/// @param relativeIndex Offset from the head of the tape.
/// @return The first member of the pair hyp.get() yields for head + relativeIndex.
const std::string & Config::Tape::getHyp(int relativeIndex)
{
  const auto absoluteIndex = head + relativeIndex;
  return hyp.get(absoluteIndex).first;
}
/// @brief Write a hyp cell, relatively to the head of the tape.
///
/// The value is stored together with the current totalEntropy of the tape.
///
/// @param relativeIndex Offset from the head of the tape.
/// @param elem Value to store.
void Config::Tape::setHyp(int relativeIndex, const std::string & elem)
{
  const std::pair<std::string,float> cell(elem, totalEntropy);
  hyp.set(head + relativeIndex, cell);
}
/// @brief Name of the current state.
///
/// Aborts the program (exit status 1) when no state name has been set.
///
/// @return Reference to the stored, non-empty, state name.
std::string & Config::getCurrentStateName()
{
  if (!currentStateName.empty())
    return currentStateName;

  fprintf(stderr, "ERROR (%s) : currentStateName is empty. Aborting.\n", ERRINFO);
  exit(1);
}
void Config::setCurrentStateName(const std::string & name)
{
this->currentStateName = name;
}
/// @brief Action history of the current state, created on first access.
///
/// Aborts through getCurrentStateName() when no state name has been set.
///
/// @return Reference to the history stored in actionHistory for the current state.
LimitedStack<std::string> & Config::getCurrentStateHistory()
{
  const std::string & state = getCurrentStateName();

  auto found = actionHistory.find(state);
  if (found == actionHistory.end())
    found = actionHistory.emplace(state, HISTORY_SIZE).first;

  return found->second;
}
/// @brief Action history of a given state, created on first access.
/// @param state Name of the state.
/// @return Reference to the history stored in actionHistory for that state.
LimitedStack<std::string> & Config::getStateHistory(const std::string & state)
{
  auto found = actionHistory.find(state);

  // First request for this state : build its history with HISTORY_SIZE.
  if (found == actionHistory.end())
    found = actionHistory.emplace(state, HISTORY_SIZE).first;

  return found->second;
}
/// @brief Entropy history of the current state, created on first access.
///
/// Aborts through getCurrentStateName() when no state name has been set.
///
/// @return Reference to the history stored in entropyHistory for the current state.
LimitedStack<float> & Config::getCurrentStateEntropyHistory()
{
  const std::string & state = getCurrentStateName();

  auto found = entropyHistory.find(state);
  if (found == entropyHistory.end())
    found = entropyHistory.emplace(state, HISTORY_SIZE).first;

  return found->second;
}
/// @brief Randomly reorder the content of all tapes, segment by segment.
///
/// A segment is a run of cells ending on a cell whose ref value, in the tape
/// named delimiterTape, equals delimiter. Segments are permuted randomly and
/// identically for every tape; cells located after the last delimiter are
/// kept at the end. The program aborts when no delimiter is found.
///
/// @param delimiterTape Name of the tape in which delimiters are searched.
/// @param delimiter Ref value marking the end of a segment.
void Config::shuffle(const std::string & delimiterTape, const std::string & delimiter)
{
  auto & referenceTape = getTape(delimiterTape);

  // Inclusive [first, last] bounds of every segment, last being a delimiter cell.
  std::vector< std::pair<unsigned int, unsigned int> > segments;
  unsigned int previousIndex = 0;

  for (int i = 0; i < referenceTape.refSize(); i++)
    if (referenceTape.getRef(i-head) == delimiter)
    {
      segments.emplace_back(previousIndex, i);
      previousIndex = i+1;
    }

  if (segments.empty())
  {
    fprintf(stderr, "ERROR (%s) : Requested to shuffle based on tape \'%s\' with \'%s\' as a delimiter, but none as been found. Aborting.\n", ERRINFO, delimiterTape.c_str(), delimiter.c_str());
    exit(1);
  }

  // Cells after the last delimiter; must be computed before shuffling.
  std::pair<unsigned int, unsigned int> suffix = {segments.back().second+1, referenceTape.refSize()-1};

  // std::random_shuffle was deprecated in C++14 and removed in C++17.
  // The engine is seeded from std::rand() so that the permutation still
  // depends on whatever seed the program gave to std::srand().
  std::mt19937 generator(static_cast<std::mt19937::result_type>(std::rand()));
  std::shuffle(segments.begin(), segments.end(), generator);

  auto newTapes = tapes;

  for (unsigned int tapeIndex = 0; tapeIndex < tapes.size(); tapeIndex++)
  {
    newTapes[tapeIndex].clearDataForCopy();

    for (auto & segment : segments)
      newTapes[tapeIndex].copyPart(tapes[tapeIndex], segment.first, segment.second+1);

    if (suffix.first <= suffix.second)
      newTapes[tapeIndex].copyPart(tapes[tapeIndex], suffix.first, suffix.second+1);
  }

  tapes = std::move(newTapes);
}
int Config::stackGetElem(int index) const
{
if (index == -1)
return stackHistory;
if (index < 0 || index >= (int)stack.size())
{
fprintf(stderr, "ERROR (%s) : requested element index \'%d\' in the stack. Aborting.\n", ERRINFO, index);
exit(1);
}
return stack[stack.size()-1-index];
}
/// @brief Tell whether stackGetElem() accepts the given index.
/// @param index Depth from the top of the stack, or -1.
/// @return true for -1 and for any index in [0, stack size).
bool Config::stackHasIndex(int index) const
{
  if (index == -1)
    return true;

  return index >= 0 && index < (int)stack.size();
}
/// @brief Tell whether the stack has no top element.
/// @return true when index 0 is not a valid stack index.
bool Config::stackEmpty() const
{
  const bool hasTop = stackHasIndex(0);
  return !hasTop;
}
/// @brief Remove the top of the stack, remembering it in stackHistory.
///
/// Aborts the program when the stack is empty.
void Config::stackPop()
{
  if (!stack.empty())
  {
    stackHistory = stack.back();
    stack.pop_back();
    return;
  }

  fprintf(stderr, "ERROR (%s) : Popping empty stack. Aborting.\n", ERRINFO);
  exit(1);
}
void Config::stackPush(int elem)
{
stack.push_back(elem);
}
/// @brief Read the top of the stack without removing it.
///
/// Aborts the program when the stack is empty.
///
/// @return The last element pushed and not popped yet.
int Config::stackTop()
{
  if (!stack.empty())
    return stack.back();

  fprintf(stderr, "ERROR (%s) : Requesting back element of empty stack. Aborting.\n", ERRINFO);
  exit(1);
}
/// @brief Number of elements currently in the stack.
int Config::stackSize() const
{
  const int nbElements = stack.size();
  return nbElements;
}
/// @brief Unsupported operation : always reports an error on stderr and exits with status 1.
void Config::loadFromFile(File &)
{
  fprintf(stderr, "ERROR (%s) : not supported. Aborting.\n", ERRINFO);

  exit(1);
}
/// @brief Push an entropy value on the entropy history of the current state.
///
/// The history is created on first use. Aborts through getCurrentStateName()
/// when no state name has been set.
///
/// @param entropy Value to push.
void Config::addToEntropyHistory(float entropy)
{
  const std::string & state = getCurrentStateName();

  auto found = entropyHistory.find(state);
  if (found == entropyHistory.end())
    found = entropyHistory.emplace(state, HISTORY_SIZE).first;

  found->second.push(entropy);
}
/// @brief Hash the content of all tapes in a window around the head.
///
/// The window starts at head-3 (clamped to 0) and stops before
/// min(tapes[0].refSize()-1, head+3); that last index is not hashed.
///
/// @return The combined hash; 0 when the window is empty.
std::size_t Config::computeHash()
{
static int window = 3;
int start = std::max(0, head-window);
int end = std::min(tapes[0].refSize()-1, head+window);
std::hash<std::string> hasher;
std::size_t result = 0;
// Cells are visited index by index, then tape by tape; the fold below is
// order dependent, so this visiting order is part of the hash definition.
for (int i = start; i < end; i++)
for (auto & tape : tapes)
result ^= (hasher(tape[i-head])*0x9e3779b9+(result << 6)+(result >>2));
return result;
}
/// @brief Push the hash of the current configuration on hashHistory.
void Config::addHashToHistory()
{
  const std::size_t currentHash = computeHash();
  hashHistory.push(currentHash);
}
/// @brief Dict associated with a line, by line number.
/// @param num Line number, forwarded to bd.
/// @return Whatever bd.getDictOfLine(num) yields.
Dict * Config::getDictOfLine(int num)
{
  Dict * dict = bd.getDictOfLine(num);
  return dict;
}
/// @brief Dict associated with a line, by line name.
/// @param name Line name, forwarded to bd.
/// @return Whatever bd.getDictOfLine(name) yields.
Dict * Config::getDictOfLine(const std::string & name)
{
  Dict * dict = bd.getDictOfLine(name);
  return dict;
}
int Config::getHead() const
{
return head;
}
const std::string & Config::Tape::getName()
{
return name;
}
/// @brief Move the head of the tape; no bounds check is done here.
/// @param mvt Signed number of cells to move by.
void Config::Tape::moveHead(int mvt)
{
  head = head + mvt;
}
/// @brief Tell whether the whole input has been consumed.
/// @return true when the input has been fully read and the head of the first tape is at its end.
bool Config::endOfTapes() const
{
  if (!inputAllRead)
    return false;

  return tapes[0].headIsAtEnd();
}
/// @brief Tell whether the head of the tape is on the last index of ref.
bool Config::Tape::headIsAtEnd() const
{
  const auto lastIndex = ref.getLastIndex();
  return head == lastIndex;
}
/// @brief Size of the tape; same value as refSize().
int Config::Tape::size()
{
  const int nbCells = refSize();
  return nbCells;
}
/// @brief Value reported by ref.getDataSize().
int Config::Tape::dataSize()
{
  const auto nbStored = ref.getDataSize();
  return nbStored;
}
/// @brief Number of ref cells, i.e. the last index of ref plus one.
int Config::Tape::refSize()
{
  const auto lastIndex = ref.getLastIndex();
  return lastIndex+1;
}
/// @brief Number of hyp cells, i.e. the last index of hyp plus one.
int Config::Tape::hypSize()
{
  const auto lastIndex = hyp.getLastIndex();
  return lastIndex+1;
}
/// @brief Append a cell to hyp, paired with the current totalEntropy of the tape.
/// @param elem Value of the new cell.
void Config::Tape::addToHyp(const std::string & elem)
{
  const std::pair<std::string, float> cell(elem, totalEntropy);
  hyp.push(cell);
}
void Config::Tape::addToRef(const std::string & elem)
{
ref.push(elem);
}
/// @brief Empty both buffers of the tape and rewind its head to 0.
void Config::Tape::clear()
{
  ref.clear();
  hyp.clear();
  head = 0;
}
/// @brief Copy a range of cells from another tape, for both ref and hyp.
///
/// The bounds are forwarded as is to the copy() of each buffer.
///
/// @param other Tape to copy from.
/// @param from First bound of the range.
/// @param to Second bound of the range.
void Config::Tape::copyPart(Tape & other, unsigned int from, unsigned int to)
{
  this->ref.copy(other.ref, from, to);
  this->hyp.copy(other.hyp, from, to);
}
void Config::Tape::clearDataForCopy()
{
ref.clearData();
hyp.clearData();
}
/// @brief Set the output stream of the configuration.
///
/// While it is null, printAsOutput() calls made with it and printTheRest()
/// do nothing.
///
/// @param outputFile Stream to remember; not opened nor closed here.
void Config::setOutputFile(FILE * outputFile)
{
  this->outputFile = outputFile;
}
/// @brief Value reported by ref.getNextOverridenDataIndex().
int Config::Tape::getNextOverridenDataIndex()
{
  const auto dataIndex = ref.getNextOverridenDataIndex();
  return dataIndex;
}
/// @brief Value reported by ref.getNextOverridenRealIndex().
int Config::Tape::getNextOverridenRealIndex()
{
  const auto realIndex = ref.getNextOverridenRealIndex();
  return realIndex;
}
/// @brief Print the cells that have not been sent to the output yet.
///
/// Does nothing when no output file is set. Cells are printed in two runs :
/// first the data indexes following the last one printed, then the data
/// indexes from 0 up to, but excluding, the last one printed.
void Config::printTheRest()
{
if (!outputFile)
return;
int tapeSize = tapes[0].size();
// Copy lastIndexPrinted : printAsOutput() overwrites it on every call below.
int goalPrintIndex = lastIndexPrinted;
// The inner conditional always yields 0; algebraically the whole expression
// reduces to tapeSize - tapes[0].dataSize().
int realIndex = tapeSize - 1 - ((((tapes[0].dataSize()-(goalPrintIndex == -1 ? 0 : 0)))-(goalPrintIndex+1))+(goalPrintIndex));
// When nothing has been printed yet (goalPrintIndex == -1), the upper bound is one lower.
for (int i = goalPrintIndex+1; i < (tapes[0].dataSize()-(goalPrintIndex == -1 ? 1 : 0)); i++)
{
printAsOutput(outputFile, i, realIndex);
realIndex++;
}
for (int i = 0; i < goalPrintIndex; i++)
{
printAsOutput(outputFile, i, realIndex);
realIndex++;
}
}
void Config::setEntropy(float entropy)
{
totalEntropy = entropy;
for (auto & tape : tapes)
tape.setTotalEntropy(totalEntropy);
}
float Config::getEntropy() const
{
return totalEntropy;
}
void Config::addToEntropy(float entropy)
{
totalEntropy += entropy;
for (auto & tape : tapes)
tape.setTotalEntropy(totalEntropy);
}
void Config::Tape::setTotalEntropy(float entropy)
{
totalEntropy = entropy;
}
void Config::Tape::maskIndex(int index)
{
ref.maskIndex(index);
}
/// @brief Print on stderr, for every tape, its name and its cell at the given index.
///
/// One "name<TAB>: value" line is printed per tape, followed by an empty line.
///
/// @param index Index of the cell; made relative to the head before reading the tapes.
void Config::printColumnInfos(unsigned int index)
{
  const auto relativeIndex = index-getHead();

  for (auto & tape : tapes)
  {
    const std::string & value = tape[relativeIndex];
    fprintf(stderr, "%s\t: %s\n", tape.getName().c_str(), value.c_str());
  }

  fprintf(stderr, "\n");
}
/// @brief Record an action and its cost for a classifier at the current head position.
///
/// Entries are grouped under the key "<classifier>_<head>".
///
/// @param classifier Name of the classifier.
/// @param action Name of the action.
/// @param cost Cost associated with the action.
void Config::addToActionsHistory(std::string & classifier, std::string & action, int cost)
{
  const std::string key = classifier+"_"+std::to_string(head);
  actionsHistory[key].emplace_back(action, cost);
}
/// @brief Actions recorded for a classifier at the current head position.
///
/// The entry for the key "<classifier>_<head>" is accessed with operator[]
/// of actionsHistory.
///
/// @param classifier Name of the classifier.
/// @return Reference to the list of (action, cost) pairs stored under that key.
std::vector< std::pair<std::string, int> > & Config::getActionsHistory(std::string & classifier)
{
  const std::string key = classifier+"_"+std::to_string(head);
  return actionsHistory[key];
}