#include "Config.hpp"
#include <algorithm>
#include <cstring>
#include "File.hpp"
#include "ProgramParameters.hpp"
#include "Action.hpp"
#include "ProgramOutput.hpp"
#include "utf8.hpp"

// Builds an empty configuration bound to `bd`, with one tape per line
// declared by `bd`. The input file is not opened here; it is opened lazily
// by readInput().
Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(HISTORY_SIZE), pastActions(HISTORY_SIZE)
{
  this->outputFile = nullptr;
  this->stackHistory = -1;
  this->inputFilename = inputFilename;
  this->lastIndexPrinted = -1;
  head = 0;
  inputAllRead = false;

  for (int i = 0; i < bd.getNbLines(); i++)
    tapes.emplace_back(bd.getNameOfLine(i), bd.lineIsKnown(i));

  this->totalEntropy = 0;
}

// Copy constructor. The File object is duplicated (not shared) when the
// source has one.
// NOTE(review): actionsHistory is not copied here, so the copy starts with
// an empty one - confirm this is intended.
Config::Config(const Config & other) : bd(other.bd), hashHistory(other.hashHistory), pastActions(other.pastActions)
{
  this->currentStateName = other.currentStateName;
  this->actionHistory = other.actionHistory;
  this->entropyHistory = other.entropyHistory;
  this->stack = other.stack;
  this->stackHistory = other.stackHistory;
  this->head = other.head;
  this->outputFile = other.outputFile;
  this->lastIndexPrinted = other.lastIndexPrinted;
  this->tapes = other.tapes;
  this->totalEntropy = other.totalEntropy;
  this->inputFilename = other.inputFilename;
  this->inputAllRead = other.inputAllRead;

  // `other.file` stays null until other.readInput() has run at least once;
  // dereferencing it unconditionally would be a null dereference.
  if (other.file.get())
    this->file.reset(new File(*other.file.get()));
}

// Creates a tape named `name`. `isKnown` selects which buffer operator[]
// reads from (ref when true, hyp when false).
Config::Tape::Tape(const std::string & name, bool isKnown) : ref(ProgramParameters::readSize*4+1, Dict::unknownValueStr), hyp(ProgramParameters::readSize*4+1, std::make_pair(Dict::unknownValueStr, 0.0))
{
  this->head = 0;
  this->name = name;
  this->isKnown = isKnown;
  this->totalEntropy = 0.0;
}

// True when `bd` declares a line called `name`.
bool Config::hasTape(const std::string & name)
{
  return bd.hasLineOfName(name);
}

// Returns the tape whose index is the `bd` line called `name`.
Config::Tape & Config::getTape(const std::string & name)
{
  return tapes[bd.getLineOfName(name)];
}

// Returns the tape associated by `bd` with input column `col`.
Config::Tape & Config::getTapeByInputCol(int col)
{
  return tapes[bd.getLineOfInputCol(col)];
}

// Reads up to ProgramParameters::readSize more lines from the input file and
// appends their columns to the tapes. Opens the file on first use. Does
// nothing once inputAllRead is set. Exits the process on malformed input.
void Config::readInput()
{
  if (inputAllRead)
    return;

  if (!file.get())
    file.reset(new File(inputFilename, "r"));
  FILE * fd = file->getDescriptor();

  // The field width in the fscanf format below must stay equal to
  // sizeof(buffer)-1, otherwise a long line overflows the buffer.
  char buffer[100000];
  std::vector<std::string> cols;
  unsigned int usualColsSize = 0;
  int toRead = ProgramParameters::readSize;
  int haveRead = 0;

  while (haveRead < toRead && fscanf(fd, "%99999[^\n]\n", buffer) == 1)
  {
    // A completely filled buffer means the line was (very likely) truncated;
    // refuse to continue rather than silently splitting it in two.
    if (std::strlen(buffer) == sizeof(buffer)-1)
    {
      fprintf(stderr, "ERROR (%s) : input (%s) line %d is too long (limit is %zu bytes). Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), sizeof(buffer)-1);
      exit(1);
    }

    if (!utf8::is_valid(buffer, buffer+std::strlen(buffer)))
    {
      fprintf(stderr, "ERROR (%s) : input (%s) line %d is not toally utf-8 formated. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size());
      exit(1);
    }

    cols = split(buffer, '\t');

    // The first line read by this call fixes the expected column count.
    if (!usualColsSize)
      usualColsSize = cols.size();

    if (cols.size() != usualColsSize)
    {
      fprintf(stderr, "ERROR (%s) : input (%s) line %d has %zu columns instead of %u. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), cols.size(), usualColsSize);
      exit(1);
    }

    // Emit the cell that is about to be overridden, before pushing new data.
    printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex());

    for (unsigned int i = 0; i < cols.size(); i++)
    {
      if (!bd.hasLineOfInputCol(i))
        continue;

      auto & tape = getTapeByInputCol(i);
      tape.addToRef(cols[i]);
      tape.addToHyp("");

      if (tape.getName() == ProgramParameters::tapeToMask && choiceWithProbability(ProgramParameters::maskRate))
        tape.maskIndex(tape.refSize()-1);
    }

    haveRead++;
  }

  // Making all tapes the same size
  int maxTapeSize = 0;
  for (auto & tape : tapes)
    maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.refSize());

  if (haveRead < toRead || tapes[0].size() == ProgramParameters::tapeSize)
  {
    printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex());
    inputAllRead = true;
  }

  for (auto & tape : tapes)
  {
    while (tape.refSize() < maxTapeSize)
      tape.addToRef("");
    while (tape.hypSize() < maxTapeSize)
      tape.addToHyp("");

    // Once the input is exhausted, one extra cell is appended to every tape.
    if (inputAllRead)
    {
      tape.addToRef("0");
      tape.addToHyp("");
    }
  }
}

// Prints a human-readable view of the configuration to `output`: the total
// entropy, the final/end-of-tapes flags, a window of cells around the head
// for every tape, and the stack content.
void Config::printForDebug(FILE * output)
{
  int window = 5;
  const int firstShown = std::max(0, head-window);

  // cols[0] holds the row labels: two blank cells, then one tape name per row.
  std::vector< std::vector<std::string> > cols;
  cols.emplace_back();
  cols[0].emplace_back();
  cols[0].emplace_back();

  for (auto & tape : tapes)
  {
    cols[0].emplace_back(tape.getName());

    for (int i = firstShown; i < std::min(tape.hypSize(), head+window); i++)
    {
      unsigned int colIndex = i - firstShown + 1;
      while (cols.size() <= colIndex)
        cols.emplace_back();

      // Only the first tape contributes the two header cells of a column.
      if (&tape == &tapes[0])
      {
        cols[colIndex].emplace_back(i == head ? "head" : std::to_string(i));
        cols[colIndex].emplace_back(i == head ? " || " : "");
      }

      cols[colIndex].emplace_back(shrinkString(tape[i-head], 10, ".."));
    }
  }

  fprintf(output, "Configuration : %.2f entropy\n", totalEntropy);
  fprintf(output, "isFinal : %s endOfTapes : %s\n", isFinal() ? "true" : "false", endOfTapes() ? "true" : "false");

  for (int i = 0; i < 80; i++)
    fprintf(output, "-%s", i == 80-1 ? "\n" : "");

  printColumns(output, cols, 3);

  fprintf(output, "Stack : ");
  for (int s : stack)
    fprintf(output, "%d ", s);
  fprintf(output, "\n");

  for (int i = 0; i < 80; i++)
    fprintf(output, "-%s", i == 80-1 ? "\n" : "");
}

// Not supported: always reports an error and exits the process.
void Config::printAsExample(FILE *)
{
  fprintf(stderr, "ERROR (%s) : not supported. Aborting.\n", ERRINFO);
  exit(1);
}

// Hands the cell at data index `dataIndex` of every printable tape to
// ProgramOutput as one output line tagged with `realIndex`.
// Does nothing when `dataIndex` is -1 or `output` is null.
void Config::printAsOutput(FILE * output, int dataIndex, int realIndex)
{
  if (dataIndex == -1 || !output)
    return;

  lastIndexPrinted = dataIndex;

  std::vector< std::pair<std::string, float> > toPrint;
  for (unsigned int j = 0; j < tapes.size(); j++)
  {
    if (!bd.mustPrintLine(j))
      continue;

    // Tape accessors take an index relative to the head.
    const std::string & value = tapes[j][dataIndex-head];
    toPrint.emplace_back(value.empty() ? "0" : value.c_str(), tapes[j].getEntropy(dataIndex-head));
  }

  ProgramOutput::instance.addLine(toPrint, realIndex);
}

// Moves the head of the configuration and of every tape by `mvt` cells.
// Triggers readInput() each time the head lands on a multiple of readSize
// at or beyond 3*readSize. Exits if the move would run past the tapes while
// endOfTapes() is still false.
void Config::moveHead(int mvt)
{
  if (head + mvt < tapes[0].size())
  {
    head += mvt;

    for (auto & tape : tapes)
      tape.moveHead(mvt);

    if (mvt > 0 && head % ProgramParameters::readSize == 0 && head >= (3*ProgramParameters::readSize))
      readInput();
  }
  else if (!endOfTapes())
  {
    fprintf(stderr, "ERROR (%s) : Input has not been read completely, yet the head is already at the end of tapes. Aborting.\n", ERRINFO);
    exit(1);
  }
}

// A configuration is final when the tapes are exhausted and the stack is empty.
bool Config::isFinal()
{
  return endOfTapes() && stack.empty();
}

// Clears tapes, histories and stack, rewinds the head, drops the input file
// handle and refills the tapes from the start of the input.
// NOTE(review): entropyHistory, totalEntropy and lastIndexPrinted are left
// untouched - confirm this is intended.
void Config::reset()
{
  for (auto & tape : tapes)
    tape.clear();

  actionHistory.clear();
  pastActions.clear();
  hashHistory.clear();
  actionsHistory.clear();

  stack.clear();
  stackHistory = -1;

  inputAllRead = false;
  head = 0;

  // Dropping the File makes the next readInput() reopen the input.
  file.reset();
  while (tapes[0].size() < ProgramParameters::readSize*4 && !inputAllRead)
    readInput();
}

// Cell at `relativeIndex` from the head: the reference value for a known
// tape, the hypothesis value otherwise.
const std::string & Config::Tape::operator[](int relativeIndex)
{
  if (isKnown)
    return getRef(relativeIndex);

  return getHyp(relativeIndex);
}

// Entropy stored with the hypothesis at `relativeIndex` from the head;
// always 0 for a known tape.
float Config::Tape::getEntropy(int relativeIndex)
{
  if (isKnown)
    return 0.0;

  return hyp.get(head + relativeIndex).second;
}

// Reference value at `relativeIndex` from the head.
const std::string & Config::Tape::getRef(int relativeIndex)
{
  return ref.get(head + relativeIndex);
}

// Hypothesis value at `relativeIndex` from the head.
const std::string & Config::Tape::getHyp(int relativeIndex)
{
  return hyp.get(head + relativeIndex).first;
}

// Stores `elem` as the hypothesis at `relativeIndex` from the head, paired
// with the tape's current totalEntropy.
void Config::Tape::setHyp(int relativeIndex, const std::string & elem)
{
  hyp.set(head + relativeIndex, std::pair<std::string,float>(elem,totalEntropy));
}

// Returns the current state name; exits the process if it has not been set.
std::string & Config::getCurrentStateName()
{
  if (currentStateName.empty())
  {
    fprintf(stderr, "ERROR (%s) : currentStateName is empty. Aborting.\n", ERRINFO);
    exit(1);
  }

  return currentStateName;
}

void Config::setCurrentStateName(const std::string & name)
{
  this->currentStateName = name;
}

// Action history of the current state, created on first access.
LimitedStack<std::string> & Config::getCurrentStateHistory()
{
  return getStateHistory(getCurrentStateName());
}

// Action history of `state`, created (with capacity HISTORY_SIZE) on first access.
LimitedStack<std::string> & Config::getStateHistory(const std::string & state)
{
  if (!actionHistory.count(state))
    actionHistory.emplace(state, HISTORY_SIZE);

  return actionHistory.find(state)->second;
}

// Entropy history of the current state, created on first access.
LimitedStack<float> & Config::getCurrentStateEntropyHistory()
{
  const std::string & state = getCurrentStateName();

  if (!entropyHistory.count(state))
    entropyHistory.emplace(state, HISTORY_SIZE);

  return entropyHistory.find(state)->second;
}

// Reorders the content of every tape by shuffling the segments delimited by
// the cells of tape `delimiterTape` whose reference value equals `delimiter`.
// Cells after the last delimiter are kept at the end. Exits if no delimiter
// is found.
void Config::shuffle(const std::string & delimiterTape, const std::string & delimiter)
{
  auto & refTape = getTape(delimiterTape);

  // Each segment is [first, second], the delimiter cell being `second`.
  std::vector< std::pair<unsigned int, unsigned int> > delimiters;
  unsigned int previousIndex = 0;

  for (int i = 0; i < refTape.refSize(); i++)
    if (refTape.getRef(i-head) == delimiter)
    {
      delimiters.emplace_back(previousIndex, i);
      previousIndex = i+1;
    }

  if (delimiters.empty())
  {
    fprintf(stderr, "ERROR (%s) : Requested to shuffle based on tape \'%s\' with \'%s\' as a delimiter, but none as been found. Aborting.\n", ERRINFO, delimiterTape.c_str(), delimiter.c_str());
    exit(1);
  }

  // Must be computed before shuffling: it depends on the last segment in
  // input order.
  std::pair<unsigned int, unsigned int> suffix = {delimiters.back().second+1, refTape.refSize()-1};

  // NOTE(review): std::random_shuffle was removed in C++17. Switching to
  // std::shuffle changes the random sequence, so it is left as is here.
  std::random_shuffle(delimiters.begin(), delimiters.end());

  auto newTapes = tapes;

  for (unsigned int t = 0; t < tapes.size(); t++)
  {
    newTapes[t].clearDataForCopy();

    for (auto & segment : delimiters)
      newTapes[t].copyPart(tapes[t], segment.first, segment.second+1);

    if (suffix.first <= suffix.second)
      newTapes[t].copyPart(tapes[t], suffix.first, suffix.second+1);
  }

  tapes = newTapes;
}

// Element `index` positions below the top of the stack (0 is the top).
// Index -1 returns stackHistory, i.e. the last popped element.
// Exits on any other out-of-range index.
int Config::stackGetElem(int index) const
{
  if (index == -1)
    return stackHistory;

  if (index < 0 || index >= (int)stack.size())
  {
    fprintf(stderr, "ERROR (%s) : requested element index \'%d\' in the stack. Aborting.\n", ERRINFO, index);
    exit(1);
  }

  return stack[stack.size()-1-index];
}

// True when stackGetElem(index) would not abort.
bool Config::stackHasIndex(int index) const
{
  return index == -1 || (index >= 0 && index < (int)stack.size());
}

bool Config::stackEmpty() const
{
  return !stackHasIndex(0);
}

// Pops the top of the stack, remembering it in stackHistory.
// Exits if the stack is empty.
void Config::stackPop()
{
  if (stack.empty())
  {
    fprintf(stderr, "ERROR (%s) : Popping empty stack. Aborting.\n", ERRINFO);
    exit(1);
  }

  stackHistory = stack.back();
  stack.pop_back();
}

void Config::stackPush(int elem)
{
  stack.push_back(elem);
}

// Returns the top of the stack; exits if the stack is empty.
int Config::stackTop()
{
  if (stack.empty())
  {
    fprintf(stderr, "ERROR (%s) : Requesting back element of empty stack. Aborting.\n", ERRINFO);
    exit(1);
  }

  return stack.back();
}

int Config::stackSize() const
{
  return stack.size();
}

// Not supported: always reports an error and exits the process.
void Config::loadFromFile(File &)
{
  fprintf(stderr, "ERROR (%s) : not supported. Aborting.\n", ERRINFO);
  exit(1);
}

// Pushes `entropy` onto the entropy history of the current state.
void Config::addToEntropyHistory(float entropy)
{
  getCurrentStateEntropyHistory().push(entropy);
}

// Hashes the cells of every tape in a window around the head.
// The upper bound is exclusive and capped at the last index of the first tape.
std::size_t Config::computeHash()
{
  static int window = 10;

  int start = std::max(0, head-window);
  int end = std::min(tapes[0].refSize()-1, head+window);

  std::hash<std::string> hasher;
  std::size_t result = 0;

  for (int i = start; i < end; i++)
    for (auto & tape : tapes)
      result ^= (hasher(tape[i-head])*0x9e3779b9+(result << 6)+(result >>2));

  return result;
}

void Config::addHashToHistory()
{
  hashHistory.push(computeHash());
}

Dict * Config::getDictOfLine(int num)
{
  return bd.getDictOfLine(num);
}

Dict * Config::getDictOfLine(const std::string & name)
{
  return bd.getDictOfLine(name);
}

int Config::getHead() const
{
  return head;
}

const std::string & Config::Tape::getName()
{
  return name;
}

void Config::Tape::moveHead(int mvt)
{
  head += mvt;
}

// True once all the input has been read and the first tape's head is on its
// last index.
bool Config::endOfTapes() const
{
  return inputAllRead && tapes[0].headIsAtEnd();
}

bool Config::Tape::headIsAtEnd() const
{
  return head == ref.getLastIndex();
}

// Same as refSize().
int Config::Tape::size()
{
  return refSize();
}

int Config::Tape::dataSize()
{
  return ref.getDataSize();
}

int Config::Tape::refSize()
{
  return ref.getLastIndex()+1;
}

int Config::Tape::hypSize()
{
  return hyp.getLastIndex()+1;
}

// Appends `elem` to the hypothesis buffer, paired with the tape's current
// totalEntropy.
void Config::Tape::addToHyp(const std::string & elem)
{
  hyp.push(std::pair<std::string, float>(elem,totalEntropy));
}

void Config::Tape::addToRef(const std::string & elem)
{
  ref.push(elem);
}

void Config::Tape::clear()
{
  head = 0;
  ref.clear();
  hyp.clear();
}

// Copies the range described by (`from`, `to`) of `other`'s ref and hyp
// buffers into this tape's buffers.
void Config::Tape::copyPart(Tape & other, unsigned int from, unsigned int to)
{
  ref.copy(other.ref, from, to);
  hyp.copy(other.hyp, from, to);
}

void Config::Tape::clearDataForCopy()
{
  ref.clearData();
  hyp.clearData();
}

void Config::setOutputFile(FILE * outputFile)
{
  this->outputFile = outputFile;
}

int Config::Tape::getNextOverridenDataIndex()
{
  return ref.getNextOverridenDataIndex();
}

int Config::Tape::getNextOverridenRealIndex()
{
  return ref.getNextOverridenRealIndex();
}

// Prints every cell that has not been printed yet: first the data indexes
// after lastIndexPrinted, then the ones before it. Real indexes are assigned
// consecutively. Does nothing when no output file is set.
void Config::printTheRest()
{
  if (!outputFile)
    return;

  const int tapeSize = tapes[0].size();
  const int dataSize = tapes[0].dataSize();
  const int goalPrintIndex = lastIndexPrinted;

  // Equivalent to tapeSize-1 - ((dataSize-(goalPrintIndex+1)) + goalPrintIndex).
  int realIndex = tapeSize - dataSize;

  // When nothing has been printed yet, the last data cell is skipped.
  const int firstPassEnd = dataSize - (goalPrintIndex == -1 ? 1 : 0);

  for (int i = goalPrintIndex+1; i < firstPassEnd; i++)
  {
    printAsOutput(outputFile, i, realIndex);
    realIndex++;
  }

  for (int i = 0; i < goalPrintIndex; i++)
  {
    printAsOutput(outputFile, i, realIndex);
    realIndex++;
  }
}

// Sets the total entropy and propagates it to every tape.
void Config::setEntropy(float entropy)
{
  totalEntropy = entropy;

  for (auto & tape : tapes)
    tape.setTotalEntropy(totalEntropy);
}

float Config::getEntropy() const
{
  return totalEntropy;
}

// Adds `entropy` to the total entropy and propagates the new total to every tape.
void Config::addToEntropy(float entropy)
{
  totalEntropy += entropy;

  for (auto & tape : tapes)
    tape.setTotalEntropy(totalEntropy);
}

void Config::Tape::setTotalEntropy(float entropy)
{
  totalEntropy = entropy;
}

void Config::Tape::maskIndex(int index)
{
  ref.maskIndex(index);
}

// Prints to stderr, for every tape, its name and its cell at absolute
// index `index`.
void Config::printColumnInfos(unsigned int index)
{
  for (auto & tape : tapes)
    fprintf(stderr, "%s\t: %s\n", tape.getName().c_str(), tape[index-getHead()].c_str());

  fprintf(stderr, "\n");
}

// Records (`action`, `cost`) under the key "<state>_<head>".
void Config::addToActionsHistory(std::string & state, const std::string & action, int cost)
{
  actionsHistory[state+"_"+std::to_string(head)].emplace_back(action, cost);
}

// Returns the actions recorded under the key "<state>_<head>"
// (operator[] is used, so a missing key yields a new empty entry).
std::vector< std::pair<std::string, int> > & Config::getActionsHistory(std::string & state)
{
  return actionsHistory[state+"_"+std::to_string(head)];
}

// Percentage of cells whose hypothesis equals the reference.
// Returns 0 for an empty tape instead of dividing by zero.
float Config::Tape::getScore()
{
  const int total = refSize();
  if (total <= 0)
    return 0.0;

  float res = 0.0;

  for (int i = 0; i < total; i++)
    if (getRef(i-head) == getHyp(i-head))
      res += 1;

  return 100.0*res / total;
}