#include "Config.hpp"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

#include "File.hpp"
#include "ProgramParameters.hpp"
#include "Action.hpp"
#include "ProgramOutput.hpp"
#include "utf8.hpp"

namespace
{

/// @brief Print a separator made of 80 dashes followed by a newline.
/// @param output Stream the separator is written to.
void printSeparatorLine(FILE * output)
{
  fprintf(output, "%s\n", std::string(80, '-').c_str());
}

} // namespace

/// @brief Build a configuration over the tapes described by `bd` and read the input file.
/// @param bd Description of the tapes (one tape is created per line of `bd`).
/// @param inputFilename Path of the file that readInput() will load.
Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(HISTORY_SIZE), pastActions(HISTORY_SIZE)
{
  this->outputFile = nullptr;
  this->stackHistory = -1;
  this->lastIndexPrinted = -1;
  this->inputFilename = inputFilename;
  head = 0;
  rawInputHead = 0;
  currentWordIndex = 1;
  rawInputHeadIndex = 0;
  inputAllRead = false;

  for (int i = 0; i < bd.getNbLines(); i++)
    tapes.emplace_back(bd.getNameOfLine(i), bd.lineIsKnown(i));

  this->totalEntropy = 0;

  readInput();
}

/// @brief Copy the state of another configuration.
///
/// The members copied are exactly the ones listed below; the File object is
/// duplicated (not shared) when the source configuration owns one.
Config::Config(const Config & other) : bd(other.bd), hashHistory(other.hashHistory), pastActions(other.pastActions)
{
  this->currentStateName = other.currentStateName;
  this->actionHistory = other.actionHistory;
  this->entropyHistory = other.entropyHistory;
  this->stack = other.stack;
  this->stackHistory = other.stackHistory;
  this->head = other.head;
  this->outputFile = other.outputFile;
  this->lastIndexPrinted = other.lastIndexPrinted;
  this->tapes = other.tapes;
  this->totalEntropy = other.totalEntropy;
  this->rawInputHead = other.rawInputHead;
  this->currentWordIndex = other.currentWordIndex;
  this->rawInputHeadIndex = other.rawInputHeadIndex;
  this->rawInput = other.rawInput;
  this->inputFilename = other.inputFilename;
  this->inputAllRead = other.inputAllRead;

  // Guard against a source configuration that has no open file.
  if (other.file.get())
    this->file.reset(new File(*other.file.get()));
}

/// @brief Build an empty tape.
/// @param name Name of the tape.
/// @param isKnown When true, operator[] / set() address the reference values instead of the hypothesis.
Config::Tape::Tape(const std::string & name, bool isKnown) : ref(ProgramParameters::readSize, Dict::unknownValueStr), hyp(ProgramParameters::readSize, std::make_pair(Dict::unknownValueStr, 0.0))
{
  this->head = 0;
  this->name = name;
  this->isKnown = isKnown;
  this->totalEntropy = 0.0;
}

/// @brief Tell whether a tape of the given name is declared in the BD.
bool Config::hasTape(const std::string & name)
{
  return bd.hasLineOfName(name);
}

/// @brief Get a tape from its name.
Config::Tape & Config::getTape(const std::string & name)
{
  return tapes[bd.getLineOfName(name)];
}

/// @brief Get the tape associated with an input column index.
Config::Tape & Config::getTapeByInputCol(int col)
{
  return tapes[bd.getLineOfInputCol(col)];
}

/// @brief Read the whole input file, once.
///
/// In raw-input mode the file is read character by character into rawInput and
/// every tape receives a single empty cell. Otherwise the file is read line by
/// line into inputContent (one vector of lines per sentence) and the tapes are
/// then filled by fillTapesWithInput().
void Config::readInput()
{
  if (inputAllRead)
    return;

  if (ProgramParameters::rawInput)
  {
    file.reset(new File(inputFilename, "r"));
    while (!file->isFinished())
      rawInput += file->getChar();

    inputAllRead = true;

    for (auto & tape : tapes)
    {
      tape.addToRef("");
      tape.addToHyp("");
    }

    return;
  }

  if (!file.get())
    file.reset(new File(inputFilename, "r"));
  FILE * fd = file->getDescriptor();

  char buffer[100000];
  int lineIndex = 0;

  // The field width (sizeof(buffer)-1) bounds the read so that an over-long
  // line cannot overflow the buffer.
  while (fscanf(fd, "%99999[^\n]\n", buffer) == 1)
  {
    lineIndex++;

    if (!utf8::is_valid(buffer, buffer+std::strlen(buffer)))
    {
      fprintf(stderr, "ERROR (%s) : input (%s) line %d is not toally utf-8 formated. Aborting.\n", ERRINFO, inputFilename.c_str(), lineIndex);
      exit(1);
    }

    if (std::strlen(buffer) <= 3)
      continue;

    const std::string key = split(buffer, '=')[0];

    if (key == "# sent_id ")
      inputContent.emplace_back();
    else if (buffer[0] == '#' && key != "# text ")
      continue;

    // A file that does not start with a "# sent_id" comment still needs a
    // sentence to store its first lines in.
    if (inputContent.empty())
      inputContent.emplace_back();

    inputContent.back().emplace_back(buffer);
  }

  inputAllRead = true;

  fillTapesWithInput();
}

/// @brief Fill every tape (and rawInput) from the lines stored in inputContent.
///
/// For each sentence: "# text" comments are appended to rawInput, every other
/// non-comment line is split on tabs and dispatched to the tapes bound to its
/// columns. When a GOV tape exists, governors are then rewritten from absolute
/// ids to indexes relative to the dependent. Finally all tapes are padded to
/// the same size.
void Config::fillTapesWithInput()
{
  rawInput = "";
  std::vector<std::string> cols;
  unsigned int usualColsSize = 0;
  auto & ids = getTape("ID");
  bool hasGov = hasTape("GOV");
  // When there is no GOV tape, govs aliases ids and is never used.
  auto & govs = hasGov ? getTape("GOV") : ids;

  for (auto & sentence : inputContent)
  {
    int sentenceStartIndex = ids.refSize();

    for (unsigned int wordIndex = 0; wordIndex < sentence.size(); wordIndex++)
    {
      auto & word = sentence[wordIndex];

      if (split(word, '=')[0] == "# text ")
      {
        // The calls to choiceWithProbability are kept in their original order
        // so that the sequence of random draws is unchanged.
        std::string prefix = rawInput.empty() ? "" : " ";
        if (choiceWithProbability(0.3))
          prefix = "\n";
        else if (choiceWithProbability(0.3))
          prefix = "";
        if (rawInput.empty())
          prefix = "";

        // Skip the 9 leading characters ("# text = "), without going past the
        // end of a shorter line.
        const auto textStart = word.begin() + std::min<std::size_t>(9, word.size());
        rawInput += prefix + std::string(textStart, word.end());
        continue;
      }
      else if (word[0] == '#')
        continue;

      cols = split(word, '\t');
      if (!usualColsSize)
        usualColsSize = cols.size();

      if (cols.size() != usualColsSize)
      {
        fprintf(stderr, "ERROR (%s) : input (%s) line %d has %zu columns instead of %u. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), cols.size(), usualColsSize);
        exit(1);
      }

      for (unsigned int i = 0; i < cols.size(); i++)
      {
        if (!bd.hasLineOfInputCol(i))
          continue;

        auto & tape = getTapeByInputCol(i);

        tape.addToRef(cols[i]);
        tape.addToHyp("");

        if (tape.getName() == ProgramParameters::tapeToMask)
          if (choiceWithProbability(ProgramParameters::maskRate))
            tape.maskIndex(tape.refSize()-1);

        if (tape.getName() == ProgramParameters::sequenceDelimiterTape)
        {
          fprintf(stderr, "ERROR (%s) : Tape \'%s\' must not be given as a column in the input since it's the sequence delimiter. Aborting.\n", ERRINFO, tape.getName().c_str());
          exit(1);
        }
      }

      // The delimiter is only set on the last line of the sentence.
      getTape(ProgramParameters::sequenceDelimiterTape).addToRef(wordIndex == sentence.size()-1 ? ProgramParameters::sequenceDelimiter : "");
      getTape(ProgramParameters::sequenceDelimiterTape).addToHyp("");
    }

    if (!hasGov)
      continue;

    // Rewrite each governor id as an index relative to its dependent.
    for (int word = sentenceStartIndex; word < ids.refSize(); word++)
    {
      if (split(ids.getRef(word), '-').size() > 1)
        continue;
      if (split(ids.getRef(word), '.').size() > 1)
        continue;
      if (govs.getRef(word) == "0")
        continue;

      // A flag is used instead of throwing a non-std::exception value, which
      // the handler below would not catch.
      bool governorFound = true;

      try
      {
        int id = std::stoi(ids.getRef(word));
        std::string goalId = govs.getRef(word);
        int relativeIndex = 0;

        if (std::stoi(goalId) < id)
        {
          while (ids.getRef(word+relativeIndex) != goalId)
            if (--relativeIndex+word < 0)
            {
              governorFound = false;
              break;
            }
        }
        else
        {
          while (ids.getRef(word+relativeIndex) != goalId)
            if (++relativeIndex+word >= ids.refSize())
            {
              governorFound = false;
              break;
            }
        }

        if (governorFound)
          govs.setRef(word, std::to_string(relativeIndex));
      }
      catch (std::exception &)
      {
        governorFound = false;
      }

      if (!governorFound)
      {
        fprintf(stderr, "ERROR (%s) : invalid governor '%s' '%s'. Aborting.\n", ERRINFO, govs.getRef(word).c_str(), getTape("FORM").getRef(word).c_str());
        exit(1);
      }
    }
  }

  // Making all tapes the same size
  int maxTapeSize = 0;
  for (auto & tape : tapes)
    maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.refSize());

  for (auto & tape : tapes)
  {
    while (tape.refSize() < maxTapeSize)
      tape.addToRef("");
    while (tape.hypSize() < maxTapeSize)
      tape.addToHyp("");
  }
}

/// @brief Print a human readable view of the configuration.
///
/// Shows the entropy, the final/end-of-tapes flags, a window of the raw input
/// (when there is one), a window of every tape around the head, and the stack.
/// @param output Stream everything is written to.
void Config::printForDebug(FILE * output)
{
  int window = 5;

  std::vector< std::vector<std::string> > cols;
  cols.emplace_back();

  cols[0].emplace_back();
  cols[0].emplace_back();

  const int windowStart = std::max(0, head-window);

  for (auto & tape : tapes)
  {
    cols[0].emplace_back(tape.getName());

    for (int i = windowStart; i < std::min(tape.hypSize(), head+window); i++)
    {
      unsigned int colIndex = i - windowStart + 1;
      while (cols.size() <= colIndex)
        cols.emplace_back();

      // The two header rows are only produced once, by the first tape.
      if (&tape == &tapes[0])
      {
        cols[colIndex].emplace_back(i == head ? "head" : std::to_string(i));
        cols[colIndex].emplace_back(i == head ? " || " : "");
      }

      cols[colIndex].emplace_back(shrinkString(tape[i-head], 10, ".."));
    }
  }

  fprintf(output, "Configuration : %.2f entropy\n", totalEntropy);
  fprintf(output, "isFinal : %s endOfTapes : %s\n", isFinal() ? "true" : "false", endOfTapes() ? "true" : "false");

  printSeparatorLine(output);

  if (!rawInput.empty())
  {
    int rawWindow = 30;
    int relativeHeadIndex = getEndIndexOfNthSymbolFrom(rawInput.begin()+rawInputHeadIndex, rawInput.end(), rawWindow);
    auto endIter = rawInput.begin() + rawInputHeadIndex + relativeHeadIndex + 1;
    if (relativeHeadIndex < 0)
      endIter = rawInput.end();

    std::string toPrint(rawInput.begin()+rawInputHeadIndex, endIter);
    // Written to `output` like the rest of the report (it used to go to stderr).
    fprintf(output, "%s\n", toPrint.c_str());

    printSeparatorLine(output);
  }

  printColumns(output, cols, 3);

  fprintf(output, "Stack : ");
  for (int s : stack)
    fprintf(output, "%d ", s);
  fprintf(output, "\n");

  printSeparatorLine(output);
}

/// @brief Not supported : prints an error and exits.
void Config::printAsExample(FILE *)
{
  fprintf(stderr, "ERROR (%s) : not supported. Aborting.\n", ERRINFO);
  exit(1);
}

/// @brief Emit one line of output for the cell at `dataIndex`.
/// @param output Destination stream; nothing is done when null.
/// @param dataIndex Index of the cell to print; nothing is done when -1.
/// @param realIndex Index forwarded to ProgramOutput::addLine.
/// @param forceRef When true the reference values are printed instead of operator[].
void Config::printAsOutput(FILE * output, int dataIndex, int realIndex, bool forceRef)
{
  if (dataIndex == -1 || !output)
    return;

  lastIndexPrinted = dataIndex;

  const int relativeIndex = dataIndex-head;

  std::vector< std::pair<std::string, float> > toPrint;
  for (unsigned int j = 0; j < tapes.size(); j++)
  {
    int outputTapeIndex = bd.getOutputIndexOfLine(j);

    while ((int)toPrint.size() < outputTapeIndex+1)
      toPrint.emplace_back("_", 0.0);

    if (!bd.mustPrintLine(j))
      continue;

    const std::string & value = forceRef ? tapes[j].getRef(relativeIndex) : tapes[j][relativeIndex];
    toPrint[outputTapeIndex] = {value.empty() ? "_" : value.c_str(), tapes[j].getEntropy(relativeIndex)};
  }

  // Lines where every column is empty are not printed.
  bool allEmpty = true;
  for (auto & it : toPrint)
    if (it.first != "_" && !it.first.empty())
    {
      allEmpty = false;
      break;
    }

  if (allEmpty)
    return;

  ProgramOutput::instance.addLine(output, toPrint, realIndex);

  if (!ProgramParameters::delayedOutput)
  {
    auto eos = forceRef ? getTape(ProgramParameters::sequenceDelimiterTape).getRef(relativeIndex) : getTape(ProgramParameters::sequenceDelimiterTape)[relativeIndex];
    if (eos == ProgramParameters::sequenceDelimiter)
      fprintf(output, "\n");
  }
}

/// @brief Move the head of the configuration and of every tape.
///
/// currentWordIndex is updated by the number of crossed cells whose ID
/// hypothesis is not a range (does not contain '-').
/// @param mvt Signed number of cells to move by.
void Config::moveHead(int mvt)
{
  if (head + mvt <= tapes[0].size())
  {
    head += mvt;

    // The tape heads have not moved yet, so the relative indexes below are
    // relative to the previous head position.
    if (mvt != 0 && hasTape("ID"))
    {
      auto & ids = getTape("ID");

      if (mvt > 0)
        for (int i = 0; i < mvt; i++)
          if (split(ids.getHyp(i), '-').size() <= 1)
            currentWordIndex += 1;

      // The former loop (`i < mvt` with mvt < 0) never ran. The crossed cells
      // are the |mvt| ones preceding the previous head, which makes a backward
      // move undo the matching forward move.
      if (mvt < 0)
        for (int i = -1; i >= mvt; i--)
        {
          if (ids.getHead() + i < 0)
            break;
          if (split(ids.getHyp(i), '-').size() <= 1)
            currentWordIndex -= 1;
        }
    }

    for (auto & tape : tapes)
      tape.moveHead(mvt);
  }
  else if (!endOfTapes())
  {
    fprintf(stderr, "ERROR (%s) : Input has not been read completely, yet the head is already at the end of tapes. Aborting.\n", ERRINFO);
    exit(1);
  }
}

/// @brief Move the head of the raw input by `mvt` symbols.
///
/// rawInputHead counts symbols while rawInputHeadIndex is the matching offset
/// in rawInput; both are left untouched when the helper reports no movement.
void Config::moveRawInputHead(int mvt)
{
  if (mvt >= 0)
  {
    int relativeIndexMvt = getStartIndexOfNthSymbolFrom(rawInput.begin()+rawInputHeadIndex, rawInput.end(), mvt);
    if (relativeIndexMvt > 0)
    {
      rawInputHead += mvt;
      rawInputHeadIndex += relativeIndexMvt;
    }
  }
  else
  {
    int relativeIndexMvt = getStartIndexOfNthSymbolFrom(rawInput.begin()+rawInputHeadIndex, rawInput.begin(), mvt);
    if (relativeIndexMvt < 0)
    {
      rawInputHeadIndex += relativeIndexMvt;
      rawInputHead += mvt;
    }
  }
}

/// @brief Tell whether the configuration is terminal.
bool Config::isFinal()
{
  if (rawInputHeadIndex > 0 && !rawInput.empty())
    return (rawInputHeadIndex >= (int)rawInput.size());

  return endOfTapes() && stack.empty();
}

/// @brief Clear the tapes, histories and stack, and put the heads back at the start.
void Config::reset()
{
  for (auto & tape : tapes)
    tape.clear();

  actionHistory.clear();
  pastActions.clear();
  hashHistory.clear();
  actionsHistory.clear();

  stack.clear();
  stackHistory = -1;

  head = 0;
  rawInputHead = 0;
  rawInputHeadIndex = 0;
  currentWordIndex = 1;
}

/// @brief Value at `relativeIndex` from the head : reference if the tape is known, hypothesis otherwise.
const std::string & Config::Tape::operator[](int relativeIndex)
{
  if (isKnown)
    return getRef(relativeIndex);

  return getHyp(relativeIndex);
}

/// @brief Entropy stored with the hypothesis at `relativeIndex`; 0 for a known tape.
float Config::Tape::getEntropy(int relativeIndex)
{
  if (isKnown)
    return 0.0;

  return hyp.get(head + relativeIndex).second;
}

/// @brief Reference value at `relativeIndex` from the head.
const std::string & Config::Tape::getRef(int relativeIndex)
{
  return ref.get(head + relativeIndex);
}

/// @brief Hypothesis value at `relativeIndex` from the head.
const std::string & Config::Tape::getHyp(int relativeIndex)
{
  return hyp.get(head + relativeIndex).first;
}

/// @brief Set the hypothesis at `relativeIndex`, tagged with the current total entropy.
void Config::Tape::setHyp(int relativeIndex, const std::string & elem)
{
  hyp.set(head + relativeIndex, std::pair<std::string,float>(elem,totalEntropy));
}

/// @brief Set the reference at `relativeIndex`.
void Config::Tape::setRef(int relativeIndex, const std::string & elem)
{
  ref.set(head + relativeIndex, elem);
}

/// @brief Set the reference if the tape is known, the hypothesis otherwise.
void Config::Tape::set(int relativeIndex, const std::string & elem)
{
  if (isKnown)
    return setRef(relativeIndex, elem);

  return setHyp(relativeIndex, elem);
}

/// @brief Name of the current state; exits with an error when it is empty.
std::string & Config::getCurrentStateName()
{
  if (currentStateName.empty())
  {
    fprintf(stderr, "ERROR (%s) : currentStateName is empty. Aborting.\n", ERRINFO);
    exit(1);
  }

  return currentStateName;
}

/// @brief Set the name of the current state.
void Config::setCurrentStateName(const std::string & name)
{
  this->currentStateName = name;
}

/// @brief Action history of the current state, created on first access.
LimitedStack<std::string> & Config::getCurrentStateHistory()
{
  return getStateHistory(getCurrentStateName());
}

/// @brief Action history of the given state, created on first access.
LimitedStack<std::string> & Config::getStateHistory(const std::string & state)
{
  auto it = actionHistory.find(state);
  if (it == actionHistory.end())
    it = actionHistory.emplace(state, HISTORY_SIZE).first;

  return it->second;
}

/// @brief Entropy history of the current state, created on first access.
LimitedStack<float> & Config::getCurrentStateEntropyHistory()
{
  const std::string & state = getCurrentStateName();

  auto it = entropyHistory.find(state);
  if (it == entropyHistory.end())
    it = entropyHistory.emplace(state, HISTORY_SIZE).first;

  return it->second;
}

/// @brief Shuffle the order of the sentences stored in inputContent.
// NOTE(review): std::random_shuffle was removed in C++17. It is kept as is
// because switching to std::shuffle would change how the shuffle is seeded.
void Config::shuffle()
{
  std::random_shuffle(inputContent.begin(), inputContent.end());
}

/// @brief Element of the stack at `index` from the top; index -1 is the last popped element.
int Config::stackGetElem(int index) const
{
  if (index == -1)
    return stackHistory;

  if (index < 0 || index >= (int)stack.size())
  {
    fprintf(stderr, "ERROR (%s) : requested element index \'%d\' in the stack. Aborting.\n", ERRINFO, index);
    exit(1);
  }

  return stack[stack.size()-1-index];
}

/// @brief Tell whether stackGetElem(index) is a valid request.
bool Config::stackHasIndex(int index) const
{
  return index == -1 || (index >= 0 && index < (int)stack.size());
}

/// @brief Tell whether the stack holds no element.
bool Config::stackEmpty() const
{
  return !stackHasIndex(0);
}

/// @brief Pop the top of the stack and remember it in stackHistory; exits on an empty stack.
void Config::stackPop()
{
  if (stack.empty())
  {
    fprintf(stderr, "ERROR (%s) : Popping empty stack. Aborting.\n", ERRINFO);
    exit(1);
  }

  stackHistory = stack.back();
  stack.pop_back();
}

/// @brief Push an element on top of the stack.
void Config::stackPush(int elem)
{
  stack.push_back(elem);
}

/// @brief Top element of the stack; exits on an empty stack.
int Config::stackTop()
{
  if (stack.empty())
  {
    fprintf(stderr, "ERROR (%s) : Requesting back element of empty stack. Aborting.\n", ERRINFO);
    exit(1);
  }

  return stack.back();
}

/// @brief Number of elements in the stack.
int Config::stackSize() const
{
  return stack.size();
}

/// @brief Not supported : prints an error and exits.
void Config::loadFromFile(File &)
{
  fprintf(stderr, "ERROR (%s) : not supported. Aborting.\n", ERRINFO);
  exit(1);
}

/// @brief Push an entropy value onto the entropy history of the current state.
void Config::addToEntropyHistory(float entropy)
{
  getCurrentStateEntropyHistory().push(entropy);
}

/// @brief Hash of the configuration, built from the current state name and the head position.
std::size_t Config::computeHash()
{
  static std::hash<std::string> strhasher;
  static std::hash<int> inthasher;
  std::size_t result = 0;

  result ^= (strhasher(currentStateName)*0x9e3779b9+(result << 6)+(result >>2));
  result ^= (inthasher(getHead())*0x9e3779b9+(result << 6)+(result >>2));

  return result;
}

/// @brief Push the current hash onto the hash history.
void Config::addHashToHistory()
{
  hashHistory.push(computeHash());
}

/// @brief Dictionary of the BD line of the given index.
Dict * Config::getDictOfLine(int num)
{
  return bd.getDictOfLine(num);
}

/// @brief Dictionary of the BD line of the given name.
Dict * Config::getDictOfLine(const std::string & name)
{
  return bd.getDictOfLine(name);
}

/// @brief Position of the head of the configuration.
int Config::getHead() const
{
  return head;
}

/// @brief Name of the tape.
const std::string & Config::Tape::getName()
{
  return name;
}

/// @brief Move the head of the tape by `mvt` cells.
void Config::Tape::moveHead(int mvt)
{
  head += mvt;
}

/// @brief True once the input is fully read and the first tape's head or the raw-input head reached the end.
bool Config::endOfTapes() const
{
  return inputAllRead && (tapes[0].headIsAtEnd() || rawInputHeadIndex >= (int)rawInput.size());
}

/// @brief Tell whether the head is on (or past) the last reference index.
bool Config::Tape::headIsAtEnd() const
{
  return head >= ref.getLastIndex();
}

/// @brief Size of the tape; same as refSize().
int Config::Tape::size()
{
  return refSize();
}

/// @brief Data size reported by the reference storage.
int Config::Tape::dataSize()
{
  return ref.getDataSize();
}

/// @brief Last reference index plus one.
int Config::Tape::refSize()
{
  return ref.getLastIndex()+1;
}

/// @brief Last hypothesis index plus one.
int Config::Tape::hypSize()
{
  return hyp.getLastIndex()+1;
}

/// @brief Append a hypothesis value, tagged with the current total entropy.
void Config::Tape::addToHyp(const std::string & elem)
{
  hyp.push(std::pair<std::string, float>(elem,totalEntropy));
}

/// @brief Append a reference value.
void Config::Tape::addToRef(const std::string & elem)
{
  ref.push(elem);
}

/// @brief Empty the tape and reset its head.
void Config::Tape::clear()
{
  head = 0;
  ref.clear();
  hyp.clear();
}

/// @brief Copy the cells [from, to) range arguments of another tape, for both ref and hyp.
// NOTE(review): the exact bounds semantics are those of the storage's copy() — confirm there.
void Config::Tape::copyPart(Tape & other, unsigned int from, unsigned int to)
{
  ref.copy(other.ref, from, to);
  hyp.copy(other.hyp, from, to);
}

/// @brief Clear the data of both storages before a copy.
void Config::Tape::clearDataForCopy()
{
  ref.clearData();
  hyp.clearData();
}

/// @brief Set the stream used by printTheRest().
void Config::setOutputFile(FILE * outputFile)
{
  this->outputFile = outputFile;
}

/// @brief Forward to the reference storage's getNextOverridenDataIndex().
int Config::Tape::getNextOverridenDataIndex()
{
  return ref.getNextOverridenDataIndex();
}

/// @brief Forward to the reference storage's getNextOverridenRealIndex().
int Config::Tape::getNextOverridenRealIndex()
{
  return ref.getNextOverridenRealIndex();
}

/// @brief Print every cell that has not been printed yet to the output file.
///
/// Cells after lastIndexPrinted are printed first, then the cells before it.
/// @param forceRef When true the reference values are printed.
void Config::printTheRest(bool forceRef)
{
  if (!outputFile)
    return;

  updateIdsInSequence();
  setGovsAsUD(forceRef);

  int tapeSize = tapes[0].size();
  int goalPrintIndex = lastIndexPrinted;

  // Equivalent to the former expression
  // tapeSize - ((dataSize - (goalPrintIndex+1)) + goalPrintIndex).
  int realIndex = tapeSize - (tapes[0].dataSize() - 1);

  for (int i = goalPrintIndex+1; i < tapes[0].dataSize(); i++)
  {
    printAsOutput(outputFile, i, realIndex, forceRef);
    realIndex++;
  }
  for (int i = 0; i < goalPrintIndex; i++)
  {
    printAsOutput(outputFile, i, realIndex, forceRef);
    realIndex++;
  }
}

/// @brief Set the total entropy of the configuration and of every tape.
void Config::setEntropy(float entropy)
{
  totalEntropy = entropy;
  for (auto & tape : tapes)
    tape.setTotalEntropy(totalEntropy);
}

/// @brief Total entropy of the configuration.
float Config::getEntropy() const
{
  return totalEntropy;
}

/// @brief Add to the total entropy and propagate the new total to every tape.
void Config::addToEntropy(float entropy)
{
  totalEntropy += entropy;
  for (auto & tape : tapes)
    tape.setTotalEntropy(totalEntropy);
}

/// @brief Set the entropy that will tag the next hypothesis values written.
void Config::Tape::setTotalEntropy(float entropy)
{
  totalEntropy = entropy;
}

/// @brief Mask the reference cell at `index`.
void Config::Tape::maskIndex(int index)
{
  ref.maskIndex(index);
}

/// @brief Print, on stderr, the value of every tape at the absolute index `index`.
void Config::printColumnInfos(unsigned int index)
{
  for (auto & tape : tapes)
    fprintf(stderr, "%s\t: %s\n", tape.getName().c_str(), tape[index-getHead()].c_str());

  fprintf(stderr, "\n");
}

/// @brief Record that `action` (with its cost) was applied in `state` at the current head.
///
/// Entries are keyed by "<state>_<head>". Once more than 2000 keys are stored,
/// the ones whose head is more than 20 cells behind the current head are dropped.
void Config::addToActionsHistory(std::string & state, const std::string & action, int cost)
{
  if (actionsHistory.size() > 2000)
    for (auto it = actionsHistory.cbegin(); it != actionsHistory.cend();)
    {
      try
      {
        if (std::stoi(split(it->first, '_').back()) < head-20)
        {
          it = actionsHistory.erase(it);
          continue;
        }
      }
      catch (std::exception &)
      {
        fprintf(stderr, "ERROR (%s) : calling std::stoi on \'%s\'.aborting.\n", ERRINFO, split(it->first, '_').back().c_str());
        exit(1);
      }

      ++it;
    }

  actionsHistory[state+"_"+std::to_string(head)].emplace_back(action, cost);
}

/// @brief Actions recorded for `state` at the current head (the entry is created when missing).
std::vector< std::pair<std::string, int> > & Config::getActionsHistory(std::string & state)
{
  return actionsHistory[state+"_"+std::to_string(head)];
}

/// @brief Percentage of cells in [from, to] (absolute indexes) where hypothesis equals reference.
float Config::Tape::getScore(int from, int to)
{
  float res = 0.0;

  for (int i = from; i <= to; i++)
    if (getRef(i-head) == getHyp(i-head))
      res += 1;

  return 100.0*res / (1+to-from);
}

/// @brief Position of the head of the tape.
int Config::Tape::getHead()
{
  return head;
}

/// @brief Replace, in every tape, each hypothesis equal to `from` by `to`.
void Config::transformSymbol(const std::string & from, const std::string & to)
{
  for (auto & tape : tapes)
    for (int i = 0; i < tape.size(); i++)
      if (tape.getHyp(i-tape.getHead()) == from)
        tape.setHyp(i-tape.getHead(), to);
}

/// @brief Set the index of the last printed cell.
void Config::setLastIndexPrinted(int lastIndexPrinted)
{
  this->lastIndexPrinted = lastIndexPrinted;
}

/// @brief Convert governors from relative indexes back to the id of the governor.
///
/// Cells whose governor is not an integer, or is 0, are left untouched.
/// @param ref When true the reference values are converted, otherwise the hypothesis.
void Config::setGovsAsUD(bool ref)
{
  // Both tapes are optional (see fillTapesWithInput / moveHead) : nothing to
  // convert when one of them is missing.
  if (!hasTape("ID") || !hasTape("GOV"))
    return;

  auto & ids = getTape("ID");
  auto & govs = getTape("GOV");

  if (ref)
  {
    for (int i = 0; i < ids.refSize(); i++)
    {
      try
      {
        int relativeIndex = std::stoi(govs.getRef(i-head));
        if (relativeIndex == 0)
          continue;
        auto idOfGov = ids.getRef(i+relativeIndex-head);
        govs.setRef(i-head, idOfGov);
      }
      catch (std::exception &) {continue;}
    }
  }
  else
  {
    for (int i = 0; i < ids.hypSize(); i++)
    {
      try
      {
        int relativeIndex = std::stoi(govs.getHyp(i-head));
        if (relativeIndex == 0)
          continue;
        auto idOfGov = ids.getHyp(i+relativeIndex-head);
        govs.setHyp(i-head, idOfGov);
      }
      catch (std::exception &) {continue;}
    }
  }
}

/// @brief Renumber the ID hypothesis of the last delimited sequence at or before the head.
///
/// Plain ids become 1, 2, 3..., ids containing '.' become "<cur>.<n>" and ids
/// containing '-' become the range "<cur>-<cur+parts-1>". When no delimiter is
/// found at or before the head, the whole tape is renumbered.
void Config::updateIdsInSequence()
{
  // The ID tape is optional (see moveHead) : nothing to renumber without it.
  if (!hasTape("ID"))
    return;

  int sentenceEnd = getHead();
  auto & eos = getTape(ProgramParameters::sequenceDelimiterTape);
  auto & ids = getTape("ID");

  while (sentenceEnd >= 0 && eos[sentenceEnd-getHead()] != ProgramParameters::sequenceDelimiter)
    sentenceEnd--;

  int sentenceStart = std::max(0,sentenceEnd-1);
  while (sentenceStart >= 0 && eos[sentenceStart-getHead()] != ProgramParameters::sequenceDelimiter)
    sentenceStart--;
  sentenceStart++;

  if (sentenceEnd < 0)
  {
    sentenceStart = 0;
    sentenceEnd = eos.hypSize()-1;
  }

  int curId = 1;
  int digitIndex = 1;
  for (int i = sentenceStart; i <= sentenceEnd; i++)
  {
    const int relativeIndex = i-getHead();

    auto splited = split(ids.getRef(relativeIndex), '-');
    if (splited.size() == 1)
    {
      auto splited2 = split(ids.getRef(relativeIndex), '.');
      if (splited2.size() == 1)
      {
        ids.setHyp(relativeIndex, std::to_string(curId++));
        digitIndex = 1;
        continue;
      }

      ids.setHyp(relativeIndex, std::to_string(curId)+"."+std::to_string(digitIndex));
      digitIndex++;
      continue;
    }

    int multiWordSize = splited.size();
    ids.setHyp(relativeIndex, std::to_string(curId)+"-"+std::to_string(curId+multiWordSize-1));
    digitIndex = 1;
  }
}