Score train/dev sets with the CoNLL-U evaluation script, add forced-reference output.

Summary of what this patch does:
 * Config::printTheRest and Config::printAsOutput take a forceRef flag; when it
   is true the cells are read with getRef() instead of the tape accessor. The
   decoder passes false.
 * ProgramOutput::addLine writes to the FILE it receives instead of stdout.
 * util gains strip(), returning the first run of non-separator characters.
 * TrainInfos::computeTrainScores / computeDevScores dump the Config to
   bin/<expName>/tmpOut*.txt, run ../tools/conlluAddMissingColumns.py and
   ../scripts/conll18_ud_eval.py through popen, and read the MLAS, Tokens,
   XPOS, UFeats and Lemmas rows of the '|' separated report.
 * Trainer loops no longer test isFinal(); the dev loop reports, in debug mode,
   when no action is predicted.
 * Config tapes are sized readSize, are no longer padded with an extra cell,
   moveHead no longer calls readInput, isFinal is reordered and
   setLastIndexPrinted is added.
 * ActionBank and Oracle no longer special-case Config::endOfTapes().

diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp
index bfca7e98ec783a42b525df40afe9c7e9a77811a3..371947a8455b8d5a73ed22219506cf5e3cf879d0 100644
--- a/decoder/src/Decoder.cpp
+++ b/decoder/src/Decoder.cpp
@@ -253,7 +253,7 @@ void Decoder::decodeNoBeam()
   if (ProgramParameters::errorAnalysis)
     errors.printStats();
 
-  config.printTheRest();
+  config.printTheRest(false);
 
   if (ProgramParameters::interactive)
     fprintf(stderr, " \n");
@@ -486,7 +486,7 @@ void Decoder::decodeBeam()
   for (auto node : beam)
   {
     node->config.setOutputFile(outputFile);
-    node->config.printTheRest();
+    node->config.printTheRest(false);
   }
 
   if (ProgramParameters::interactive)
diff --git a/maca_common/include/ProgramOutput.hpp b/maca_common/include/ProgramOutput.hpp
index dca9bfeddfc6c61941ba6cbea9e829a87f2ff2de..e531d0397a3d3f092913702f23725ed82df4c24a 100644
--- a/maca_common/include/ProgramOutput.hpp
+++ b/maca_common/include/ProgramOutput.hpp
@@ -23,7 +23,7 @@ struct ProgramOutput
   public :
 
   void print(FILE * output);
-  void addLine(const std::vector< std::pair<std::string, float> > & line, unsigned int index);
+  void addLine(FILE * output, const std::vector< std::pair<std::string, float> > & line, unsigned int index);
 };
 
 #endif
diff --git a/maca_common/include/util.hpp b/maca_common/include/util.hpp
index c9033fccf22b843fd2365007557cb974b2dc884f..245a082cd22d071acd8a03037434648552d90938 100644
--- a/maca_common/include/util.hpp
+++ b/maca_common/include/util.hpp
@@ -221,6 +221,7 @@ int getEndIndexOfNthSymbol(const std::string & s, int n);
 int getEndIndexOfNthSymbolFrom(const std::string::iterator & s, const std::string::iterator & end, int n);
 unsigned int getNbSymbols(const std::string & s);
 std::string shrinkString(const std::string & base, int maxSize, const std::string token);
+std::string strip(const std::string & s);
 
 /// @brief Macro giving informations about an error.
 #define ERRINFO (getFilenameFromPath(std::string(__FILE__))+ ":l." + std::to_string(__LINE__)).c_str()
diff --git a/maca_common/src/ProgramOutput.cpp b/maca_common/src/ProgramOutput.cpp
index e9bde67e1d68c1afa92e4b7443d52635b37fed0a..bdab98c874eb63b802a42a7d49ac6462814a0466 100644
--- a/maca_common/src/ProgramOutput.cpp
+++ b/maca_common/src/ProgramOutput.cpp
@@ -13,12 +13,12 @@ void ProgramOutput::print(FILE * output)
       fprintf(output, "%s%s%s", ProgramParameters::printOutputEntropy ? ("<"+float2str(line[i].second,"%f")+">").c_str() : "", line[i].first.c_str(), i == line.size()-1 ? "\n" : "\t");
 }
 
-void ProgramOutput::addLine(const std::vector< std::pair<std::string, float> > & line, unsigned int index)
+void ProgramOutput::addLine(FILE * output, const std::vector< std::pair<std::string, float> > & line, unsigned int index)
 {
   if (!ProgramParameters::delayedOutput)
   {
     for (unsigned int i = 0; i < line.size(); i++)
-      fprintf(stdout, "%s%s", line[i].first.c_str(), i == line.size()-1 ? "\n" : "\t");
+      fprintf(output, "%s%s", line[i].first.c_str(), i == line.size()-1 ? "\n" : "\t");
     return;
   }
 
diff --git a/maca_common/src/util.cpp b/maca_common/src/util.cpp
index 6dc2c199881971ce0eece40e4c4b115a2720d21b..9048efe3291c35ddb7cfc8b53450ffffda91a27d 100644
--- a/maca_common/src/util.cpp
+++ b/maca_common/src/util.cpp
@@ -568,3 +568,23 @@ std::string shrinkString(const std::string & base, int maxSize, const std::strin
   return result;
 }
 
+/// @brief Extract the first run of non-separator characters of a string.
+///
+/// Leading characters for which isSeparator is true are skipped, then characters
+/// are copied until the next separator or the end of the string.
+///
+/// @param s The string to read.
+///
+/// @return The extracted token, empty if s contains no such character.
+std::string strip(const std::string & s)
+{
+  std::string res;
+  unsigned int i = 0; // Must start at 0 : reading it uninitialized is undefined behaviour.
+  while (i < s.size() && isSeparator(s[i]))
+    i++;
+  while (i < s.size() && !isSeparator(s[i]))
+    res.push_back(s[i++]);
+
+  return res;
+}
+
diff --git a/trainer/src/TrainInfos.cpp b/trainer/src/TrainInfos.cpp
index c866df798347b510a5b1a5e9bcac2609f5267af0..7cf5e946ad62f152f3715a33eab0759e85fe7269 100644
--- a/trainer/src/TrainInfos.cpp
+++ b/trainer/src/TrainInfos.cpp
@@ -155,18 +155,64 @@ float TrainInfos::computeScoreOnTapes(Config & c, std::vector<std::string> tapes
 
 void TrainInfos::computeTrainScores(Config & c)
 {
+  std::string name;
+  {
+    File tmpOutTrain("bin/"+ProgramParameters::expName+"/tmpOutTrain.txt", "w");
+    name = tmpOutTrain.getName();
+    c.setOutputFile(tmpOutTrain.getDescriptor());
+    c.printTheRest(false);
+    c.setOutputFile(nullptr);
+    c.setLastIndexPrinted(-1);
+  }
+
+  std::string name2;
+  {
+    File tmpOutTrain("bin/"+ProgramParameters::expName+"/tmpOutTrainRef.txt", "w");
+    name2 = tmpOutTrain.getName();
+    c.setOutputFile(tmpOutTrain.getDescriptor());
+    c.printTheRest(true);
+    c.setOutputFile(nullptr);
+    c.setLastIndexPrinted(-1);
+  }
+
+  {
+    FILE * trainInGoodConllFormat = popen(("../tools/conlluAddMissingColumns.py " + name + " data/conllu.mcd > bin/" + ProgramParameters::expName + "/tmpOutTrain.conllu").c_str(), "w");
+    pclose(trainInGoodConllFormat);
+  }
+  {
+    FILE * trainInGoodConllFormat = popen(("../tools/conlluAddMissingColumns.py " + name2 + " data/conllu.mcd > bin/" + ProgramParameters::expName + "/tmpOutTrainRef.conllu").c_str(), "w");
+    pclose(trainInGoodConllFormat);
+  }
+
+  std::map<std::string, std::string> scoresStr;
+  std::map<std::string, float> scoresFloat;
+  {
+    FILE * evalFromUD = popen(("../scripts/conll18_ud_eval.py " + std::string(" bin/") + ProgramParameters::expName + "/tmpOutTrainRef.conllu " + std::string(" bin/") + ProgramParameters::expName + "/tmpOutTrain.conllu -v").c_str(), "r");
+    char buffer[10000]; // The width in the fscanf format below must stay smaller than this size.
+    while (fscanf(evalFromUD, "%9999[^\n]\n", buffer) == 1)
+    {
+      auto splited = split(buffer, '|');
+      if (splited.size() > 3) // splited[3] is read on the next line.
+        scoresStr[strip(splited[0])] = strip(splited[3]);
+    }
+    pclose(evalFromUD);
+  }
+
+  for (auto & it : scoresStr)
+    try {scoresFloat[it.first] = std::stof(it.second);} catch(std::exception &){}
+
   for (auto & it : topologyPrinted)
   {
     if (it.first == "Parser")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"GOV", "LABEL"}, 0, c.getHead()-1));
-    else if (it.first == "Tagger")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"POS"}, 0, c.getHead()-1));
+      addTrainScore(it.first, scoresFloat["MLAS"]);
     else if (it.first == "Tokenizer")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"FORM"}, 0, c.getHead()-1));
+      addTrainScore(it.first, scoresFloat["Tokens"]);
+    else if (it.first == "Tagger")
+      addTrainScore(it.first, scoresFloat["XPOS"]);
     else if (it.first == "Morpho")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"MORPHO"}, 0, c.getHead()-1));
+      addTrainScore(it.first, scoresFloat["UFeats"]);
     else if (it.first == "Lemmatizer_Rules")
-      addTrainScore(it.first, computeScoreOnTapes(c, {"LEMMA"}, 0, c.getHead()-1));
+      addTrainScore(it.first, scoresFloat["Lemmas"]);
     else if (split(it.first, '_')[0] == "Error")
       addTrainScore(it.first, 100.0);
     else
@@ -179,20 +225,50 @@ void TrainInfos::computeTrainScores(Config & c)
 
 void TrainInfos::computeDevScores(Config & c)
 {
+  std::string name;
+  {
+    File tmpOutDev("bin/"+ProgramParameters::expName+"/tmpOutDev.txt", "w");
+    name = tmpOutDev.getName();
+    c.setOutputFile(tmpOutDev.getDescriptor());
+    c.printTheRest(false);
+    c.setOutputFile(nullptr);
+    c.setLastIndexPrinted(-1);
+  }
+
+  {
+    FILE * devInGoodConllFormat = popen(("../tools/conlluAddMissingColumns.py " + name + " data/conllu.mcd > bin/" + ProgramParameters::expName + "/tmpOutDev.conllu").c_str(), "w");
+    pclose(devInGoodConllFormat);
+  }
+
+  std::map<std::string, std::string> scoresStr;
+  std::map<std::string, float> scoresFloat;
+  {
+    FILE * evalFromUD = popen(("../scripts/conll18_ud_eval.py " + ProgramParameters::devFilename + " bin/" + ProgramParameters::expName + "/tmpOutDev.conllu -v").c_str(), "r");
+    char buffer[10000]; // The width in the fscanf format below must stay smaller than this size.
+    while (fscanf(evalFromUD, "%9999[^\n]\n", buffer) == 1)
+    {
+      auto splited = split(buffer, '|');
+      if (splited.size() > 3) // splited[3] is read on the next line.
+        scoresStr[strip(splited[0])] = strip(splited[3]);
+    }
+    pclose(evalFromUD);
+  }
+
+  for (auto & it : scoresStr)
+    try {scoresFloat[it.first] = std::stof(it.second);} catch(std::exception &){}
+
   for (auto & it : topologyPrinted)
   {
     if (it.first == "Parser")
-      addDevScore(it.first, computeScoreOnTapes(c, {"GOV", "LABEL"}, 0, c.getHead()));
-    else if (it.first == "Parser")
-      addDevScore(it.first, computeScoreOnTapes(c, {"GOV", "LABEL"}, 0, c.getHead()));
+      addDevScore(it.first, scoresFloat["MLAS"]);
     else if (it.first == "Tokenizer")
-      addDevScore(it.first, computeScoreOnTapes(c, {"FORM"}, 0, c.getHead()));
+      addDevScore(it.first, scoresFloat["Tokens"]);
     else if (it.first == "Tagger")
-      addDevScore(it.first, computeScoreOnTapes(c, {"POS"}, 0, c.getHead()));
+      addDevScore(it.first, scoresFloat["XPOS"]);
     else if (it.first == "Morpho")
-      addDevScore(it.first, computeScoreOnTapes(c, {"MORPHO"}, 0, c.getHead()));
+      addDevScore(it.first, scoresFloat["UFeats"]);
     else if (it.first == "Lemmatizer_Rules")
-      addDevScore(it.first, computeScoreOnTapes(c, {"LEMMA"}, 0, c.getHead()));
+      addDevScore(it.first, scoresFloat["Lemmas"]);
     else if (split(it.first, '_')[0] == "Error")
       addDevScore(it.first, 100.0);
     else
diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp
index b917de5c3698c88ba26842f630f236f73758c095..856617ff0aed37aabfa34df9df72464b03fca989 100644
--- a/trainer/src/Trainer.cpp
+++ b/trainer/src/Trainer.cpp
@@ -62,7 +62,7 @@ void Trainer::computeScoreOnDev()
   auto pastTime = std::chrono::high_resolution_clock::now();
   std::vector<float> entropies;
 
-  while (!devConfig->isFinal())
+  while (true)
   {
     setDebugValue();
     devConfig->setCurrentStateName(tm.getCurrentClassifier()->name);
@@ -117,7 +117,11 @@ void Trainer::computeScoreOnDev()
     }
 
     if (pAction.empty())
+    {
+      if (ProgramParameters::debug)
+        fprintf(stderr, "No action predicted\n");
       break;
+    }
 
     if (ProgramParameters::devLoss)
     {
@@ -527,7 +531,7 @@ void Trainer::train()
   while (TI.getEpoch() <= ProgramParameters::nbIter)
   {
     resetAndShuffle();
-    while (!trainConfig.isFinal())
+    while (true)
     {
       setDebugValue();
       trainConfig.setCurrentStateName(tm.getCurrentClassifier()->name);
diff --git a/transition_machine/include/Config.hpp b/transition_machine/include/Config.hpp
index 9ba2cb886a8890576dcb78cc6471fac5950145aa..b4996c3d35c4725c4061b5876919409f3edd9f43 100644
--- a/transition_machine/include/Config.hpp
+++ b/transition_machine/include/Config.hpp
@@ -234,7 +234,8 @@ class Config
   /// @param output Where to print.
   /// @param dataIndex Index of line to print.
   /// @param realIndex Index of line to print.
-  void printAsOutput(FILE * output, int dataIndex, int realIndex);
+  /// @param forceRef True to force the output to be the ref tape.
+  void printAsOutput(FILE * output, int dataIndex, int realIndex, bool forceRef);
   /// @brief Print the Config without information loss.
   ///
   /// @param output Where to print.
@@ -354,7 +355,7 @@ class Config
   /// @brief Set the output file.
   void setOutputFile(FILE * outputFile);
   /// @brief Print the cells that have not been printed.
-  void printTheRest();
+  void printTheRest(bool forceRef);
   void setEntropy(float entropy);
   float getEntropy() const;
   void addToEntropy(float entropy);
@@ -365,6 +366,7 @@ class Config
   void addToActionsHistory(std::string & state, const std::string & action, int cost);
   std::vector< std::pair<std::string, int> > & getActionsHistory(std::string & state);
   void transformSymbol(const std::string & from, const std::string & to);
+  void setLastIndexPrinted(int lastIndexPrinted);
 };
 
 #endif
diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp
index 7bb981986bd13fa9868e38c206685a0aabc248aa..5742db20145fa9d456063aba3af109491d0f3ff4 100644
--- a/transition_machine/src/ActionBank.cpp
+++ b/transition_machine/src/ActionBank.cpp
@@ -833,9 +833,6 @@ bool ActionBank::simpleBufferWriteAppliable(Config & config,
 
   int index = config.getHead() + relativeIndex;
 
-  if (config.endOfTapes())
-    return true;
-
   return !(index < 0) && index < tape.size();
 }
 
diff --git a/transition_machine/src/Config.cpp b/transition_machine/src/Config.cpp
index cd85b318962ec80a3a5bc928713d84a6fddc283b..a86b5db373ef3016104f6c69f54910678cfe1e6f 100644
--- a/transition_machine/src/Config.cpp
+++ b/transition_machine/src/Config.cpp
@@ -45,7 +45,7 @@ Config::Config(const Config & other) : bd(other.bd), hashHistory(other.hashHisto
   this->file.reset(new File(*other.file.get()));
 }
 
-Config::Tape::Tape(const std::string & name, bool isKnown) : ref(ProgramParameters::readSize*4+1, Dict::unknownValueStr), hyp(ProgramParameters::readSize*4+1, std::make_pair(Dict::unknownValueStr, 0.0))
+Config::Tape::Tape(const std::string & name, bool isKnown) : ref(ProgramParameters::readSize, Dict::unknownValueStr), hyp(ProgramParameters::readSize, std::make_pair(Dict::unknownValueStr, 0.0))
 {
   this->head = 0;
   this->name = name;
@@ -141,6 +141,8 @@ void Config::fillTapesWithInput()
           prefix = "\n";
         else if (choiceWithProbability(0.3))
           prefix = "";
+        if (rawInput.empty())
+          prefix = "";
         rawInput += prefix + std::string(word.begin()+9, word.end());
         continue;
       }
@@ -183,17 +185,12 @@ void Config::fillTapesWithInput()
   int maxTapeSize = 0;
   for(auto & tape : tapes)
     maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.refSize());
-
   for(auto & tape : tapes)
   {
-    while(tape.refSize() < maxTapeSize)
+    while (tape.refSize() < maxTapeSize)
       tape.addToRef("");
-
-    while(tape.hypSize() < maxTapeSize)
+    while (tape.hypSize() < maxTapeSize)
       tape.addToHyp("");
-
-    tape.addToRef("0");
-    tape.addToHyp("");
   }
 }
 
@@ -264,7 +261,7 @@ void Config::printAsExample(FILE *)
   exit(1);
 }
 
-void Config::printAsOutput(FILE * output, int dataIndex, int realIndex)
+void Config::printAsOutput(FILE * output, int dataIndex, int realIndex, bool forceRef)
 {
   if (dataIndex == -1 || !output)
     return;
@@ -275,10 +272,23 @@ void Config::printAsOutput(FILE * output, int dataIndex, int realIndex)
   for (unsigned int j = 0; j < tapes.size(); j++)
   {
     if(bd.mustPrintLine(j))
-      toPrint.emplace_back(tapes[j][dataIndex-head].empty() ? "_" : tapes[j][dataIndex-head].c_str(), tapes[j].getEntropy(dataIndex-head));
+    {
+      if (!forceRef)
+        toPrint.emplace_back(tapes[j][dataIndex-head].empty() ? "_" : tapes[j][dataIndex-head].c_str(), tapes[j].getEntropy(dataIndex-head));
+      else
+        toPrint.emplace_back(tapes[j].getRef(dataIndex-head).empty() ? "_" : tapes[j].getRef(dataIndex-head).c_str(), tapes[j].getEntropy(dataIndex-head));
+    }
+  }
+
+  ProgramOutput::instance.addLine(output, toPrint, realIndex);
+
+  if (!ProgramParameters::delayedOutput)
+  {
+    auto eos = forceRef ? getTape(ProgramParameters::sequenceDelimiterTape).getRef(dataIndex-head) : getTape(ProgramParameters::sequenceDelimiterTape)[dataIndex-head];
+    if (eos == ProgramParameters::sequenceDelimiter)
+      fprintf(output, "\n");
   }
 
-  ProgramOutput::instance.addLine(toPrint, realIndex);
 }
 
 void Config::moveHead(int mvt)
@@ -292,9 +302,6 @@ void Config::moveHead(int mvt)
 
     for (auto & tape : tapes)
       tape.moveHead(mvt);
-
-    if (mvt > 0 && head % ProgramParameters::readSize == 0 && head >= (3*ProgramParameters::readSize))
-      readInput();
   }
   else if (!endOfTapes())
   {
@@ -327,10 +334,10 @@ void Config::moveRawInputHead(int mvt)
 
 bool Config::isFinal()
 {
-  if (!ProgramParameters::rawInput)
-    return endOfTapes() && stack.empty();
+  if (rawInputHeadIndex > 0 && !rawInput.empty())
+    return (rawInputHeadIndex >= (int)rawInput.size());
 
-  return (rawInputHeadIndex >= (int)rawInput.size());
+  return endOfTapes() && stack.empty();
 }
 
 void Config::reset()
@@ -619,7 +626,7 @@ int Config::Tape::getNextOverridenRealIndex()
   return ref.getNextOverridenRealIndex();
 }
 
-void Config::printTheRest()
+void Config::printTheRest(bool forceRef)
 {
   if (!outputFile)
     return;
@@ -630,12 +637,12 @@ void Config::printTheRest()
   int realIndex = tapeSize - 1 - ((((tapes[0].dataSize()-(goalPrintIndex == -1 ? 0 : 0)))-(goalPrintIndex+1))+(goalPrintIndex));
   for (int i = goalPrintIndex+1; i < (tapes[0].dataSize()-(goalPrintIndex == -1 ? 1 : 0)); i++)
   {
-    printAsOutput(outputFile, i, realIndex);
+    printAsOutput(outputFile, i, realIndex, forceRef);
     realIndex++;
   }
   for (int i = 0; i < goalPrintIndex; i++)
   {
-    printAsOutput(outputFile, i, realIndex);
+    printAsOutput(outputFile, i, realIndex, forceRef);
     realIndex++;
   }
 }
@@ -726,3 +733,8 @@ void Config::transformSymbol(const std::string & from, const std::string & to)
         tape.setHyp(i-tape.getHead(), to);
 }
 
+void Config::setLastIndexPrinted(int lastIndexPrinted)
+{
+  this->lastIndexPrinted = lastIndexPrinted;
+}
+
diff --git a/transition_machine/src/Oracle.cpp b/transition_machine/src/Oracle.cpp
index 8ba8eb23fb5f5431f73e602511d14f8629ed5b25..8cd69d0fa57f741a7592d41b52e44bc2bcbda402 100644
--- a/transition_machine/src/Oracle.cpp
+++ b/transition_machine/src/Oracle.cpp
@@ -195,7 +195,7 @@ void Oracle::createDatabase()
   },
   [](Config & c, Oracle *, const std::string & action)
   {
-    return (action == "WRITE b.0 POS " + c.getTape("POS").getRef(0) || c.endOfTapes()) ? 0 : 1;
+    return action == "WRITE b.0 POS " + c.getTape("POS").getRef(0) ? 0 : 1;
   })));
 
   str2oracle.emplace("tokenizer", std::unique_ptr<Oracle>(new Oracle(
@@ -263,7 +263,7 @@ void Oracle::createDatabase()
   },
   [](Config & c, Oracle *, const std::string & action)
   {
-    return (action == "WRITE b.0 " + ProgramParameters::sequenceDelimiterTape + " " + (c.getTape(ProgramParameters::sequenceDelimiterTape).getRef(0) == std::string(ProgramParameters::sequenceDelimiter) ? std::string(ProgramParameters::sequenceDelimiter) : std::string("0")) || c.endOfTapes()) ? 0 : 1;
+    return (action == "WRITE b.0 " + ProgramParameters::sequenceDelimiterTape + " " + (c.getTape(ProgramParameters::sequenceDelimiterTape).getRef(0) == std::string(ProgramParameters::sequenceDelimiter) ? std::string(ProgramParameters::sequenceDelimiter) : std::string("0"))) ? 0 : 1;
   })));
 
   str2oracle.emplace("morpho", std::unique_ptr<Oracle>(new Oracle(
@@ -279,7 +279,7 @@ void Oracle::createDatabase()
   },
   [](Config & c, Oracle *, const std::string & action)
   {
-    return (action == "WRITE b.0 MORPHO " + c.getTape("MORPHO").getRef(0) || c.endOfTapes()) ? 0 : 1;
+    return action == "WRITE b.0 MORPHO " + c.getTape("MORPHO").getRef(0) ? 0 : 1;
   })));
 
   str2oracle.emplace("strategy_morpho", std::unique_ptr<Oracle>(new Oracle(
@@ -568,7 +568,7 @@ void Oracle::createDatabase()
     const std::string & lemma = c.getTape("LEMMA").getRef(0);
     std::string rule = getRule(toLowerCase(form), toLowerCase(lemma));
 
-    return (action == std::string("RULE LEMMA ON FORM ") + rule || c.endOfTapes()) ? 0 : 1;
+    return action == std::string("RULE LEMMA ON FORM ") + rule ? 0 : 1;
   })));
 
   str2oracle.emplace("parser", std::unique_ptr<Oracle>(new Oracle(
@@ -656,9 +656,9 @@ void Oracle::createDatabase()
       if (object[0] == "b")
       {
         if (parts[2] == "LABEL")
-          return (action == "WRITE b.0 LABEL " + c.getTape("LABEL").getRef(0) || c.endOfTapes() || c.getTape("LABEL").getRef(0) == "root") ? 0 : 1;
+          return (action == "WRITE b.0 LABEL " + c.getTape("LABEL").getRef(0) || c.getTape("LABEL").getRef(0) == "root") ? 0 : 1;
         else if (parts[2] == "GOV")
-          return (action == "WRITE b.0 GOV " + c.getTape("GOV").getRef(0) || c.endOfTapes()) ? 0 : 1;
+          return (action == ("WRITE b.0 GOV " + c.getTape("GOV").getRef(0))) ? 0 : 1;
       }
       else if (object[0] == "s")
       {