From 0daac76c2ca564d507a8e32b8bf6f9169fe8dd22 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Tue, 24 Sep 2019 16:26:05 +0200 Subject: [PATCH] UD output for tokenizer and tagger --- decoder/src/Decoder.cpp | 15 +- trainer/src/TrainInfos.cpp | 10 +- trainer/src/Trainer.cpp | 21 ++- transition_machine/include/Config.hpp | 16 +-- transition_machine/src/ActionBank.cpp | 2 + transition_machine/src/BD.cpp | 11 -- transition_machine/src/Classifier.cpp | 6 +- transition_machine/src/Config.cpp | 190 ++++++++++++-------------- 8 files changed, 131 insertions(+), 140 deletions(-) diff --git a/decoder/src/Decoder.cpp b/decoder/src/Decoder.cpp index fec7ae7..bfca7e9 100644 --- a/decoder/src/Decoder.cpp +++ b/decoder/src/Decoder.cpp @@ -64,13 +64,13 @@ void printAdvancement(Config & config, float currentSpeed, int nbActionsCutoff) { int totalSize = ProgramParameters::tapeSize; int steps = config.getHead(); - if (steps && (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff)) + if (ProgramParameters::rawInput) { - if (ProgramParameters::rawInput) - fprintf(stderr, "Decode : %.2f%% speed : %s actions/s\r", 100.0*config.rawInputHeadIndex/config.rawInput.size(), int2humanStr((int)currentSpeed).c_str()); - else - fprintf(stderr, "Decode : %.2f%% speed : %s actions/s\r", 100.0*steps/totalSize, int2humanStr((int)currentSpeed).c_str()); + totalSize = config.rawInput.size(); + steps = config.rawInputHeadIndex; } + if (steps && (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff)) + fprintf(stderr, "Decode : %.2f%% speed : %s actions/s\r", 100.0*config.rawInputHeadIndex/config.rawInput.size(), int2humanStr((int)currentSpeed).c_str()); } } @@ -183,12 +183,13 @@ void computeAndRecordEntropy(Config & config, Classifier::WeightedActions & weig void applyActionAndTakeTransition(TransitionMachine & tm, const std::string & actionName, Config & config) { - if (ProgramParameters::debug) - fprintf(stderr, "Applying action=<%s>\n", actionName.c_str()); + Action * action = tm.getCurrentClassifier()->getAction(actionName); TransitionMachine::Transition * transition = tm.getTransition(actionName); action->setInfos(tm.getCurrentClassifier()->name); config.addToActionsHistory(tm.getCurrentClassifier()->name, actionName, 0); + if (ProgramParameters::debug) + fprintf(stderr, "Applying action=<%s>\n", action->name.c_str()); action->apply(config); tm.takeTransition(transition); } diff --git a/trainer/src/TrainInfos.cpp b/trainer/src/TrainInfos.cpp index 8677127..c866df7 100644 --- a/trainer/src/TrainInfos.cpp +++ b/trainer/src/TrainInfos.cpp @@ -158,15 +158,15 @@ void TrainInfos::computeTrainScores(Config & c) for (auto & it : topologyPrinted) { if (it.first == "Parser") - addTrainScore(it.first, computeScoreOnTapes(c, {"GOV", "LABEL"}, 0, c.getHead())); + addTrainScore(it.first, computeScoreOnTapes(c, {"GOV", "LABEL"}, 0, c.getHead()-1)); else if (it.first == "Tagger") - addTrainScore(it.first, computeScoreOnTapes(c, {"POS"}, 0, c.getHead())); + addTrainScore(it.first, computeScoreOnTapes(c, {"POS"}, 0, c.getHead()-1)); else if (it.first == "Tokenizer") - addTrainScore(it.first, computeScoreOnTapes(c, {"FORM"}, 0, c.getHead())); + addTrainScore(it.first, computeScoreOnTapes(c, {"FORM"}, 0, c.getHead()-1)); else if (it.first == "Morpho") - addTrainScore(it.first, computeScoreOnTapes(c, {"MORPHO"}, 0, c.getHead())); + addTrainScore(it.first, computeScoreOnTapes(c, {"MORPHO"}, 0, c.getHead()-1)); else if (it.first == "Lemmatizer_Rules") - addTrainScore(it.first, computeScoreOnTapes(c, {"LEMMA"}, 0, c.getHead())); + addTrainScore(it.first, computeScoreOnTapes(c, {"LEMMA"}, 0, c.getHead()-1)); else if (split(it.first, '_')[0] == "Error") addTrainScore(it.first, 100.0); else diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp index dd9fb08..b917de5 100644 --- a/trainer/src/Trainer.cpp +++ b/trainer/src/Trainer.cpp @@ -87,6 +87,11 @@ void Trainer::computeScoreOnDev() { int totalSize = ProgramParameters::devTapeSize; int steps = devConfig->getHead(); + if (devConfig->rawInputHeadIndex > 0) + { + totalSize = devConfig->rawInput.size(); + steps = devConfig->rawInputHeadIndex; + } if (steps && (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff)) { fprintf(stderr, " \r"); @@ -197,7 +202,7 @@ void Trainer::resetAndShuffle() trainConfig.reset(); if(ProgramParameters::shuffleExamples) - trainConfig.shuffle(ProgramParameters::sequenceDelimiterTape, ProgramParameters::sequenceDelimiter); + trainConfig.shuffle(); } void Trainer::doStepNoTrain() @@ -233,6 +238,11 @@ void Trainer::doStepTrain() { int totalSize = ProgramParameters::iterationSize == -1 ? ProgramParameters::tapeSize : ProgramParameters::iterationSize; int steps = ProgramParameters::iterationSize == -1 ? trainConfig.getHead() : nbSteps; + if (trainConfig.rawInputHeadIndex > 0) + { + totalSize = trainConfig.rawInput.size(); + steps = trainConfig.rawInputHeadIndex; + } if (steps % nbActionsCutoff == 0 || totalSize-steps < nbActionsCutoff) { fprintf(stderr, " \r"); @@ -270,7 +280,14 @@ void Trainer::doStepTrain() } if (oAction.empty()) + { oAction = tm.getCurrentClassifier()->getDefaultAction(); + if(!tm.getCurrentClassifier()->getAction(oAction)->appliable(trainConfig)) + oAction.clear(); + } + + if (oAction.empty()) + oAction = pAction; if (oAction.empty()) { @@ -544,6 +561,8 @@ void Trainer::train() void Trainer::printScoresAndSave(FILE * output) { + trainConfig.transformSymbol("", "_"); + devConfig->transformSymbol("", "_"); TI.computeTrainScores(trainConfig); computeScoreOnDev(); TI.computeMustSaves(); diff --git a/transition_machine/include/Config.hpp b/transition_machine/include/Config.hpp index f988f2b..9ba2cb8 100644 --- a/transition_machine/include/Config.hpp +++ b/transition_machine/include/Config.hpp @@ -73,6 +73,7 @@ class Config /// @param relativeIndex The index of the cell relatively to the head. /// @param elem The new content of the cell. void setHyp(int relativeIndex, const std::string & elem); + int getHead(); /// @brief Return true if the head of this tape is on the last cell. /// /// @return True if the head of this tape is on the last cell. @@ -189,6 +190,8 @@ class Config int rawInputHeadIndex; /// @brief Index of current word in the sentence, as in conll format. int currentWordIndex; + /// @brief The conll input as it was read. + std::vector< std::vector<std::string> > inputContent; public : @@ -221,6 +224,7 @@ class Config Tape & getTapeByInputCol(int col); /// @brief Read a part of a formated input file (mcf) and use it to fill the tapes. void readInput(); + void fillTapesWithInput(); /// @brief Print the Config for debug purposes. /// /// @param output Where to print. @@ -274,13 +278,8 @@ class Config /// /// @return The history of entropies of the current state in the TransitionMachine. LimitedStack<float> & getCurrentStateEntropyHistory(); - /// @brief Shuffle the segments of the Config. - /// - /// For instance if you call shuffle("EOS", "1");\n - /// Sentences will be preserved, but their order will be shuffled. - /// @param delimiterTape The tape containing the delimiters of segments. - /// @param delimiter The delimiters of segments. - void shuffle(const std::string & delimiterTape, const std::string & delimiter); + /// @brief Shuffle the Config per sequences. + void shuffle(); /// @brief Get element from the stack at depth index. /// /// @param index The depth of the requested element. @@ -352,8 +351,6 @@ class Config /// /// @return True if the head is at the end of the tapes. bool endOfTapes() const; - /// @brief Update rawInput according to the tape TEXT. - void updateRawInput(); /// @brief Set the output file. void setOutputFile(FILE * outputFile); /// @brief Print the cells that have not been printed. @@ -367,6 +364,7 @@ class Config void printColumnInfos(unsigned int index); void addToActionsHistory(std::string & state, const std::string & action, int cost); std::vector< std::pair<std::string, int> > & getActionsHistory(std::string & state); + void transformSymbol(const std::string & from, const std::string & to); }; #endif diff --git a/transition_machine/src/ActionBank.cpp b/transition_machine/src/ActionBank.cpp index 26d167e..7bb9819 100644 --- a/transition_machine/src/ActionBank.cpp +++ b/transition_machine/src/ActionBank.cpp @@ -321,6 +321,8 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na } else if(std::string(b1) == "ADDCHARTOWORD") { + sequence.emplace_back(increaseTapesIfNeeded(0)); + auto apply = [](Config & c, Action::BasicAction &) {addCharToBuffer(c, "FORM", 0);}; auto undo = [](Config & c, Action::BasicAction &) diff --git a/transition_machine/src/BD.cpp b/transition_machine/src/BD.cpp index 957c01a..b9eb9c5 100644 --- a/transition_machine/src/BD.cpp +++ b/transition_machine/src/BD.cpp @@ -40,17 +40,6 @@ BD::BD(const std::string & BDfilename, const std::string & MCDfilename) exit(1); } - if(mcdCol2Str.find(col) != mcdCol2Str.end()) - { - fprintf(stderr, "ERROR (%s) : MCD column \'%d\' already exists. Aborting.\n", ERRINFO, col); - exit(1); - } - if(mcdStr2Col.find(name) != mcdStr2Col.end()) - { - fprintf(stderr, "ERROR (%s) : MCD column \'%s\' already exists. Aborting.\n", ERRINFO, name); - exit(1); - } - mcdCol2Str[col] = name; mcdStr2Col[name] = col; } diff --git a/transition_machine/src/Classifier.cpp b/transition_machine/src/Classifier.cpp index f68f1cc..25b16a6 100644 --- a/transition_machine/src/Classifier.cpp +++ b/transition_machine/src/Classifier.cpp @@ -279,14 +279,16 @@ std::vector<std::string> Classifier::getZeroCostActions(Config & config) result.emplace_back(a.name); if (result.empty() && as->hasDefaultAction) - result.emplace_back(as->getDefaultAction()->name); + if (as->getDefaultAction()->appliable(config)) + result.emplace_back(as->getDefaultAction()->name); return result; } std::string Classifier::getDefaultAction() const { - return as->getDefaultAction()->name; + if (as->hasDefaultAction) + return as->getDefaultAction()->name; return std::string(); } diff --git a/transition_machine/src/Config.cpp b/transition_machine/src/Config.cpp index 2871956..cd85b31 100644 --- a/transition_machine/src/Config.cpp +++ b/transition_machine/src/Config.cpp @@ -20,6 +20,7 @@ Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(H for(int i = 0; i < bd.getNbLines(); i++) tapes.emplace_back(bd.getNameOfLine(i), bd.lineIsKnown(i)); this->totalEntropy = 0; + readInput(); } Config::Config(const Config & other) : bd(other.bd), hashHistory(other.hashHistory), pastActions(other.pastActions) @@ -94,46 +95,88 @@ void Config::readInput() FILE * fd = file->getDescriptor(); char buffer[100000]; - std::vector<std::string> cols; - unsigned int usualColsSize = 0; - int toRead = ProgramParameters::readSize; - int haveRead = 0; + int lineIndex = 0; - while(haveRead < toRead && fscanf(fd, "%[^\n]\n", buffer) == 1) + while (fscanf(fd, "%[^\n]\n", buffer) == 1) { + lineIndex++; + if (!utf8::is_valid(buffer, buffer+std::strlen(buffer))) { - fprintf(stderr, "ERROR (%s) : input (%s) line %d is not toally utf-8 formated. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size()); + fprintf(stderr, "ERROR (%s) : input (%s) line %d is not toally utf-8 formated. Aborting.\n", ERRINFO, inputFilename.c_str(), lineIndex); exit(1); } - cols = split(buffer, '\t'); - if (!usualColsSize) - usualColsSize = cols.size(); + if (std::strlen(buffer) <= 3) + continue; - if (cols.size() != usualColsSize) - { - fprintf(stderr, "ERROR (%s) : input (%s) line %d has %lu columns instead of %u. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), cols.size(), usualColsSize); - exit(1); - } + if (split(buffer, '=')[0] == "# sent_id ") + inputContent.emplace_back(); + else if (buffer[0] == '#' && split(buffer, '=')[0] != "# text ") + continue; + + inputContent.back().emplace_back(buffer); + } - printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex()); + inputAllRead = true; + fillTapesWithInput(); +} + +void Config::fillTapesWithInput() +{ + rawInput = ""; + std::vector<std::string> cols; + unsigned int usualColsSize = 0; - for(unsigned int i = 0; i < cols.size(); i++) - if(bd.hasLineOfInputCol(i)) + for (auto & sentence : inputContent) + { + for (unsigned int wordIndex = 0; wordIndex < sentence.size(); wordIndex++) + { + auto & word = sentence[wordIndex]; + if (split(word, '=')[0] == "# text ") { - auto & tape = getTapeByInputCol(i); + std::string prefix = rawInput.empty() ? "" : " "; + if (choiceWithProbability(0.3)) + prefix = "\n"; + else if (choiceWithProbability(0.3)) + prefix = ""; + rawInput += prefix + std::string(word.begin()+9, word.end()); + continue; + } + else if (word[0] == '#') + continue; - tape.addToRef(cols[i]); - tape.addToHyp(""); + cols = split(word, '\t'); + if (!usualColsSize) + usualColsSize = cols.size(); - if (tape.getName() == ProgramParameters::tapeToMask) - if (choiceWithProbability(ProgramParameters::maskRate)) - tape.maskIndex(tape.refSize()-1); + if (cols.size() != usualColsSize) + { + fprintf(stderr, "ERROR (%s) : input (%s) line %d has %lu columns instead of %u. Aborting.\n", ERRINFO, inputFilename.c_str(), tapes[0].size(), cols.size(), usualColsSize); + exit(1); } - haveRead++; + for(unsigned int i = 0; i < cols.size(); i++) + if(bd.hasLineOfInputCol(i)) + { + auto & tape = getTapeByInputCol(i); + + tape.addToRef(cols[i]); + tape.addToHyp(""); + + if (tape.getName() == ProgramParameters::tapeToMask) + if (choiceWithProbability(ProgramParameters::maskRate)) + tape.maskIndex(tape.refSize()-1); + if (tape.getName() == ProgramParameters::sequenceDelimiterTape) + { + fprintf(stderr, "ERROR (%s) : Tape \'%s\' must not be given as a column in the input since it's the sequence delimiter. Aborting.\n", ERRINFO, tape.getName().c_str()); + exit(1); + } + } + getTape(ProgramParameters::sequenceDelimiterTape).addToRef(wordIndex == sentence.size()-1 ? ProgramParameters::sequenceDelimiter : "_"); + getTape(ProgramParameters::sequenceDelimiterTape).addToHyp(""); + } } // Making all tapes the same size @@ -141,12 +184,6 @@ void Config::readInput() for(auto & tape : tapes) maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.refSize()); - if (haveRead < toRead || tapes[0].size() == ProgramParameters::tapeSize) - { - printAsOutput(outputFile, tapes[0].getNextOverridenDataIndex(), tapes[0].getNextOverridenRealIndex()); - inputAllRead = true; - } - for(auto & tape : tapes) { while(tape.refSize() < maxTapeSize) @@ -155,15 +192,9 @@ void Config::readInput() while(tape.hypSize() < maxTapeSize) tape.addToHyp(""); - if (inputAllRead) - { - tape.addToRef("0"); - tape.addToHyp(""); - } + tape.addToRef("0"); + tape.addToHyp(""); } - - if (hasTape("TEXT")) - updateRawInput(); } void Config::printForDebug(FILE * output) @@ -252,7 +283,7 @@ void Config::printAsOutput(FILE * output, int dataIndex, int realIndex) void Config::moveHead(int mvt) { - if (head + mvt < tapes[0].size()) + if (head + mvt <= tapes[0].size()) { head += mvt; @@ -316,15 +347,10 @@ void Config::reset() stack.clear(); stackHistory = -1; - inputAllRead = false; head = 0; rawInputHead = 0; rawInputHeadIndex = 0; currentWordIndex = 1; - - file.reset(); - while (tapes[0].size() < ProgramParameters::readSize*4 && !inputAllRead) - readInput(); } const std::string & Config::Tape::operator[](int relativeIndex) @@ -398,59 +424,11 @@ LimitedStack<float> & Config::getCurrentStateEntropyHistory() return entropyHistory.find(getCurrentStateName())->second; } -void Config::shuffle(const std::string & delimiterTape, const std::string & delimiter) +void Config::shuffle() { - struct Trio{unsigned int a; unsigned int b; unsigned int c; Trio(unsigned int a, unsigned int b, unsigned int c): a(a), b(b), c(c){}}; - std::vector<Trio> delimiters; - - if (delimiterTape == "0") - { - unsigned int previousIndex = 0; - for (int i = 0; i < tapes[0].refSize(); i++) - { - delimiters.emplace_back(previousIndex, i, delimiters.size()); - previousIndex = i+1; - } - } - else - { - auto & tape = getTape(delimiterTape); - unsigned int previousIndex = 0; - for (int i = 0; i < tape.refSize(); i++) - if (tape.getRef(i-head) == delimiter) - { - delimiters.emplace_back(previousIndex, i, delimiters.size()); - previousIndex = i+1; - } - } - - if (delimiters.empty()) - { - fprintf(stderr, "WARNING (%s) : Requested to shuffle based on tape \'%s\' with \'%s\' as a delimiter, but none has been found. Aborting.\n", ERRINFO, delimiterTape.c_str(), delimiter.c_str()); - return; - } - - std::pair<unsigned int, unsigned int> suffix = {delimiters.back().b+1, tapes[0].refSize()-1}; - - std::random_shuffle(delimiters.begin(), delimiters.end()); - - auto newTapes = tapes; - - for (unsigned int tape = 0; tape < tapes.size(); tape++) - { - newTapes[tape].clearDataForCopy(); - - for (auto & delimiter : delimiters) - newTapes[tape].copyPart(tapes[tape], delimiter.a, delimiter.b+1); - - if (suffix.first <= suffix.second) - newTapes[tape].copyPart(tapes[tape], suffix.first, suffix.second+1); - } - - tapes = newTapes; - - if (!rawInput.empty()) - updateRawInput(); + reset(); + std::random_shuffle(inputContent.begin(), inputContent.end()); + fillTapesWithInput(); } int Config::stackGetElem(int index) const @@ -568,7 +546,7 @@ void Config::Tape::moveHead(int mvt) bool Config::endOfTapes() const { - return inputAllRead && tapes[0].headIsAtEnd(); + return inputAllRead && (tapes[0].headIsAtEnd() || rawInputHeadIndex >= (int)rawInput.size()); } bool Config::Tape::headIsAtEnd() const @@ -735,14 +713,16 @@ float Config::Tape::getScore(int from, int to) return 100.0*res / (1+to-from); } -void Config::updateRawInput() +int Config::Tape::getHead() { - rawInput = ""; - auto & textTape = getTape("TEXT"); - for (int i = 0; i < textTape.size(); i++) - { - if (textTape[i] != "_") - rawInput += (rawInput.empty() ? std::string("") : (choiceWithProbability(0.5) ? std::string(" ") : std::string("\n"))) + textTape[i]; - } + return head; +} + +void Config::transformSymbol(const std::string & from, const std::string & to) +{ + for (auto & tape : tapes) + for (int i = 0; i < tape.size(); i++) + if (tape.getHyp(i-tape.getHead()) == from) + tape.setHyp(i-tape.getHead(), to); } -- GitLab