Skip to content
Snippets Groups Projects
Commit 8a517bf2 authored by Franck Dary's avatar Franck Dary
Browse files

Starting to optimize lazy read

parent 30e05206
Branches
No related tags found
No related merge requests found
......@@ -56,7 +56,7 @@ void printAdvancement(Config & config, float currentSpeed)
{
if (ProgramParameters::interactive)
{
int totalSize = config.tapes[0].hyp.size();
int totalSize = ProgramParameters::tapeSize;
int steps = config.getHead();
if (steps && (steps % 200 == 0 || totalSize-steps < 200))
fprintf(stderr, "Decode : %.2f%% speed : %s actions/s\r", 100.0*steps/totalSize, int2humanStr((int)currentSpeed).c_str());
......@@ -101,7 +101,7 @@ std::pair<float,std::string> getClassifierAction(Config & config, Classifier::We
if(!action->appliable(config) || nbValidActions-1 != index)
{
// First case the analysis is finished but without an empty stack
if (config.getHead() == (int)config.tapes[0].ref.size()-1)
if (config.endOfTapes())
{
while (!config.stackEmpty())
config.stackPop();
......
......@@ -47,6 +47,8 @@ po::options_description getOptionsDescription()
"The name of the buffer's tape that contains the delimiter token for a sequence")
("sequenceDelimiter", po::value<std::string>()->default_value("1"),
"The value of the token that act as a delimiter for sequences")
("tapeSize", po::value<int>()->default_value(100000),
"Number of lines in the input file.")
("showFeatureRepresentation", po::value<int>()->default_value(0),
"For each state of the Config, show its feature representation")
("interactive", po::value<bool>()->default_value(true),
......@@ -141,6 +143,7 @@ int main(int argc, char * argv[])
ProgramParameters::sequenceDelimiterTape = vm["sequenceDelimiterTape"].as<std::string>();
ProgramParameters::sequenceDelimiter = vm["sequenceDelimiter"].as<std::string>();
ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<int>();
ProgramParameters::tapeSize = vm["tapeSize"].as<int>();
ProgramParameters::beamSize = vm["beamSize"].as<int>();
ProgramParameters::nbChilds = vm["nbChilds"].as<int>();
ProgramParameters::optimizer = "none";
......@@ -171,8 +174,7 @@ int main(int argc, char * argv[])
TransitionMachine tapeMachine(false);
BD bd(ProgramParameters::bdFilename, ProgramParameters::mcdFilename);
Config config(bd);
config.readInput(ProgramParameters::input);
Config config(bd, ProgramParameters::input);
Decoder decoder(tapeMachine, config);
......
/// @file LimitedArray.hpp
/// @author Franck Dary
/// @version 1.0
/// @date 2019-02-08
#ifndef LIMITEDARRAY__H
#define LIMITEDARRAY__H
#include <vector>
/// @brief Fixed-capacity circular buffer addressed by ever-growing indexes.
///
/// Elements are appended with push(). Each pushed element receives a "real"
/// index (0 for the first push after construction or clear(), 1 for the
/// second, and so on). Once more elements than the capacity have been pushed,
/// the oldest ones are overwritten, so only the most recent ones can still be
/// retrieved through get().
template<typename T>
class LimitedArray
{
  private :

  /// @brief Underlying storage; its size is the capacity of this array.
  std::vector<T> data;
  /// @brief Number of elements currently stored, never greater than the capacity.
  int nbElements;
  /// @brief Position in data of the most recently pushed element, -1 if none.
  int lastElementDataIndex;
  /// @brief Real index of the most recently pushed element, -1 if none.
  int lastElementRealIndex;

  public :

  /// @brief Construct an empty array able to hold limit elements.
  ///
  /// A limit of 0 is raised to 1, otherwise push() would write out of bounds
  /// and get() would compute a modulo by zero.
  ///
  /// @param limit The maximal number of elements kept at the same time.
  LimitedArray(unsigned int limit) : data(limit > 0 ? limit : 1)
  {
    clear();
  }

  /// @brief Forget every pushed element and restart the indexes from 0.
  ///
  /// The underlying cells are not erased, only the counters are reset.
  void clear()
  {
    nbElements = 0;
    lastElementDataIndex = -1;
    lastElementRealIndex = -1;
  }

  /// @brief Append an element, overwriting the oldest one when the array is full.
  ///
  /// @param elem The element to store.
  void push(const T & elem)
  {
    const int capacity = static_cast<int>(data.size());

    if (nbElements < capacity)
      nbElements++;

    // Both indexes advance together, so the data index always equals the
    // real index modulo the capacity; get() relies on this.
    lastElementDataIndex = (lastElementDataIndex + 1) % capacity;
    lastElementRealIndex++;

    data[lastElementDataIndex] = elem;
  }

  /// @brief Access an element by its real index.
  ///
  /// No check is made that the index still designates a stored element : an
  /// index that was overwritten, or never pushed, yields whatever the matching
  /// cell currently contains.
  ///
  /// @param index The real index of the element.
  ///
  /// @return The content of the cell associated with this index.
  const T & get(unsigned int index) const
  {
    return data[index % data.size()];
  }

  /// @brief Get the real index of the most recently pushed element.
  ///
  /// @return The real index of the last element, or -1 if the array is empty.
  int getLastIndex() const
  {
    return lastElementRealIndex;
  }
};
#endif
......@@ -63,6 +63,7 @@ struct ProgramParameters
static int nbIndividuals;
static int beamSize;
static int nbChilds;
static int tapeSize;
private :
......
......@@ -57,4 +57,5 @@ int ProgramParameters::nbErrorsToShow;
int ProgramParameters::nbIndividuals;
int ProgramParameters::beamSize;
int ProgramParameters::nbChilds;
int ProgramParameters::tapeSize;
......@@ -67,6 +67,8 @@ po::options_description getOptionsDescription()
"Remove identical training examples")
("showFeatureRepresentation", po::value<int>()->default_value(0),
"For each state of the Config, show its feature representation")
("tapeSize", po::value<int>()->default_value(100000),
"Number of lines in the input file.")
("interactive", po::value<bool>()->default_value(true),
"Is the shell interactive ? Display advancement informations")
("randomEmbeddings", po::value<bool>()->default_value(false),
......@@ -204,8 +206,7 @@ void launchTraining()
TransitionMachine transitionMachine(true);
BD trainBD(ProgramParameters::bdFilename, ProgramParameters::mcdFilename);
Config trainConfig(trainBD);
trainConfig.readInput(ProgramParameters::trainFilename);
Config trainConfig(trainBD, ProgramParameters::trainFilename);
std::unique_ptr<BD> devBD;
std::unique_ptr<Config> devConfig;
......@@ -219,8 +220,7 @@ void launchTraining()
else
{
devBD.reset(new BD(ProgramParameters::bdFilename, ProgramParameters::mcdFilename));
devConfig.reset(new Config(*devBD.get()));
devConfig->readInput(ProgramParameters::devFilename);
devConfig.reset(new Config(*devBD.get(), ProgramParameters::devFilename));
trainer.reset(new Trainer(transitionMachine, trainBD, trainConfig, devBD.get(), devConfig.get()));
}
......@@ -285,6 +285,7 @@ int main(int argc, char * argv[])
ProgramParameters::dynamicEpoch = vm["epochd"].as<int>();
ProgramParameters::dynamicProbability = vm["proba"].as<float>();
ProgramParameters::showFeatureRepresentation = vm["showFeatureRepresentation"].as<int>();
ProgramParameters::tapeSize = vm["tapeSize"].as<int>();
ProgramParameters::iterationSize = vm["iterationSize"].as<int>();
std::string featureModels = vm["featureModels"].as<std::string>();
if (!featureModels.empty())
......
......@@ -10,6 +10,9 @@
#include "BD.hpp"
#include "File.hpp"
#include "LimitedStack.hpp"
#include "LimitedArray.hpp"
//TODO IMPORTANT : when a Config is copied, each copy must get its own version of the pointer to the input file, so that their reads are independent.
class Action;
......@@ -22,25 +25,67 @@ class Config
/// @brief A Tape of the multi-tapes buffer.
///
/// Each cell can contain a string.
struct Tape
class Tape
{
private :
/// @brief The name of this Tape.
std::string name;
/// @brief Whether or not the content of this Tape is known at the start of the program (it is an input).
bool isKnown;
/// @brief Content of the cells of this Tape, that was given as an input to this program.
std::vector<std::string> ref;
/// @brief Content of the cells of this Tape, which have been predicted by the program so far.
std::vector<std::string> hyp;
/// @brief The head of this Tape, an index.
int head;
/// @brief Content of the cells of this Tape, that was given as an input to this program.
LimitedArray<std::string> ref;
/// @brief Content of the cells of this Tape, which have been predicted by the program so far.
LimitedArray<std::string> hyp;
public :
/// @brief Access the value of a cell.
///
/// If isKnown is true, the vector ref will be read, otherwise the vector hyp will be read.
/// @param index Index of the cell to access.
/// @param relativeIndex Index of the cell to access, relatively to the head.
///
/// @return Value of the cell.
const std::string & operator[](int index);
const std::string & operator[](int relativeIndex);
/// @brief Access the value of a cell of the ref.
///
/// @param relativeIndex The index of the cell relatively to the head.
///
/// @return The content of the cell.
const std::string & getRef(int relativeIndex);
/// @brief Access the value of a cell of the hyp.
///
/// @param relativeIndex The index of the cell relatively to the head.
///
/// @return The content of the cell.
const std::string & gethyp(int relativeIndex);
/// @brief Return true if the head of this tape is on the last cell.
///
/// @return True if the head of this tape is on the last cell.
bool headIsAtEnd();
public :
/// @brief construct an empty tape.
Tape(const std::string & name, bool isKnown);
/// @brief Get the name of this Tape.
///
/// @return The name of this Tape.
const std::string & getName();
/// @brief Set the name of this Tape.
///
/// @param name The desired name.
void setName(const std::string & name);
/// @brief Set if the content of this tape is known or predicted.
///
/// @param known The value to set.
void setKnown(bool known);
/// @brief Move the head of this tape.
///
/// @param mvt The relative movement to apply to the head.
void moveHead(int mvt);
};
private :
......@@ -62,6 +107,10 @@ class Config
BD & bd;
/// @brief The head of this Config. An index of the multi-tapes buffer.
int head;
/// @brief The file containing the input.
std::shared_ptr<File> file;
/// @brief If the end of input was reached during reading.
bool inputAllRead;
public :
......@@ -77,7 +126,8 @@ class Config
/// @brief Construct a new Config.
///
/// @param bd The BD that describes the tapes of this Config.
Config(BD & bd);
/// @param inputFilename The name of the input file.
Config(BD & bd, const std::string inputFilename);
/// @brief Get a Tape by its name.
///
/// @param name The name of the Tape.
......@@ -90,10 +140,8 @@ class Config
///
/// @return The corresponding Tape.
Tape & getTapeByInputCol(int col);
/// @brief Read a formated input file (mcf) and use it to fill the tapes.
///
/// @param filename The name of the file containing the input.
void readInput(const std::string & filename);
/// @brief Read a part of a formated input file (mcf) and use it to fill the tapes.
void readInput();
/// @brief Print the Config for debug purposes.
///
/// @param output Where to print.
......@@ -215,6 +263,10 @@ class Config
///
/// @return The head of the multi-tapes buffer.
int getHead() const;
/// @brief Return true if the head is at the end of the tapes.
///
/// @return True if the head is at the end of the tapes.
bool endOfTapes() const;
};
#endif
......@@ -176,7 +176,7 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
auto undo = [](Config & c, Action::BasicAction &)
{c.stackPop();};
auto appliable = [](Config & c, Action::BasicAction &)
{return c.getHead() < (int)c.tapes[0].ref.size()-1;};
{return !c.endOfTapes();};
Action::BasicAction basicAction =
{Action::BasicAction::Type::Push, "", apply, undo, appliable};
......@@ -264,7 +264,7 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
{
if (c.stackEmpty())
return false;
return !c.isFinal() && c.getHead() < (int)c.tapes[0].ref.size()-1;
return !c.isFinal() && !c.endOfTapes();
};
Action::BasicAction basicAction3 =
{Action::BasicAction::Type::Pop, b1, apply3, undo3, appliable3};
......@@ -322,7 +322,7 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
};
auto appliable3 = [](Config & c, Action::BasicAction &)
{
return !c.isFinal() && c.getHead() < (int)c.tapes[0].ref.size()-1;
return !c.isFinal() && !c.endOfTapes();
};
Action::BasicAction basicAction3 =
{Action::BasicAction::Type::Push, b1, apply3, undo3, appliable3};
......@@ -360,7 +360,7 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
for (int i = c.stackSize()-1; i >= 0; i--)
{
auto s = c.stackGetElem(i);
if (govs.hyp[s].empty() || govs.hyp[s] == "0")
if (govs.getHyp(s-b0).empty() || govs.getHyp(s-b0) == "0")
{
if (rootIndex == -1)
rootIndex = s;
......@@ -394,7 +394,7 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
for (int i = c.stackSize()-1; i >= 0; i--)
{
auto s = c.stackGetElem(i);
if (govs.hyp[s] == "0")
if (govs.getHyp[s-b0] == "0")
{
simpleBufferWrite(c, "GOV", "", s-b0);
break;
......@@ -423,7 +423,7 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
for (int i = c.stackSize()-1; i >= 0; i--)
{
auto s = c.stackGetElem(i);
if (labels.hyp[s].empty())
if (labels.getHyp(s-b0).empty())
{
if (rootIndex == -1)
{
......@@ -446,7 +446,7 @@ std::vector<Action::BasicAction> ActionBank::str2sequence(const std::string & na
for (int i = c.stackSize()-1; i >= 0; i--)
{
auto s = c.stackGetElem(i);
if (labels.hyp[s] == "root")
if (labels.getHyp(s-b0) == "root")
{
simpleBufferWrite(c, "LABEL", "", s-b0);
break;
......@@ -580,7 +580,7 @@ void ActionBank::simpleBufferWrite(Config & config, const std::string & tapeName
int index = config.getHead() + relativeIndex;
tape.hyp[index] = value;
tape.setHyp(index, value);
}
bool ActionBank::simpleBufferWriteAppliable(Config & config,
......@@ -590,10 +590,10 @@ bool ActionBank::simpleBufferWriteAppliable(Config & config,
int index = config.getHead() + relativeIndex;
if (index == (int)tape.hyp.size()-1)
if (c.endOfTapes())
return true;
return (!(index < 0 || index >= (int)tape.hyp.size()));
return !(index < 0);
}
bool ActionBank::isRuleAppliable(Config & config,
......@@ -609,9 +609,9 @@ void ActionBank::writeRuleResult(Config & config, const std::string & fromTapeNa
auto & fromTape = config.getTape(fromTapeName);
auto & toTape = config.getTape(targetTapeName);
auto & from = fromTape.ref[config.getHead() + relativeIndex];
auto & from = fromTape.getRef(relativeIndex);
toTape.hyp[config.getHead() + relativeIndex] = applyRule(from, rule);
toTape.setHyp(config.getHead() + relativeIndex, applyRule(from, rule));
}
int ActionBank::getLinkLength(const Config & c, const std::string & action)
......
......@@ -4,16 +4,22 @@
#include "ProgramParameters.hpp"
#include "Action.hpp"
Config::Config(BD & bd) : bd(bd), tapes(bd.getNbLines()), hashHistory(10), pastActions(100)
Config::Config(BD & bd, const std::string inputFilename) : bd(bd), hashHistory(10), pastActions(100)
{
this->stackHistory = -1;
this->currentStateName = nullptr;
this->inputFilename = inputFilename;
head = 0;
for(unsigned int i = 0; i < tapes.size(); i++)
{
tapes[i].name = bd.getNameOfLine(i);
tapes[i].isKnown = bd.lineIsKnown(i);
inputAllRead = false;
for(int i = 0; i < bd.getNbLines(); i++)
tapes.emplace_back(bd.getNameOfLine(i), bd.lineIsKnown(i));
}
/// @brief Construct an empty tape whose head is on cell 0.
///
/// Both the ref and the hyp buffers are created with a capacity of 100 cells.
///
/// @param name The name of this Tape.
/// @param isKnown Whether the content of this Tape is given as an input.
Config::Tape::Tape(const std::string & name, bool isKnown)
  : name(name), isKnown(isKnown), head(0), ref(100), hyp(100)
{
}
Config::Tape & Config::getTape(const std::string & name)
......@@ -26,17 +32,22 @@ Config::Tape & Config::getTapeByInputCol(int col)
return tapes[bd.getLineOfInputCol(col)];
}
void Config::readInput(const std::string & filename)
void Config::readInput()
{
this->inputFilename = filename;
File file(filename, "r");
if (!file.get())
file.reset(new File(inputFilename, "r"));
FILE * fd = file.getDescriptor();
char buffer[10000];
std::vector<std::string> cols;
unsigned int usualColsSize = 0;
while(fscanf(fd, "%[^\n]\n", buffer) == 1)
int toRead = 100;
int haveRead = 0;
bool finishedFile = false;
while(fscanf(fd, "%[^\n]\n", buffer) == 1 && haveRead < toRead)
{
cols = split(buffer, '\t');
if (!usualColsSize)
......@@ -53,24 +64,32 @@ void Config::readInput(const std::string & filename)
{
auto & tape = getTapeByInputCol(i);
tape.ref.emplace_back(cols[i]);
tape.addToRef(cols[i]);
tape.addToHyp("");
}
haveRead++;
}
// Making all tapes the same size
unsigned int maxTapeSize = 0;
for(auto & tape : tapes)
maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.ref.size());
maxTapeSize = std::max<unsigned int>(maxTapeSize, tape.refSize());
for(auto & tape : tapes)
{
while(tape.ref.size() < maxTapeSize)
tape.ref.emplace_back();
while(tape.refSize() < maxTapeSize)
tape.addToRef("");
tape.hyp.resize(tape.ref.size());
while(tape.hypSize() < maxTapeSize)
tape.addToHyp("");
tape.ref.emplace_back("0");
tape.hyp.emplace_back("");
if (haveRead < toRead)
{
tape.addToRef("0");
tape.addToHyp("");
inputAllRead = true;
}
}
}
......@@ -107,7 +126,7 @@ void Config::printForDebug(FILE * output)
for(auto & tape : tapes)
{
cols[0].emplace_back(tape.name);
cols[0].emplace_back(tape.getName());
for(int i = std::max(0, head-window); i < std::min((int)tape.hyp.size(), head+window); i++)
{
......@@ -170,7 +189,7 @@ void Config::moveHead(int mvt)
head += mvt;
for (auto & tape : tapes)
tape.head += mvt;
tape.moveHead(mvt);
}
}
......@@ -199,12 +218,22 @@ void Config::reset()
readInput(inputFilename);
}
const std::string & Config::Tape::operator[](int index)
const std::string & Config::Tape::operator[](int relativeIndex)
{
if(isKnown)
return ref[index];
return getRef[relativeIndex];
return hyp[index];
return getHyp[relativeIndex];
}
/// @brief Access the value of a cell of the ref.
///
/// @param relativeIndex The index of the cell relatively to the head.
///
/// @return The content of the cell.
const std::string & Config::Tape::getRef(int relativeIndex)
{
  // ref is a LimitedArray, which exposes get() and has no operator[].
  // NOTE(review): a negative absolute index wraps around when converted to
  // unsigned; callers are presumably expected to stay within the tape.
  const int absoluteIndex = head + relativeIndex;

  return ref.get(static_cast<unsigned int>(absoluteIndex));
}
/// @brief Access the value of a cell of the hyp.
///
/// NOTE(review): other parts of this change call this accessor as getHyp;
/// the spelling should be unified together with the declaration.
///
/// @param relativeIndex The index of the cell relatively to the head.
///
/// @return The content of the cell.
const std::string & Config::Tape::gethyp(int relativeIndex)
{
  // hyp is a LimitedArray, which exposes get() and has no operator[].
  // NOTE(review): a negative absolute index wraps around when converted to
  // unsigned; callers are presumably expected to stay within the tape.
  const int absoluteIndex = head + relativeIndex;

  return hyp.get(static_cast<unsigned int>(absoluteIndex));
}
std::string & Config::getCurrentStateName()
......@@ -401,3 +430,33 @@ int Config::getHead() const
return head;
}
const std::string & Config::Tape::getName()
{
return name;
}
void Config::Tape::setName(const std::string & name)
{
this->name = name;
}
void Config::Tape::setKnown(bool known)
{
this->isKnown = known;
}
/// @brief Move the head of this Tape.
///
/// @param mvt The relative movement to add to the head (may be negative).
void Config::Tape::moveHead(int mvt)
{
  head = head + mvt;
}
/// @brief Tell whether the head is at the end of the tapes.
///
/// This is only the case once the whole input has been read (inputAllRead)
/// and the first tape reports that its head is on its last cell.
///
/// NOTE(review): this method is const while Tape::headIsAtEnd() is declared
/// non-const in this change; confirm that the call is const-correct.
///
/// @return True if the head is at the end of the tapes.
bool Config::endOfTapes() const
{
  // While some input remains unread, the end cannot have been reached.
  if (!inputAllRead)
    return false;

  return tapes[0].headIsAtEnd();
}
/// @brief Tell whether the head of this Tape is on the last cell.
///
/// The last cell is the one designated by the last index of ref.
///
/// @return True if the head of this Tape is on the last cell.
bool Config::Tape::headIsAtEnd()
{
  const int lastCellIndex = ref.getLastIndex();

  return lastCellIndex == head;
}
......@@ -419,11 +419,11 @@ FeatureModel::FeatureValue FeatureBank::aggregateBuffer(Config & c, int from, in
for (auto & tape : c.tapes)
{
Dict * dict = c.getDictOfLine(tape.name);
Dict * dict = c.getDictOfLine(tape.getName());
auto policy = dictPolicy2FeaturePolicy(dict->policy);
bool ignored = false;
for (auto & except : exceptions)
if (except == tape.name)
if (except == tape.getName())
{
ignored = true;
break;
......@@ -434,7 +434,7 @@ FeatureModel::FeatureValue FeatureBank::aggregateBuffer(Config & c, int from, in
for (int i = from; i <= to; i++)
{
int index = c.getHead() + i;
std::string featName = "b."+std::to_string(i)+"."+tape.name;
std::string featName = "b."+std::to_string(i)+"."+tape.getName();
if(index < 0 || index >= (int)tape.hyp.size())
{
result.dicts.emplace_back(dict);
......@@ -468,11 +468,11 @@ FeatureModel::FeatureValue FeatureBank::aggregateStack(Config & c, int from, con
for (auto & tape : c.tapes)
{
Dict * dict = c.getDictOfLine(tape.name);
Dict * dict = c.getDictOfLine(tape.getName());
auto policy = dictPolicy2FeaturePolicy(dict->policy);
bool ignored = false;
for (auto & except : exceptions)
if (except == tape.name)
if (except == tape.getName())
{
ignored = true;
break;
......@@ -482,7 +482,7 @@ FeatureModel::FeatureValue FeatureBank::aggregateStack(Config & c, int from, con
for (int i = 0; i >= from; i--)
{
std::string featName = "s."+std::to_string(i)+"."+tape.name;
std::string featName = "s."+std::to_string(i)+"."+tape.getName();
if(!c.stackHasIndex(i))
{
result.dicts.emplace_back(dict);
......
......@@ -120,16 +120,16 @@ void Oracle::createDatabase()
auto & pos = c.getTape("POS");
if (c.getHead() > 0 && pos[c.getHead()-1] != pos.ref[c.getHead()-1] && pos[c.getHead()-1] == "det" && pos[c.getHead()] == "prorel")
if (c.getHead() > 0 && pos[-1] != pos.getRef(-1) && pos[-1] == "det" && pos[0] == "prorel")
return std::string("BACK 1");
if (c.getHead() > 0 && pos[c.getHead()-1] != pos.ref[c.getHead()-1] && pos[c.getHead()-1] == "det" && pos[c.getHead()] == "prep")
if (c.getHead() > 0 && pos[-1] != pos.getRef(-1) && pos[-1] == "det" && pos[0] == "prep")
return std::string("BACK 1");
if (c.getHead() > 0 && pos[c.getHead()-1] != pos.ref[c.getHead()-1] && pos[c.getHead()-1] == "nc" && pos[c.getHead()] == "nc")
if (c.getHead() > 0 && pos[-1] != pos.getRef(-1) && pos[-1] == "nc" && pos[0] == "nc")
return std::string("BACK 1");
if (c.getHead() > 0 && pos[c.getHead()-1] != pos.ref[c.getHead()-1] && pos[c.getHead()-1] == "nc" && pos[c.getHead()] == "prep")
if (c.getHead() > 0 && pos[-1] != pos.getRef(-1) && pos[-1] == "nc" && pos[0] == "prep")
return std::string("BACK 1");
return std::string("EPSILON");
......@@ -168,9 +168,9 @@ void Oracle::createDatabase()
if (c.getHead() <= 0)
return std::string("EPSILON");
auto & morphoRef = morpho.ref[c.getHead()-1];
auto & morpho0 = morpho[c.getHead()-1];
auto & morpho1 = morpho[c.getHead()];
auto & morphoRef = morpho.getRef(-1);
auto & morpho0 = morpho[-1];
auto & morpho1 = morpho[0];
if (morpho0 == morphoRef)
return std::string("EPSILON");
......@@ -204,7 +204,7 @@ void Oracle::createDatabase()
},
[](Config & c, Oracle *, const std::string & action)
{
return action == "WRITE b.0 POS " + c.getTape("POS").ref[c.getHead()] || c.getHead() >= (int)c.tapes[0].ref.size()-1 ? 0 : 1;
return action == "WRITE b.0 POS " + c.getTape("POS").getRef[0] || c.endOfTapes() ? 0 : 1;
})));
str2oracle.emplace("tokenizer", std::unique_ptr<Oracle>(new Oracle(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment