// MacaonTrain.cpp

  
 #include "MacaonTrain.hpp"
#include <filesystem>
#include "util.hpp"
#include "NeuralNetwork.hpp"
#include "WordEmbeddings.hpp"

// Short alias for the Boost.Program_options namespace used by the option-handling code below.
namespace po = boost::program_options;

/// @brief Build the description of every command-line option accepted by the trainer.
///
/// Options are declared in two groups, "Required" (`model`, `trainTSV`) and
/// "Optional" (every other option, each either a value-less flag or a value
/// with a default), which are then merged into a single description.
/// Nothing is parsed here: this only declares names, value types, defaults
/// and help texts.
///
/// @return The merged options description (required group first, then optional).
po::options_description MacaonTrain::getOptionsDescription()
{
  po::options_description desc("Command-Line Arguments ");

  // Options that must be supplied by the user.
  po::options_description req("Required");
  req.add_options()
    ("model", po::value<std::string>()->required(),
      "Directory containing the machine file to train")
    ("trainTSV", po::value<std::string>()->required(),
      "TSV file of the training corpus, in CONLLU format");

  // Flags and options that fall back to a default value when omitted.
  po::options_description opt("Optional");
  opt.add_options()
    ("debug,d", "Print debugging infos on stderr")
    ("silent", "Don't print speed and progress")
    ("devScore", "Compute score on dev instead of loss (slower)")
    ("mcd", po::value<std::string>()->default_value("ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL"),
      "Comma separated column names that describes the input/output format")
    ("trainTXT", po::value<std::string>()->default_value(""),
      "Raw text file of the training corpus")
    ("devTSV", po::value<std::string>()->default_value(""),
      "TSV file of the development corpus, in CONLLU format")
    ("devTXT", po::value<std::string>()->default_value(""),
      "Raw text file of the development corpus")
    ("nbEpochs,n", po::value<int>()->default_value(5),
      "Number of training epochs")
    ("batchSize", po::value<int>()->default_value(64),
      "Number of examples per batch")
    // Float literal: the option is stored as float, so avoid an implicit double -> float narrowing.
    ("explorationThreshold", po::value<float>()->default_value(0.1f),
      "Maximum probability difference with the best scoring transition, for a transition to be explored during dynamic extraction of dataset")
    ("machine", po::value<std::string>()->default_value(""),
      "Reading machine file content")
    ("trainStrategy", po::value<std::string>()->default_value("0,ExtractGold,ResetParameters"),
      "Description of what should happen during training")
    ("loss", po::value<std::string>()->default_value("CrossEntropy"),
      "Loss function to use during training : CrossEntropy | bce | mse | hinge")
    ("seed", po::value<int>()->default_value(100),
      "Random seed")
    ("scaleGrad", "Scale embedding's gradient with its frequency in the minibatch")
    // The largest finite float serves as the default value for maxNorm.
    ("maxNorm", po::value<float>()->default_value(std::numeric_limits<float>::max()),
      "Max norm for the embeddings")
    ("help,h", "Produce this help message");

  desc.add(req).add(opt);

  return desc;
}

po::variables_map MacaonTrain::checkOptions(po::options_description & od)
{
  po::variables_map vm;

  try {po::store(po::parse_command_line(argc, argv, od), vm);}
  catch(std::exception & e) {util::myThrow(e.what());}

  if (vm.count("help"))
  {
    std::stringstream ss;
    ss << od;
    fmt::print(stderr, "{}\n", ss.str());
    exit(0);
  }