Commit 3c3acb33 authored by Franck Dary's avatar Franck Dary
Browse files

Added option memcheck to train

parent e496576e
......@@ -53,6 +53,7 @@ bool isUrl(const std::string & s);
bool isNumber(const std::string & s);
std::string getTime();
std::string getMemUsage();
long float2long(float f);
float long2float(long l);
......
......@@ -2,6 +2,9 @@
#include "utf8.hpp"
#include <ctime>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <unistd.h>
#include "upper2lower"
float util::long2float(long l)
......@@ -236,6 +239,24 @@ std::string util::getTime()
return std::string(buffer);
}
// Returns a human-readable summary of the current process' memory usage
// (virtual size and resident set), read from /proc/self/stat.
// Linux-specific: on read failure both figures are reported as 0 instead of
// printing uninitialized garbage.
std::string util::getMemUsage()
{
  // Zero-initialize so a failed open/read yields "0.00" rather than UB.
  unsigned long vsize = 0;
  long rss = 0;

  std::ifstream ifs("/proc/self/stat", std::ios_base::in);
  if (ifs)
  {
    // vsize and rss are the 23rd and 24th whitespace-separated fields of
    // /proc/self/stat; skip the 22 fields that precede them.
    std::string ignore;
    for (int field = 0; field < 22 && ifs; ++field)
      ifs >> ignore;
    if (!(ifs >> vsize >> rss))
    {
      vsize = 0;
      rss = 0;
    }
  }

  long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
  float vm_usage = vsize / 1024.0;         // virtual size, in KB
  float resident_set = rss * page_size_kb; // rss is in pages; convert to KB

  return fmt::format("Virtual:{:.2f}Go Physical:{:.2f}Go", vm_usage/1000000.0, resident_set/1000000.0);
}
bool util::choiceWithProbability(float probability)
{
int maxVal = 100000;
......
......@@ -55,13 +55,13 @@ class Trainer
private :
void extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold);
void extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck);
float processDataset(DataLoader & loader, bool train, bool printAdvancement, int nbExamples);
public :
Trainer(ReadingMachine & machine, int batchSize);
void createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold);
void createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck);
void extractActionSequence(BaseConfig & config);
void makeDataLoader(std::filesystem::path dir);
void makeDevDataLoader(std::filesystem::path dir);
......
......@@ -22,6 +22,7 @@ po::options_description MacaonTrain::getOptionsDescription()
opt.add_options()
("debug,d", "Print debuging infos on stderr")
("silent", "Don't print speed and progress")
("memcheck", "Regularly print memory usage on stderr")
("devScore", "Compute score on dev instead of loss (slower)")
("mcd", po::value<std::string>()->default_value("ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL"),
"Comma separated column names that describes the input/output format")
......@@ -133,6 +134,7 @@ int MacaonTrain::main()
auto nbEpoch = variables["nbEpochs"].as<int>();
auto batchSize = variables["batchSize"].as<int>();
bool debug = variables.count("debug") == 0 ? false : true;
bool memcheck = variables.count("memcheck") == 0 ? false : true;
bool printAdvancement = !debug && variables.count("silent") == 0 ? true : false;
bool computeDevScore = variables.count("devScore") == 0 ? false : true;
auto machineContent = variables["machine"].as<std::string>();
......@@ -267,11 +269,11 @@ int MacaonTrain::main()
if (trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractGold) or trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic))
{
machine.setDictsState(trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic) ? Dict::State::Closed : Dict::State::Open);
trainer.createDataset(goldConfigs, debug, modelPath/"examples/train", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold);
trainer.createDataset(goldConfigs, debug, modelPath/"examples/train", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold, memcheck);
if (!computeDevScore)
{
machine.setDictsState(Dict::State::Closed);
trainer.createDataset(devGoldConfigs, debug, modelPath/"examples/dev", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold);
trainer.createDataset(devGoldConfigs, debug, modelPath/"examples/dev", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold, memcheck);
}
}
if (trainStrategy[currentEpoch].count(Trainer::TrainAction::ResetParameters) or trainStrategy[currentEpoch].count(Trainer::TrainAction::ResetOptimizer))
......@@ -392,6 +394,8 @@ int MacaonTrain::main()
std::FILE * f = std::fopen(trainInfos.c_str(), "a");
fmt::print(f, "{}\t{}\n", iterStr, devScoreMean);
std::fclose(f);
if (memcheck)
fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
}
}
......
......@@ -18,7 +18,7 @@ void Trainer::makeDevDataLoader(std::filesystem::path dir)
devDataLoader = torch::data::make_data_loader(*devDataset, torch::data::DataLoaderOptions(batchSize).workers(0).max_jobs(0));
}
void Trainer::createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold)
void Trainer::createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck)
{
std::vector<SubConfig> configs;
for (auto & goldConfig : goldConfigs)
......@@ -26,12 +26,12 @@ void Trainer::createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, s
machine.trainMode(false);
extractExamples(configs, debug, dir, epoch, dynamicOracle, explorationThreshold);
extractExamples(configs, debug, dir, epoch, dynamicOracle, explorationThreshold, memcheck);
machine.saveDicts();
}
void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold)
void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck)
{
torch::AutoGradMode useGrad(false);
......@@ -50,10 +50,13 @@ void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std:
std::atomic<int> totalNbExamples = 0;
if (memcheck)
fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
NeuralNetworkImpl::setDevice(torch::kCPU);
machine.to(NeuralNetworkImpl::getDevice());
std::for_each(std::execution::par, configs.begin(), configs.end(),
[this, maxNbExamplesPerFile, &examplesPerState, &totalNbExamples, debug, dynamicOracle, explorationThreshold, dir, epoch, &examplesMutex](SubConfig & config)
[this, maxNbExamplesPerFile, &examplesPerState, &totalNbExamples, debug, memcheck, dynamicOracle, explorationThreshold, dir, epoch, &examplesMutex](SubConfig & config)
{
config.addPredicted(machine.getPredicted());
config.setStrategy(machine.getStrategyDefinition());
......@@ -189,7 +192,11 @@ void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std:
if (config.needsUpdate())
config.update();
} // End while true
if (memcheck)
fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
}); // End for on configs
for (auto & it : examplesPerState)
......@@ -203,6 +210,8 @@ void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std:
util::myThrow(fmt::format("could not create file '{}'", currentEpochAllExtractedFile.c_str()));
std::fclose(f);
if (memcheck)
fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
fmt::print(stderr, "[{}] Extracted {} examples\n", util::getTime(), util::int2HumanStr(totalNbExamples));
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment