diff --git a/common/include/util.hpp b/common/include/util.hpp
index b555e5531dc0bdc63e42e52469a006aa7d40efc2..165328ca938087fac12d88c2c310b41dab81b48b 100644
--- a/common/include/util.hpp
+++ b/common/include/util.hpp
@@ -53,6 +53,7 @@ bool isUrl(const std::string & s);
 bool isNumber(const std::string & s);
 
 std::string getTime();
+std::string getMemUsage();
 
 long float2long(float f);
 float long2float(long l);
diff --git a/common/src/util.cpp b/common/src/util.cpp
index 85aa5d38404764a9fedc33069dc0c8f1461e5d51..9a9f21a23f0d6ccc7bd298c586bd5bb6e173d8c8 100644
--- a/common/src/util.cpp
+++ b/common/src/util.cpp
@@ -2,6 +2,9 @@
 #include "utf8.hpp"
 #include <ctime>
 #include <algorithm>
+#include <iostream>
+#include <fstream>
+#include <unistd.h>
 #include "upper2lower"
 
 float util::long2float(long l)
@@ -236,6 +239,24 @@ std::string util::getTime()
   return std::string(buffer);
 }
 
+std::string util::getMemUsage()
+{
+  float vm_usage = 0.0;
+  float resident_set = 0.0;
+
+  unsigned long vsize;
+  long rss;
+  std::string ignore;
+  std::ifstream ifs("/proc/self/stat", std::ios_base::in);
+  ifs >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> vsize >> rss;
+
+  long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; // in case x86-64 is configured to use 2MB pages
+  vm_usage = vsize / 1024.0;
+  resident_set = rss * page_size_kb;
+
+  return fmt::format("Virtual:{:.2f}Go Physical:{:.2f}Go", vm_usage/1000000.0, resident_set/1000000.0);
+}
+
 bool util::choiceWithProbability(float probability)
 {
   int maxVal = 100000;
diff --git a/trainer/include/Trainer.hpp b/trainer/include/Trainer.hpp
index e285a3b3926f4442c5e9ddb06ef1c90bcea507ea..933e52e3cab892f5f36ea7111260f60f34124faf 100644
--- a/trainer/include/Trainer.hpp
+++ b/trainer/include/Trainer.hpp
@@ -55,13 +55,13 @@ class Trainer
 
   private :
 
-  void extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold);
+  void extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck);
   float processDataset(DataLoader & loader, bool train, bool printAdvancement, int nbExamples);
 
   public :
 
   Trainer(ReadingMachine & machine, int batchSize);
-  void createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold);
+  void createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck);
   void extractActionSequence(BaseConfig & config);
   void makeDataLoader(std::filesystem::path dir);
   void makeDevDataLoader(std::filesystem::path dir);
diff --git a/trainer/src/MacaonTrain.cpp b/trainer/src/MacaonTrain.cpp
index 940945180494e89df1a69ed49713eec621ffdf2f..d760d1f9e92cf46d2731ea57d47d7f04d76a2c6c 100644
--- a/trainer/src/MacaonTrain.cpp
+++ b/trainer/src/MacaonTrain.cpp
@@ -22,6 +22,7 @@ po::options_description MacaonTrain::getOptionsDescription()
   opt.add_options()
     ("debug,d", "Print debuging infos on stderr")
     ("silent", "Don't print speed and progress")
+    ("memcheck", "Regularly print memory usage on stderr")
     ("devScore", "Compute score on dev instead of loss (slower)")
     ("mcd", 
      po::value<std::string>()->default_value("ID,FORM,LEMMA,UPOS,XPOS,FEATS,HEAD,DEPREL"), "Comma separated column names that describes the input/output format")
@@ -133,6 +134,7 @@ int MacaonTrain::main()
   auto nbEpoch = variables["nbEpochs"].as<int>();
   auto batchSize = variables["batchSize"].as<int>();
   bool debug = variables.count("debug") == 0 ? false : true;
+  bool memcheck = variables.count("memcheck") == 0 ? false : true;
   bool printAdvancement = !debug && variables.count("silent") == 0 ? true : false;
   bool computeDevScore = variables.count("devScore") == 0 ? false : true;
   auto machineContent = variables["machine"].as<std::string>();
@@ -267,11 +269,11 @@ int MacaonTrain::main()
     if (trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractGold) or trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic))
     {
       machine.setDictsState(trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic) ? Dict::State::Closed : Dict::State::Open);
-      trainer.createDataset(goldConfigs, debug, modelPath/"examples/train", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold);
+      trainer.createDataset(goldConfigs, debug, modelPath/"examples/train", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold, memcheck);
       if (!computeDevScore)
       {
         machine.setDictsState(Dict::State::Closed);
-        trainer.createDataset(devGoldConfigs, debug, modelPath/"examples/dev", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold);
+        trainer.createDataset(devGoldConfigs, debug, modelPath/"examples/dev", currentEpoch, trainStrategy[currentEpoch].count(Trainer::TrainAction::ExtractDynamic), explorationThreshold, memcheck);
       }
     }
     if (trainStrategy[currentEpoch].count(Trainer::TrainAction::ResetParameters) or trainStrategy[currentEpoch].count(Trainer::TrainAction::ResetOptimizer))
@@ -392,6 +394,8 @@ int MacaonTrain::main()
       std::FILE * f = std::fopen(trainInfos.c_str(), "a");
       fmt::print(f, "{}\t{}\n", iterStr, devScoreMean);
       std::fclose(f);
+      if (memcheck)
+        fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
     }
   }
 
diff --git a/trainer/src/Trainer.cpp b/trainer/src/Trainer.cpp
index c65b72adb0b6b1f36e7c1e4c688071d332d71c04..aaef532a65c87e973603afd0a770823ca06bbc30 100644
--- a/trainer/src/Trainer.cpp
+++ b/trainer/src/Trainer.cpp
@@ -18,7 +18,7 @@ void Trainer::makeDevDataLoader(std::filesystem::path dir)
   devDataLoader = torch::data::make_data_loader(*devDataset, torch::data::DataLoaderOptions(batchSize).workers(0).max_jobs(0));
 }
 
-void Trainer::createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold)
+void Trainer::createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck)
 {
   std::vector<SubConfig> configs;
   for (auto & goldConfig : goldConfigs)
@@ -26,12 +26,12 @@ void Trainer::createDataset(std::vector<BaseConfig> & goldConfigs, bool debug, s
 
   machine.trainMode(false);
 
-  extractExamples(configs, debug, dir, epoch, dynamicOracle, explorationThreshold);
+  extractExamples(configs, debug, dir, epoch, dynamicOracle, explorationThreshold, memcheck);
 
   machine.saveDicts();
 }
 
-void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold)
+void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std::filesystem::path dir, int epoch, bool dynamicOracle, float explorationThreshold, bool memcheck)
 {
   torch::AutoGradMode useGrad(false);
 
@@ -50,10 +50,13 @@ void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std:
 
   std::atomic<int> totalNbExamples = 0;
 
+  if (memcheck)
+    fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
+
   NeuralNetworkImpl::setDevice(torch::kCPU);
   machine.to(NeuralNetworkImpl::getDevice());
   std::for_each(std::execution::par, configs.begin(), configs.end(),
-    [this, maxNbExamplesPerFile, &examplesPerState, &totalNbExamples, debug, dynamicOracle, explorationThreshold, dir, epoch, &examplesMutex](SubConfig & config)
+    [this, maxNbExamplesPerFile, &examplesPerState, &totalNbExamples, debug, memcheck, dynamicOracle, explorationThreshold, dir, epoch, &examplesMutex](SubConfig & config)
     {
       config.addPredicted(machine.getPredicted());
       config.setStrategy(machine.getStrategyDefinition());
@@ -189,7 +192,11 @@ void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std:
 
       if (config.needsUpdate())
         config.update();
+
       } // End while true
+
+      if (memcheck)
+        fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
     }); // End for on configs
 
   for (auto & it : examplesPerState)
@@ -203,6 +210,8 @@ void Trainer::extractExamples(std::vector<SubConfig> & configs, bool debug, std:
     util::myThrow(fmt::format("could not create file '{}'", currentEpochAllExtractedFile.c_str()));
   std::fclose(f);
 
+  if (memcheck)
+    fmt::print(stderr, "[{}] Memory : {}\n", util::getTime(), util::getMemUsage());
   fmt::print(stderr, "[{}] Extracted {} examples\n", util::getTime(), util::int2HumanStr(totalNbExamples));
 }
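The util::getMemUsage() helper added above uses the common Linux pattern of skipping the first 22 whitespace-separated fields of /proc/self/stat and then reading vsize (virtual size in bytes) and rss (resident set size in pages). A minimal standalone sketch of that same technique, separate from the patch and with illustrative variable names, could look like this:

// Standalone sketch (not the repository code) of the /proc/self/stat technique
// used by util::getMemUsage(): skip 22 fields, read vsize and rss. Linux-only.
#include <fstream>
#include <string>
#include <cstdio>
#include <unistd.h>

int main()
{
  std::ifstream stat("/proc/self/stat");
  std::string skip;
  for (int i = 0; i < 22; ++i) // pid, comm, state, ..., starttime
    stat >> skip;

  unsigned long vsizeBytes = 0; // field 23: virtual memory size, in bytes
  long rssPages = 0;            // field 24: resident set size, in pages
  stat >> vsizeBytes >> rssPages;

  // Query the page size at runtime, as the patch does, instead of assuming 4 KiB.
  long pageSizeKb = sysconf(_SC_PAGE_SIZE) / 1024;

  std::printf("Virtual:%.2fGB Physical:%.2fGB\n",
              vsizeBytes / 1024.0 / 1000000.0,
              rssPages * pageSizeKb / 1000000.0);
  return 0;
}

Like the patched code, this sketch assumes the comm field (the executable name in parentheses, field 2) contains no spaces; a name with spaces would shift the field offsets and misalign the read of vsize and rss.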