Skip to content
Snippets Groups Projects
Commit ff1a5ee6 authored by Franck Dary's avatar Franck Dary
Browse files

Reworked launchBatches.py to use slurm array feature

parent 8f110f50
No related branches found
No related tags found
No related merge requests found
......@@ -7,7 +7,7 @@ import time
###############################################################################
def printUsageAndExit() :
    """Print the command-line usage string to stderr and exit with status 1.

    The diff artifact left two contradictory USAGE lines here; only the new
    one is kept, matching the reworked CLI:
    (train | eval) (gpu | cpu) batchesDescription.py nbHours jobName maxNbSimultaneousJobs
    """
    print("USAGE : %s (train | eval) (gpu | cpu) batchesDescription.py nbHours jobName maxNbSimultaneousJobs"%sys.argv[0], file=sys.stderr)
    exit(1)
###############################################################################
......@@ -18,62 +18,35 @@ def prepareExperiment(lang, template, expName) :
###############################################################################
###############################################################################
def launchTrain(device, mode, expName, arguments, launcher, nbHours, seed) :
    """Dispatch one training job to the backend selected by `launcher`.

    Unknown launcher values fall through to printUsageAndExit().
    """
    actions = {
        "bash" : lambda : launchTrainBash(mode, expName, arguments, seed),
        "oar" : lambda : launchTrainOar(device, mode, expName, arguments, nbHours, seed),
        "slurm" : lambda : launchTrainSlurm(device, mode, expName, arguments, nbHours, seed),
    }
    # Fall back to the usage message (which exits) for anything unrecognized.
    actions.get(launcher, printUsageAndExit)()
def addNamesAndCommandsTrain(names, commands, mode, expName, arguments, seed) :
    """Append the quoted job name and quoted train.sh command for one experiment.

    Both values are wrapped in literal double quotes because they are later
    spliced into a bash array inside the generated sbatch script.
    """
    quotedName = '"' + expName + '"'
    quotedCommand = '"' + "./train.sh {} bin/{} {} --silent --seed {}".format(mode, expName,arguments,seed) + '"'
    names.append(quotedName)
    commands.append(quotedCommand)
###############################################################################
###############################################################################
def launchTrainBash(mode, expName, arguments, seed) :
    """Run the training script locally as a detached background shell.

    Stdout/stderr are redirected (truncating) to <expName>.stdout / .stderr.
    The Popen handle is intentionally not waited on.
    """
    cmd = "./train.sh %s bin/%s %s --silent --seed %d"%(mode,expName,arguments,seed)
    outFile = open("%s.stdout"%expName,'w')
    errFile = open("%s.stderr"%expName,'w')
    subprocess.Popen(cmd, shell=True, stdout=outFile, stderr=errFile)
def addNamesAndCommandsDecode(names, commands, mode, expName) :
    """Append the quoted job name and quoted evaluate.sh command for one experiment.

    Mirrors addNamesAndCommandsTrain: values are double-quoted for later
    inclusion in a bash array inside the generated sbatch script.
    """
    quotedName = '"' + expName + '"'
    quotedCommand = '"' + "./evaluate.sh {} bin/{} --silent".format(mode, expName) + '"'
    names.append(quotedName)
    commands.append(quotedCommand)
###############################################################################
###############################################################################
def nbMaxLongJobs() :
    """Return the maximum number of simultaneous OAR long-queue jobs allowed."""
    maxLongJobs = 2
    return maxLongJobs
###############################################################################
###############################################################################
def launchTrainOar(device, mode, expName, arguments, nbHours, seed) :
    """Submit one training job through OAR and wait for oarsub to return.

    When the user's long-job quota is already full, the job is submitted as
    best-effort + idempotent so it can be preempted and restarted.
    """
    parts = ["oarsub"]
    # Quota full -> downgrade to a preemptible (but restartable) submission.
    if getOarNbLongJobs() >= nbMaxLongJobs() :
        parts.append(" -t besteffort")
        parts.append(" -t idempotent")
    parts.append(" -n train:%s"%expName)
    parts.append(" -E %s.stderr"%expName)
    parts.append(" -O %s.stdout"%expName)
    if device == "gpu" :
        # GPU jobs additionally exclude hosts flagged by getBestHostConstraint().
        parts.append(" -p \"gpu IS NOT NULL%s\""%getBestHostConstraint())
        parts.append(" -l walltime=%d:00:00"%nbHours)
    else :
        parts.append(" -p \"gpu IS NULL\"")
        parts.append(" -l /core=4,walltime=%d:00:00"%nbHours)
    parts.append(" \'" + "./train.sh %s bin/%s %s --silent --seed %d"%(mode,expName,arguments,seed) + "\'")
    subprocess.Popen("".join(parts), shell=True).wait()
###############################################################################
###############################################################################
# NOTE(review): this span is a scraped diff view that interleaves the OLD
# launchTrainSlurm (removed by the commit) with the NEW launchArray (added).
# A stray hunk header is even embedded inside the first heredoc. The lines
# below are NOT valid Python as-is and must be disentangled against the
# actual post-commit file before any further change.
def launchTrainSlurm(device, mode, expName, arguments, nbHours, seed) :
filename = "train.{}.slurm".format(expName)
# New version: writes ONE sbatch job-array script covering every
# (name, command) pair, throttled to `limit` simultaneous tasks.
def launchArray(names, commands, mode, jobName, device, nbHours, limit) :
filename = "{}.{}.slurm".format(mode,jobName)
sFile = open(filename, "w")
# HOSTNAME selects which cluster's sbatch template is written below.
hostname = os.getenv("HOSTNAME")
commandList = " ".join(commands)
if hostname == "jean-zay1" :
print("""#! /usr/bin/env bash
#SBATCH --job-name=train:{}
#SBATCH --output={}.stdout
#SBATCH --error={}.stderr
#SBATCH --array=0-{}%{}
#SBATCH --job-name={}:{}
#SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
......@@ -87,108 +60,21 @@ module purge
module load gcc/9.1.0
module load python/3.7.5
./train.sh {} bin/{} {} --silent --seed {}
""".format(expName, expName, expName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, mode, expName, arguments, seed), file=sFile)
sFile.close()
# Second template: local LIS cluster; CPU jobs use the 'cpu' partition,
# GPU jobs add an extra --gres=gpu line via the format argument.
elif hostname == "sms.liscluster" :
print("""#! /usr/bin/env bash
#SBATCH --job-name=train:{}
#SBATCH --output={}.stdout
#SBATCH --error={}.stderr
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --hint=nomultithread
#SBATCH --partition={}
#SBATCH --exclude=sensei1,lifnode1,asfalda1
#SBATCH --time={}:00:00
module purge
names=({})
commands=({})
./train.sh {} bin/{} {} --silent --seed {}
""".format(expName, expName, expName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, mode, expName, arguments, seed), file=sFile)
sFile.close()
else :
print("ERROR : Unknown hostname \'%s\'"%hostname)
exit(1)
# Submit the generated script and wait for sbatch to return.
subprocess.Popen("sbatch {}".format(filename), shell=True).wait()
###############################################################################
###############################################################################
def launchEval(device, mode, expName, launcher, nbHours) :
    """Dispatch one evaluation job to the backend selected by `launcher`.

    Unknown launcher values fall through to printUsageAndExit().
    """
    if launcher == "bash" :
        launchEvalBash(mode, expName)
        return
    if launcher == "oar" :
        launchEvalOar(device, mode, expName, nbHours)
        return
    if launcher == "slurm" :
        launchEvalSlurm(mode, expName, nbHours)
        return
    printUsageAndExit()
###############################################################################
###############################################################################
def launchEvalBash(mode, expName) :
    """Run the evaluation script locally as a detached background shell.

    Output is APPENDED to <expName>.stdout / .stderr so it follows the logs
    of the training run with the same name.
    """
    cmd = "./evaluate.sh %s bin/%s --silent"%(mode,expName)
    outFile = open("%s.stdout"%expName,'a')
    errFile = open("%s.stderr"%expName,'a')
    subprocess.Popen(cmd, shell=True, stdout=outFile, stderr=errFile)
###############################################################################
###############################################################################
def launchEvalOar(device, mode, expName, nbHours) :
    """Submit one evaluation job through OAR and wait for oarsub to return.

    Unlike the training variant, only LONG evaluations (nbHours > 10) are
    downgraded to best-effort when the long-job quota is full.
    """
    parts = ["oarsub"]
    if getOarNbLongJobs() >= nbMaxLongJobs() and nbHours > 10 :
        parts.append(" -t besteffort")
        parts.append(" -t idempotent")
    parts.append(" -n eval:%s"%expName)
    parts.append(" -E %s.stderr"%expName)
    parts.append(" -O %s.stdout"%expName)
    if device == "gpu" :
        # GPU jobs additionally exclude hosts flagged by getBestHostConstraint().
        parts.append(" -p \"gpu IS NOT NULL%s\""%getBestHostConstraint())
        parts.append(" -l walltime=%d:00:00"%nbHours)
    else :
        parts.append(" -p \"gpu IS NULL\"")
        parts.append(" -l /core=4,walltime=%d:00:00"%nbHours)
    parts.append(" \"" + "./evaluate.sh %s bin/%s --silent"%(mode,expName) + "\"")
    subprocess.Popen("".join(parts), shell=True).wait()
###############################################################################
###############################################################################
# NOTE(review): this span is a scraped diff view that interleaves the OLD
# launchEvalSlurm (removed by the commit) with fragments of the NEW
# launchArray body (the mv/$SLURM_ARRAY_TASK_ID lines and the array-style
# "".format(len(names), limit, ...) calls). A hunk header is embedded mid-
# heredoc, and string-literal boundaries cannot be tracked reliably, so no
# inline comments are added below: the lines are preserved verbatim and
# must be disentangled against the actual post-commit file.
def launchEvalSlurm(mode, expName, nbHours) :
filename = "eval.{}.slurm".format(expName)
sFile = open(filename, "w")
hostname = os.getenv("HOSTNAME")
if hostname == "jean-zay1" :
print("""#! /usr/bin/env bash
#SBATCH --job-name=eval:{}
#SBATCH --output={}.stdout
#SBATCH --error={}.stderr
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1
#SBATCH --hint=nomultithread
#SBATCH --partition=gpu_p1
#SBATCH --time={}:00:00
module purge
module load gcc/9.1.0
module load python/3.7.5
./evaluate.sh {} bin/{} --silent
""".format(expName, expName, expName, nbHours, mode, expName), file=sFile)
mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" ${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" ${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
${{commands[$SLURM_ARRAY_TASK_ID]}}
""".format(len(names), limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile)
sFile.close()
elif hostname == "sms.liscluster" :
print("""#! /usr/bin/env bash
#SBATCH --job-name=eval:{}
#SBATCH --output={}.stdout
#SBATCH --error={}.stderr
#SBATCH --array=0-{}%{}
#SBATCH --job-name={}:{}
#SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
......@@ -199,122 +85,42 @@ module load python/3.7.5
module purge
./evaluate.sh {} bin/{} --silent
""".format(expName, expName, expName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, mode, expName), file=sFile)
names=({})
commands=({})
mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" ${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" ${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
${{commands[$SLURM_ARRAY_TASK_ID]}}
""".format(len(names), limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile)
sFile.close()
else :
print("ERROR : Unknown hostname \'%s\'"%hostname)
exit(1)
subprocess.Popen("sbatch {}".format(filename), shell=True).wait()
###############################################################################
###############################################################################
def getOarNbLongJobs() :
    """Count the current user's OAR jobs submitted on the long queue.

    Shells out to `oarstat -u` and greps for "Q=long".
    """
    pipe = subprocess.Popen('oarstat -u | grep "Q=long" | wc -l',
                            shell=True, stdout=subprocess.PIPE)
    return int(pipe.stdout.read())
###############################################################################
###############################################################################
def getOarNbGpuPerNode() :
    """Return {hostname: GPU count} for every GPU node reported by oarnodes.

    The shell pipeline emits lines of the form "<count> <hostname>"; anything
    else (blank lines, noise) is skipped.
    """
    raw = subprocess.Popen("oarnodes | grep gpunum=. | grep -o 'host=[^,]*' | cut -f2 -d= | sort | uniq -c", shell=True, stdout=subprocess.PIPE).stdout.read().decode('utf8')
    counts = {}
    for entry in raw.split('\n') :
        fields = entry.split()
        if len(fields) == 2 :
            counts[fields[1]] = int(fields[0])
    return counts
###############################################################################
###############################################################################
def getOarNbUsedGpuPerNode() :
"""Return {hostname: number of GPU resources in use} parsed from `oarstat -f`.

Relies on the pipeline emitting exactly 3 lines per GPU job
(wanted_resources, assigned_hostnames, properties) — TODO confirm this
grouping holds on the target OAR version.
"""
l = subprocess.Popen("oarstat -f | grep 'assigned_hostnames =\|propert\|wanted_resources' | grep -i 'gpu is not null' -B 2 | grep [^-]", shell=True, stdout=subprocess.PIPE).stdout.read().decode("utf8").split('\n')
res = {}
# Walk the output 3 lines at a time: resources spec, then hostname line.
for i in range(len(l)//3) :
ressources = l[3*i]
hostname = l[3*i+1].split()[-1]
# Defaults when the job does not spell out core/gpunum counts.
cores = 1
gpunum = 1
if "core=" in ressources :
# Extract the leading digit run right after "core=" by hand.
coresStr=""
coresStrBase = ressources.split("core=")[-1]
for symbol in coresStrBase :
if symbol < '0' or symbol > '9' :
break
coresStr = coresStr + symbol
cores = int(coresStr)
if "gpunum=" in ressources :
gpunum = int(ressources.split("gpunum=")[-1].split(',')[0])
if hostname not in res :
res[hostname] = 0
# NOTE(review): usage is counted as cores * gpunum — presumably each
# reserved core of a gpunum=N job pins N GPUs; verify against OAR docs.
res[hostname] += cores * gpunum
return res
###############################################################################
###############################################################################
def getOarNotAliveNodes() :
    """List network addresses of OAR nodes whose state is not 'Alive'."""
    raw = subprocess.Popen("oarnodes | grep -B 2 'state : [^A]' | grep 'network_address' | sort --unique | awk '{print $3}'", shell=True, stdout=subprocess.PIPE).stdout.read().decode("utf8")
    # Drop the empty strings left over by the trailing newline split.
    return [node for node in raw.split('\n') if len(node) > 0]
###############################################################################
###############################################################################
def getOarNbFreeGpuPerNode() :
    """Return {hostname: free GPU count}; nodes that are down count as 0."""
    free = getOarNbGpuPerNode()
    used = getOarNbUsedGpuPerNode()
    for host in free :
        free[host] -= used.get(host, 0)
    # A node that is not alive has no usable GPUs regardless of bookkeeping.
    for host in getOarNotAliveNodes() :
        free[host] = 0
    return free
###############################################################################
###############################################################################
def getBestHostConstraint() :
    """Return an extra OAR '-p' predicate excluding undesirable GPU hosts.

    The diff artifact left the superseded dynamic body (querying free GPUs
    per node) as unreachable code after the unconditional return; only the
    new constant predicate is kept.
    """
    return " and host!='lifnode1' and host!='sensei1'"
###############################################################################
###############################################################################
# NOTE(review): this entry point is a scraped diff view interleaving the OLD
# argv parsing (launcher + optional --time flag) with the NEW fixed 7-argument
# form; duplicated assignments and both length checks appear back to back.
# Lines are preserved verbatim; disentangle against the post-commit file.
if __name__ == "__main__" :
# OLD check (removed): at least 4 positional args.
if len(sys.argv) < 5 :
# NEW check: exactly 6 positional args.
if len(sys.argv) != 7 :
printUsageAndExit()
mode = sys.argv[1]
# OLD parsing (removed): launcher/device/description + optional --time.
launcher = sys.argv[2]
device = sys.argv[3]
batchesDescription = sys.argv[4]
nbHours = 92
if len(sys.argv) > 5 :
if sys.argv[5] == "--time" :
if 6 not in range(5,len(sys.argv)) :
printUsageAndExit()
nbHours = int(sys.argv[6])
else :
printUsageAndExit()
# NEW parsing: device, description, hours, job name, array concurrency limit.
device = sys.argv[2]
batchesDescription = sys.argv[3]
nbHours = sys.argv[4]
name = sys.argv[5]
limit = sys.argv[6]
if mode not in ["train","eval"] or launcher not in ["bash","oar","slurm"] or device not in ["cpu","gpu"] :
if mode not in ["train","eval"] or device not in ["cpu","gpu"] :
printUsageAndExit()
# The batches description module defines langs / templatesExperiments / repRange.
desc = __import__(os.path.splitext(batchesDescription)[0])
names = []
commands = []
for lang in desc.langs :
for xp in desc.templatesExperiments :
for i in desc.repRange :
......@@ -322,9 +128,10 @@ if __name__ == "__main__" :
xp['expName'] = xp['expName'].split('.')[0]+"."+lang+"."+str(i)
if mode == "train" :
prepareExperiment(xp['lang'],xp['template'],xp['expName'])
# OLD: launch each job immediately; NEW: accumulate and submit one array.
launchTrain(device,xp['mode'],xp['expName'],xp['arguments'],launcher,nbHours,seed=100+i)
addNamesAndCommandsTrain(names, commands, xp['mode'],xp['expName'],xp['arguments'],seed=100+i)
else :
launchEval(device,xp['mode'],xp['expName'],launcher,nbHours)
addNamesAndCommandsDecode(names, commands, xp['mode'],xp['expName'])
# NEW: one sbatch array submission covering everything collected above.
launchArray(names, commands, mode, name, device, nbHours, limit)
###############################################################################
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment