From e1b7432de2951af7c5fcd13a940867f5d2c843c7 Mon Sep 17 00:00:00 2001 From: Franck Dary <franck.dary@lis-lab.fr> Date: Thu, 17 Dec 2020 14:15:29 +0100 Subject: [PATCH] New script to launch slurm array --- UD_any/launchBatches.py | 219 ++++++++++++++++++------------------ scripts/launchSlurmArray.py | 109 ++++++++++++++++++ 2 files changed, 219 insertions(+), 109 deletions(-) create mode 100644 scripts/launchSlurmArray.py diff --git a/UD_any/launchBatches.py b/UD_any/launchBatches.py index f8a7427..eca3540 100755 --- a/UD_any/launchBatches.py +++ b/UD_any/launchBatches.py @@ -1,9 +1,10 @@ #! /usr/bin/env python3 -import sys import os import subprocess -import time +import sys +sys.path.insert(1, '../scripts') +from launchSlurmArray import launchSlurmArray ############################################################################### def printUsageAndExit() : @@ -19,121 +20,121 @@ def prepareExperiment(lang, template, expName) : ############################################################################### def addNamesAndCommandsTrain(names, commands, mode, expName, arguments, seed) : - names.append("\""+expName+"\"") + names.append(expName) - commands.append("'"+"./train.sh {} bin/{} {} --silent --seed {}".format(mode, expName,arguments,seed)+"'") + commands.append("./train.sh {} bin/{} {} --silent --seed {}".format(mode, expName,arguments,seed)) ############################################################################### ############################################################################### def addNamesAndCommandsDecode(names, commands, mode, expName) : - names.append("\""+expName+"\"") + names.append(expName) - commands.append("\""+"./evaluate.sh {} bin/{} --silent".format(mode, expName)+"\"") + commands.append("./evaluate.sh {} bin/{} --silent".format(mode, expName)) ############################################################################### ############################################################################### -def launchArray(names, commands, mode, jobName, device, nbHours, limit) : - filename = "{}.{}.slurm".format(mode,jobName) - sFile = open(filename, "w") - - hostname = os.getenv("HOSTNAME") - - commandList = " ".join(commands) - - if hostname == "jean-zay1" : - print("""#! /usr/bin/env bash - -#SBATCH --array=0-{}%{} -#SBATCH --job-name={}:{} -#SBATCH --output=%A_%a.out -#SBATCH --error=%A_%a.err -#SBATCH --open-mode=append -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=10 -#SBATCH --gres=gpu:1 -#SBATCH --hint=nomultithread -#SBATCH --partition=gpu_p1 -#SBATCH --qos={} -#SBATCH --time={}:00:00 - -module purge -module load gcc/9.1.0 -module load python/3.7.5 - -names=({}) -commands=({}) - -newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" -newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" -oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" -oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" -tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" - -touch $newOut - -cp $newOut $tmpFile -mv $oldOut $newOut -cat $tmpFile >> $newOut - -touch $newErr - -cp $newErr $tmpFile -mv $oldErr $newErr -cat $tmpFile >> $newErr - -rm $tmpFile - -eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" -""".format(len(names)-1, limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile) - sFile.close() - elif hostname == "sms.liscluster" : - print('''#! /usr/bin/env bash - -#SBATCH --array=0-{}%{} -#SBATCH --job-name={}:{} -#SBATCH --output=%A_%a.out -#SBATCH --error=%A_%a.err -#SBATCH --open-mode=append -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=1 -#SBATCH --hint=nomultithread -#SBATCH --partition={} -#SBATCH --exclude=sensei1,lifnode1,asfalda1 -#SBATCH --time={}:00:00 - -module purge - -names=({}) -commands=({}) - -newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" -newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" -oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" -oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" -tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" - -touch $newOut - -cp $newOut $tmpFile -mv $oldOut $newOut -cat $tmpFile >> $newOut - -touch $newErr - -cp $newErr $tmpFile -mv $oldErr $newErr -cat $tmpFile >> $newErr - -rm $tmpFile - -eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" -'''.format(len(names)-1, limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile) - sFile.close() - else : - print("ERROR : Unknown hostname \'%s\'"%hostname) - exit(1) - - subprocess.Popen("sbatch {}".format(filename), shell=True).wait() +#def launchArray(names, commands, mode, jobName, device, nbHours, limit) : +# filename = "{}.{}.slurm".format(mode,jobName) +# sFile = open(filename, "w") +# +# hostname = os.getenv("HOSTNAME") +# +# commandList = " ".join(commands) +# +# if hostname == "jean-zay1" : +# print("""#! /usr/bin/env bash +# +##SBATCH --array=0-{}%{} +##SBATCH --job-name={}:{} +##SBATCH --output=%A_%a.out +##SBATCH --error=%A_%a.err +##SBATCH --open-mode=append +##SBATCH --ntasks=1 +##SBATCH --cpus-per-task=10 +##SBATCH --gres=gpu:1 +##SBATCH --hint=nomultithread +##SBATCH --partition=gpu_p1 +##SBATCH --qos={} +##SBATCH --time={}:00:00 +# +#module purge +#module load gcc/9.1.0 +#module load python/3.7.5 +# +#names=({}) +#commands=({}) +# +#newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" +#newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" +#oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" +#oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" +#tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" +# +#touch $newOut +# +#cp $newOut $tmpFile +#mv $oldOut $newOut +#cat $tmpFile >> $newOut +# +#touch $newErr +# +#cp $newErr $tmpFile +#mv $oldErr $newErr +#cat $tmpFile >> $newErr +# +#rm $tmpFile +# +#eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" +#""".format(len(names)-1, limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile) +# sFile.close() +# elif hostname == "sms.liscluster" : +# print('''#! /usr/bin/env bash +# +##SBATCH --array=0-{}%{} +##SBATCH --job-name={}:{} +##SBATCH --output=%A_%a.out +##SBATCH --error=%A_%a.err +##SBATCH --open-mode=append +##SBATCH --ntasks=1 +##SBATCH --cpus-per-task=1 +##SBATCH --hint=nomultithread +##SBATCH --partition={} +##SBATCH --exclude=sensei1,lifnode1,asfalda1 +##SBATCH --time={}:00:00 +# +#module purge +# +#names=({}) +#commands=({}) +# +#newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" +#newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" +#oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" +#oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" +#tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" +# +#touch $newOut +# +#cp $newOut $tmpFile +#mv $oldOut $newOut +#cat $tmpFile >> $newOut +# +#touch $newErr +# +#cp $newErr $tmpFile +#mv $oldErr $newErr +#cat $tmpFile >> $newErr +# +#rm $tmpFile +# +#eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" +#'''.format(len(names)-1, limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile) +# sFile.close() +# else : +# print("ERROR : Unknown hostname \'%s\'"%hostname) +# exit(1) +# +# subprocess.Popen("sbatch {}".format(filename), shell=True).wait() ############################################################################### ############################################################################### @@ -168,6 +169,6 @@ if __name__ == "__main__" : else : addNamesAndCommandsDecode(names, commands, xp['mode'],xp['expName']) - launchArray(names, commands, mode, name, device, nbHours, limit) + launchSlurmArray(names, commands, name, device, nbHours, limit) ############################################################################### diff --git a/scripts/launchSlurmArray.py b/scripts/launchSlurmArray.py new file mode 100644 index 0000000..eac17b7 --- /dev/null +++ b/scripts/launchSlurmArray.py @@ -0,0 +1,109 @@ +import os +import subprocess + +def launchSlurmArray(names, commands, jobName, device, nbHours, limit) : + commands = ["'%s'"%s for s in commands] + names = ["'%s'"%s for s in names] + + filename = "{}.slurm".format(jobName) + sFile = open(filename, "w") + + hostname = os.getenv("HOSTNAME") + + commandList = " ".join(commands) + + if hostname == "jean-zay1" : + print("""#! /usr/bin/env bash + +#SBATCH --array=0-{}%{} +#SBATCH --job-name={} +#SBATCH --output=%A_%a.out +#SBATCH --error=%A_%a.err +#SBATCH --open-mode=append +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=10 +#SBATCH --gres=gpu:1 +#SBATCH --hint=nomultithread +#SBATCH --partition=gpu_p1 +#SBATCH --qos={} +#SBATCH --time={}:00:00 + +module purge +module load gcc/9.1.0 +module load python/3.7.5 + +names=({}) +commands=({}) + +newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" +newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" +oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" +oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" +tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" + +touch $newOut + +cp $newOut $tmpFile +mv $oldOut $newOut +cat $tmpFile >> $newOut + +touch $newErr + +cp $newErr $tmpFile +mv $oldErr $newErr +cat $tmpFile >> $newErr + +rm $tmpFile + +eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" +""".format(len(names)-1, limit, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile) + sFile.close() + elif hostname == "sms.liscluster" : + print('''#! /usr/bin/env bash + +#SBATCH --array=0-{}%{} +#SBATCH --job-name={} +#SBATCH --output=%A_%a.out +#SBATCH --error=%A_%a.err +#SBATCH --open-mode=append +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 +#SBATCH --hint=nomultithread +#SBATCH --partition={} +#SBATCH --exclude=sensei1,lifnode1,asfalda1 +#SBATCH --time={}:00:00 + +module purge + +names=({}) +commands=({}) + +newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" +newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" +oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" +oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" +tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" + +touch $newOut + +cp $newOut $tmpFile +mv $oldOut $newOut +cat $tmpFile >> $newOut + +touch $newErr + +cp $newErr $tmpFile +mv $oldErr $newErr +cat $tmpFile >> $newErr + +rm $tmpFile + +eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" +'''.format(len(names)-1, limit, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile) + sFile.close() + else : + print("ERROR : Unknown hostname \'%s\'"%hostname) + exit(1) + + subprocess.Popen("sbatch {}".format(filename), shell=True).wait() + -- GitLab