Skip to content
Snippets Groups Projects
Commit e1b7432d authored by Franck Dary's avatar Franck Dary
Browse files

New script to launch slurm array

parent d6c37255
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3 #! /usr/bin/env python3
import sys
import os import os
import subprocess import subprocess
import time import sys
sys.path.insert(1, '../scripts')
from launchSlurmArray import launchSlurmArray
############################################################################### ###############################################################################
def printUsageAndExit() : def printUsageAndExit() :
...@@ -19,121 +20,121 @@ def prepareExperiment(lang, template, expName) : ...@@ -19,121 +20,121 @@ def prepareExperiment(lang, template, expName) :
############################################################################### ###############################################################################
def addNamesAndCommandsTrain(names, commands, mode, expName, arguments, seed) : def addNamesAndCommandsTrain(names, commands, mode, expName, arguments, seed) :
names.append("\""+expName+"\"") names.append(expName)
commands.append("'"+"./train.sh {} bin/{} {} --silent --seed {}".format(mode, expName,arguments,seed)+"'") commands.append("./train.sh {} bin/{} {} --silent --seed {}".format(mode, expName,arguments,seed))
############################################################################### ###############################################################################
############################################################################### ###############################################################################
def addNamesAndCommandsDecode(names, commands, mode, expName) : def addNamesAndCommandsDecode(names, commands, mode, expName) :
names.append("\""+expName+"\"") names.append(expName)
commands.append("\""+"./evaluate.sh {} bin/{} --silent".format(mode, expName)+"\"") commands.append("./evaluate.sh {} bin/{} --silent".format(mode, expName))
############################################################################### ###############################################################################
############################################################################### ###############################################################################
def launchArray(names, commands, mode, jobName, device, nbHours, limit) : #def launchArray(names, commands, mode, jobName, device, nbHours, limit) :
filename = "{}.{}.slurm".format(mode,jobName) # filename = "{}.{}.slurm".format(mode,jobName)
sFile = open(filename, "w") # sFile = open(filename, "w")
#
hostname = os.getenv("HOSTNAME") # hostname = os.getenv("HOSTNAME")
#
commandList = " ".join(commands) # commandList = " ".join(commands)
#
if hostname == "jean-zay1" : # if hostname == "jean-zay1" :
print("""#! /usr/bin/env bash # print("""#! /usr/bin/env bash
#
#SBATCH --array=0-{}%{} ##SBATCH --array=0-{}%{}
#SBATCH --job-name={}:{} ##SBATCH --job-name={}:{}
#SBATCH --output=%A_%a.out ##SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err ##SBATCH --error=%A_%a.err
#SBATCH --open-mode=append ##SBATCH --open-mode=append
#SBATCH --ntasks=1 ##SBATCH --ntasks=1
#SBATCH --cpus-per-task=10 ##SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1 ##SBATCH --gres=gpu:1
#SBATCH --hint=nomultithread ##SBATCH --hint=nomultithread
#SBATCH --partition=gpu_p1 ##SBATCH --partition=gpu_p1
#SBATCH --qos={} ##SBATCH --qos={}
#SBATCH --time={}:00:00 ##SBATCH --time={}:00:00
#
module purge #module purge
module load gcc/9.1.0 #module load gcc/9.1.0
module load python/3.7.5 #module load python/3.7.5
#
names=({}) #names=({})
commands=({}) #commands=({})
#
newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" #newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" #newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" #oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" #oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" #tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
#
touch $newOut #touch $newOut
#
cp $newOut $tmpFile #cp $newOut $tmpFile
mv $oldOut $newOut #mv $oldOut $newOut
cat $tmpFile >> $newOut #cat $tmpFile >> $newOut
#
touch $newErr #touch $newErr
#
cp $newErr $tmpFile #cp $newErr $tmpFile
mv $oldErr $newErr #mv $oldErr $newErr
cat $tmpFile >> $newErr #cat $tmpFile >> $newErr
#
rm $tmpFile #rm $tmpFile
#
eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" #eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
""".format(len(names)-1, limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile) #""".format(len(names)-1, limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile)
sFile.close() # sFile.close()
elif hostname == "sms.liscluster" : # elif hostname == "sms.liscluster" :
print('''#! /usr/bin/env bash # print('''#! /usr/bin/env bash
#
#SBATCH --array=0-{}%{} ##SBATCH --array=0-{}%{}
#SBATCH --job-name={}:{} ##SBATCH --job-name={}:{}
#SBATCH --output=%A_%a.out ##SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err ##SBATCH --error=%A_%a.err
#SBATCH --open-mode=append ##SBATCH --open-mode=append
#SBATCH --ntasks=1 ##SBATCH --ntasks=1
#SBATCH --cpus-per-task=1 ##SBATCH --cpus-per-task=1
#SBATCH --hint=nomultithread ##SBATCH --hint=nomultithread
#SBATCH --partition={} ##SBATCH --partition={}
#SBATCH --exclude=sensei1,lifnode1,asfalda1 ##SBATCH --exclude=sensei1,lifnode1,asfalda1
#SBATCH --time={}:00:00 ##SBATCH --time={}:00:00
#
module purge #module purge
#
names=({}) #names=({})
commands=({}) #commands=({})
#
newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" #newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" #newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" #oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" #oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp" #tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
#
touch $newOut #touch $newOut
#
cp $newOut $tmpFile #cp $newOut $tmpFile
mv $oldOut $newOut #mv $oldOut $newOut
cat $tmpFile >> $newOut #cat $tmpFile >> $newOut
#
touch $newErr #touch $newErr
#
cp $newErr $tmpFile #cp $newErr $tmpFile
mv $oldErr $newErr #mv $oldErr $newErr
cat $tmpFile >> $newErr #cat $tmpFile >> $newErr
#
rm $tmpFile #rm $tmpFile
#
eval "${{commands[$SLURM_ARRAY_TASK_ID]}}" #eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
'''.format(len(names)-1, limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile) #'''.format(len(names)-1, limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile)
sFile.close() # sFile.close()
else : # else :
print("ERROR : Unknown hostname \'%s\'"%hostname) # print("ERROR : Unknown hostname \'%s\'"%hostname)
exit(1) # exit(1)
#
subprocess.Popen("sbatch {}".format(filename), shell=True).wait() # subprocess.Popen("sbatch {}".format(filename), shell=True).wait()
############################################################################### ###############################################################################
############################################################################### ###############################################################################
...@@ -168,6 +169,6 @@ if __name__ == "__main__" : ...@@ -168,6 +169,6 @@ if __name__ == "__main__" :
else : else :
addNamesAndCommandsDecode(names, commands, xp['mode'],xp['expName']) addNamesAndCommandsDecode(names, commands, xp['mode'],xp['expName'])
launchArray(names, commands, mode, name, device, nbHours, limit) launchSlurmArray(names, commands, name, device, nbHours, limit)
############################################################################### ###############################################################################
import os
import subprocess
def _slurmHeaderLines(hostname, nbTasks, jobName, device, nbHours, limit) :
  """Return the shebang, #SBATCH directives and module commands for `hostname`.

  Returns None when `hostname` is not one of the two supported clusters
  ("jean-zay1" or "sms.liscluster").
  """
  # Directives shared by both clusters. %A / %a are expanded by Slurm to the
  # array job id and the array task id.
  common = [
    "#! /usr/bin/env bash",
    "#SBATCH --array=0-{}%{}".format(nbTasks-1, limit),
    "#SBATCH --job-name={}".format(jobName),
    "#SBATCH --output=%A_%a.out",
    "#SBATCH --error=%A_%a.err",
    "#SBATCH --open-mode=append",
    "#SBATCH --ntasks=1",
  ]
  timeLimit = "#SBATCH --time={}:00:00".format(nbHours)

  if hostname == "jean-zay1" :
    # On this host a GPU is always requested; `device` is not consulted.
    return common + [
      "#SBATCH --cpus-per-task=10",
      "#SBATCH --gres=gpu:1",
      "#SBATCH --hint=nomultithread",
      "#SBATCH --partition=gpu_p1",
      "#SBATCH --qos={}".format("qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3"),
      timeLimit,
      "",
      "module purge",
      "module load gcc/9.1.0",
      "module load python/3.7.5",
    ]

  if hostname == "sms.liscluster" :
    lines = common + [
      "#SBATCH --cpus-per-task=1",
      "#SBATCH --hint=nomultithread",
    ]
    if device == "cpu" :
      lines.append("#SBATCH --partition=cpu")
    else :
      # Any device other than "cpu" selects the gpu partition and a GPU resource.
      lines += ["#SBATCH --partition=gpu", "#SBATCH --gres=gpu"]
    lines += [
      "#SBATCH --exclude=sensei1,lifnode1,asfalda1",
      timeLimit,
      "",
      "module purge",
    ]
    return lines

  return None

def _slurmTaskLines(names, commands) :
  """Return the bash lines that rename the task's log files and run its command."""
  import shlex

  # shlex.quote keeps every entry a single bash word, even when it contains
  # spaces or single quotes (plain '...' wrapping breaks on embedded quotes).
  quotedNames = " ".join(shlex.quote(str(name)) for name in names)
  quotedCommands = " ".join(shlex.quote(str(command)) for command in commands)

  return [
    "names=({})".format(quotedNames),
    "commands=({})".format(quotedCommands),
    "",
    'newOut=${names[$SLURM_ARRAY_TASK_ID]}".stdout"',
    'newErr=${names[$SLURM_ARRAY_TASK_ID]}".stderr"',
    'oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"',
    'oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"',
    'tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"',
    "",
    # Move the Slurm log (<jobid>_<taskid>.out) to <name>.stdout, then append
    # whatever <name>.stdout contained before this task started.
    'touch "$newOut"',
    'cp "$newOut" "$tmpFile"',
    'mv "$oldOut" "$newOut"',
    'cat "$tmpFile" >> "$newOut"',
    "",
    # Same treatment for the error log.
    'touch "$newErr"',
    'cp "$newErr" "$tmpFile"',
    'mv "$oldErr" "$newErr"',
    'cat "$tmpFile" >> "$newErr"',
    "",
    'rm "$tmpFile"',
    "",
    'eval "${commands[$SLURM_ARRAY_TASK_ID]}"',
  ]

def launchSlurmArray(names, commands, jobName, device, nbHours, limit) :
  """Write `<jobName>.slurm` in the current directory and submit it with sbatch.

  The generated script is a Slurm job array with one task per entry of
  `commands`; task i runs commands[i] and its logs are moved to
  names[i].stdout / names[i].stderr.

  names    : one log-file base name per task.
  commands : one shell command line per task (same length as names).
  jobName  : Slurm job name, also used as the script file name.
  device   : "cpu" selects the cpu partition on sms.liscluster, anything else
             the gpu one; not consulted on jean-zay1.
  nbHours  : time limit in hours (on jean-zay1 also selects the qos).
  limit    : maximum number of array tasks running at the same time.

  The cluster is identified through the HOSTNAME environment variable.
  Prints an error and exits with status 1 when the host is unknown, when the
  task lists are empty or of different lengths, or when sbatch cannot be
  started. Nothing is written to disk in the first two cases.
  """
  hostname = os.getenv("HOSTNAME")

  # Validate everything before touching the filesystem, so that a failed call
  # does not leave an empty .slurm file behind.
  headerLines = _slurmHeaderLines(hostname, len(names), jobName, device, nbHours, limit)
  if headerLines is None :
    print("ERROR : Unknown hostname \'%s\'"%hostname)
    raise SystemExit(1)

  if len(names) == 0 or len(names) != len(commands) :
    # An empty list would yield the invalid directive '--array=0--1'.
    print("ERROR : names and commands must be non-empty and of the same length")
    raise SystemExit(1)

  scriptLines = headerLines + [""] + _slurmTaskLines(names, commands)

  filename = "{}.slurm".format(jobName)
  with open(filename, "w") as sFile :
    sFile.write("\n".join(scriptLines)+"\n")

  # Argument list instead of a shell string : the file name is passed verbatim
  # to sbatch, whatever characters jobName contains.
  try :
    subprocess.call(["sbatch", filename])
  except OSError as error :
    print("ERROR : could not run sbatch (%s)"%error)
    raise SystemExit(1)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment