Skip to content
Snippets Groups Projects
Commit e1b7432d authored by Franck Dary's avatar Franck Dary
Browse files

New script to launch slurm array

parent d6c37255
No related branches found
No related tags found
No related merge requests found
#! /usr/bin/env python3
import sys
import os
import subprocess
import time
import sys
sys.path.insert(1, '../scripts')
from launchSlurmArray import launchSlurmArray
###############################################################################
def printUsageAndExit() :
......@@ -19,121 +20,121 @@ def prepareExperiment(lang, template, expName) :
###############################################################################
def addNamesAndCommandsTrain(names, commands, mode, expName, arguments, seed) :
  """Append one training job to the parallel lists names/commands.

  names    : list of job names (mutated in place); expName is appended as-is,
             shell quoting is handled later by launchSlurmArray.
  commands : list of shell commands (mutated in place).
  mode     : first argument forwarded to ./train.sh.
  expName  : experiment name; also the bin/ subdirectory of the model.
  arguments: extra command-line arguments for train.sh, as one string.
  seed     : random seed forwarded via --seed.
  """
  names.append(expName)
  commands.append("./train.sh {} bin/{} {} --silent --seed {}".format(mode, expName,arguments,seed))
###############################################################################
###############################################################################
def addNamesAndCommandsDecode(names, commands, mode, expName) :
  """Append one evaluation job to the parallel lists names/commands.

  names    : list of job names (mutated in place); expName is appended as-is,
             shell quoting is handled later by launchSlurmArray.
  commands : list of shell commands (mutated in place).
  mode     : first argument forwarded to ./evaluate.sh.
  expName  : experiment name; also the bin/ subdirectory of the model.
  """
  names.append(expName)
  commands.append("./evaluate.sh {} bin/{} --silent".format(mode, expName))
###############################################################################
###############################################################################
def launchArray(names, commands, mode, jobName, device, nbHours, limit) :
  """Write a slurm job-array submission file and submit it with sbatch.

  names    : one job name per array task; names[i] labels commands[i].
  commands : one shell command per array task, eval'd by the generated script.
  mode, jobName : build the file name "<mode>.<jobName>.slurm" and the slurm job name.
  device   : "cpu" selects the cpu partition on the LIS cluster; anything else gpu.
  nbHours  : wall-clock limit in hours; on jean-zay also picks the QOS (t4 above 20h).
  limit    : maximum number of array tasks running simultaneously.

  The generated script renames slurm's %A_%a output files to <name>.stdout /
  <name>.stderr, appending to any previous content, then evals the task command.
  Calls exit(1) on an unknown HOSTNAME; on that path no file is created and
  nothing is submitted (the original opened the file first, leaking an empty
  .slurm file and its handle).
  """
  filename = "{}.{}.slurm".format(mode, jobName)
  hostname = os.getenv("HOSTNAME")
  nameList = " ".join(names)
  commandList = " ".join(commands)
  if hostname == "jean-zay1" :
    content = """#! /usr/bin/env bash
#SBATCH --array=0-{}%{}
#SBATCH --job-name={}:{}
#SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1
#SBATCH --hint=nomultithread
#SBATCH --partition=gpu_p1
#SBATCH --qos={}
#SBATCH --time={}:00:00
module purge
module load gcc/9.1.0
module load python/3.7.5
names=({})
commands=({})
newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
touch $newOut
cp $newOut $tmpFile
mv $oldOut $newOut
cat $tmpFile >> $newOut
touch $newErr
cp $newErr $tmpFile
mv $oldErr $newErr
cat $tmpFile >> $newErr
rm $tmpFile
eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
""".format(len(names)-1, limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, nameList, commandList)
  elif hostname == "sms.liscluster" :
    content = '''#! /usr/bin/env bash
#SBATCH --array=0-{}%{}
#SBATCH --job-name={}:{}
#SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --hint=nomultithread
#SBATCH --partition={}
#SBATCH --exclude=sensei1,lifnode1,asfalda1
#SBATCH --time={}:00:00
module purge
names=({})
commands=({})
newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
touch $newOut
cp $newOut $tmpFile
mv $oldOut $newOut
cat $tmpFile >> $newOut
touch $newErr
cp $newErr $tmpFile
mv $oldErr $newErr
cat $tmpFile >> $newErr
rm $tmpFile
eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
'''.format(len(names)-1, limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, nameList, commandList)
  else :
    print("ERROR : Unknown hostname \'%s\'"%hostname)
    exit(1)
  # Context manager guarantees the file is flushed and closed before sbatch reads it.
  with open(filename, "w") as sFile :
    print(content, file=sFile)
  subprocess.Popen("sbatch {}".format(filename), shell=True).wait()
#def launchArray(names, commands, mode, jobName, device, nbHours, limit) :
# filename = "{}.{}.slurm".format(mode,jobName)
# sFile = open(filename, "w")
#
# hostname = os.getenv("HOSTNAME")
#
# commandList = " ".join(commands)
#
# if hostname == "jean-zay1" :
# print("""#! /usr/bin/env bash
#
##SBATCH --array=0-{}%{}
##SBATCH --job-name={}:{}
##SBATCH --output=%A_%a.out
##SBATCH --error=%A_%a.err
##SBATCH --open-mode=append
##SBATCH --ntasks=1
##SBATCH --cpus-per-task=10
##SBATCH --gres=gpu:1
##SBATCH --hint=nomultithread
##SBATCH --partition=gpu_p1
##SBATCH --qos={}
##SBATCH --time={}:00:00
#
#module purge
#module load gcc/9.1.0
#module load python/3.7.5
#
#names=({})
#commands=({})
#
#newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
#newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
#oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
#oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
#tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
#
#touch $newOut
#
#cp $newOut $tmpFile
#mv $oldOut $newOut
#cat $tmpFile >> $newOut
#
#touch $newErr
#
#cp $newErr $tmpFile
#mv $oldErr $newErr
#cat $tmpFile >> $newErr
#
#rm $tmpFile
#
#eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
#""".format(len(names)-1, limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile)
# sFile.close()
# elif hostname == "sms.liscluster" :
# print('''#! /usr/bin/env bash
#
##SBATCH --array=0-{}%{}
##SBATCH --job-name={}:{}
##SBATCH --output=%A_%a.out
##SBATCH --error=%A_%a.err
##SBATCH --open-mode=append
##SBATCH --ntasks=1
##SBATCH --cpus-per-task=1
##SBATCH --hint=nomultithread
##SBATCH --partition={}
##SBATCH --exclude=sensei1,lifnode1,asfalda1
##SBATCH --time={}:00:00
#
#module purge
#
#names=({})
#commands=({})
#
#newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
#newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
#oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
#oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
#tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
#
#touch $newOut
#
#cp $newOut $tmpFile
#mv $oldOut $newOut
#cat $tmpFile >> $newOut
#
#touch $newErr
#
#cp $newErr $tmpFile
#mv $oldErr $newErr
#cat $tmpFile >> $newErr
#
#rm $tmpFile
#
#eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
#'''.format(len(names)-1, limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile)
# sFile.close()
# else :
# print("ERROR : Unknown hostname \'%s\'"%hostname)
# exit(1)
#
# subprocess.Popen("sbatch {}".format(filename), shell=True).wait()
###############################################################################
###############################################################################
......@@ -168,6 +169,6 @@ if __name__ == "__main__" :
else :
addNamesAndCommandsDecode(names, commands, xp['mode'],xp['expName'])
launchArray(names, commands, mode, name, device, nbHours, limit)
launchSlurmArray(names, commands, name, device, nbHours, limit)
###############################################################################
import os
import subprocess
def launchSlurmArray(names, commands, jobName, device, nbHours, limit) :
  """Write a slurm job-array submission file "<jobName>.slurm" and sbatch it.

  names    : one job name per array task; names[i] labels commands[i].
             Each element is single-quoted here before being embedded in bash.
  commands : one shell command per array task, eval'd by the generated script.
             NOTE(review): elements containing a single quote would break the
             generated bash arrays -- callers currently never pass any.
  jobName  : slurm job name and basename of the generated file.
  device   : "cpu" selects the cpu partition on the LIS cluster; anything else gpu.
  nbHours  : wall-clock limit in hours; on jean-zay also picks the QOS (t4 above 20h).
  limit    : maximum number of array tasks running simultaneously.

  The generated script renames slurm's %A_%a output files to <name>.stdout /
  <name>.stderr, appending to any previous content, then evals the task command.
  Calls exit(1) on an unknown HOSTNAME; on that path no file is created and
  nothing is submitted (the original opened the file first, leaking an empty
  .slurm file and its handle).
  """
  # Rebinds the local names only; the caller's lists are not mutated.
  commands = ["'%s'"%s for s in commands]
  names = ["'%s'"%s for s in names]
  filename = "{}.slurm".format(jobName)
  hostname = os.getenv("HOSTNAME")
  nameList = " ".join(names)
  commandList = " ".join(commands)
  if hostname == "jean-zay1" :
    content = """#! /usr/bin/env bash
#SBATCH --array=0-{}%{}
#SBATCH --job-name={}
#SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
#SBATCH --gres=gpu:1
#SBATCH --hint=nomultithread
#SBATCH --partition=gpu_p1
#SBATCH --qos={}
#SBATCH --time={}:00:00
module purge
module load gcc/9.1.0
module load python/3.7.5
names=({})
commands=({})
newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
touch $newOut
cp $newOut $tmpFile
mv $oldOut $newOut
cat $tmpFile >> $newOut
touch $newErr
cp $newErr $tmpFile
mv $oldErr $newErr
cat $tmpFile >> $newErr
rm $tmpFile
eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
""".format(len(names)-1, limit, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, nameList, commandList)
  elif hostname == "sms.liscluster" :
    content = '''#! /usr/bin/env bash
#SBATCH --array=0-{}%{}
#SBATCH --job-name={}
#SBATCH --output=%A_%a.out
#SBATCH --error=%A_%a.err
#SBATCH --open-mode=append
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=1
#SBATCH --hint=nomultithread
#SBATCH --partition={}
#SBATCH --exclude=sensei1,lifnode1,asfalda1
#SBATCH --time={}:00:00
module purge
names=({})
commands=({})
newOut=${{names[$SLURM_ARRAY_TASK_ID]}}".stdout"
newErr=${{names[$SLURM_ARRAY_TASK_ID]}}".stderr"
oldOut=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out"
oldErr=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err"
tmpFile=$SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".tmp"
touch $newOut
cp $newOut $tmpFile
mv $oldOut $newOut
cat $tmpFile >> $newOut
touch $newErr
cp $newErr $tmpFile
mv $oldErr $newErr
cat $tmpFile >> $newErr
rm $tmpFile
eval "${{commands[$SLURM_ARRAY_TASK_ID]}}"
'''.format(len(names)-1, limit, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, nameList, commandList)
  else :
    print("ERROR : Unknown hostname \'%s\'"%hostname)
    exit(1)
  # Context manager guarantees the file is flushed and closed before sbatch reads it.
  with open(filename, "w") as sFile :
    print(content, file=sFile)
  subprocess.Popen("sbatch {}".format(filename), shell=True).wait()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment