diff --git a/UD_any/launchBatches.py b/UD_any/launchBatches.py index 599e20f1cb07e9382164b45039ff27b8f4b47ee0..ba00da1b2815c65dae5fc8dd49d88a1001fd6dc1 100755 --- a/UD_any/launchBatches.py +++ b/UD_any/launchBatches.py @@ -7,7 +7,7 @@ import time ############################################################################### def printUsageAndExit() : - print("USAGE : %s (train | eval) (bash | oar | slurm) (gpu | cpu) batchesDescription.py (--time nbHours)"%sys.argv[0], file=sys.stderr) + print("USAGE : %s (train | eval) (gpu | cpu) batchesDescription.py nbHours jobName maxNbSimultaneousJobs"%sys.argv[0], file=sys.stderr) exit(1) ############################################################################### @@ -18,62 +18,35 @@ def prepareExperiment(lang, template, expName) : ############################################################################### ############################################################################### -def launchTrain(device, mode, expName, arguments, launcher, nbHours, seed) : - if launcher == "bash" : - launchTrainBash(mode, expName, arguments, seed) - elif launcher == "oar" : - launchTrainOar(device, mode, expName, arguments, nbHours, seed) - elif launcher == "slurm" : - launchTrainSlurm(device, mode, expName, arguments, nbHours, seed) - else : - printUsageAndExit() +def addNamesAndCommandsTrain(names, commands, mode, expName, arguments, seed) : + names.append("\""+expName+"\"") + + commands.append("\""+"./train.sh {} bin/{} {} --silent --seed {}".format(mode, expName,arguments,seed)+"\"") ############################################################################### ############################################################################### -def launchTrainBash(mode, expName, arguments, seed) : - subprocess.Popen("./train.sh %s bin/%s %s --silent --seed %d"%(mode,expName,arguments,seed), - shell=True, stdout=open("%s.stdout"%expName,'w'), stderr=open("%s.stderr"%expName,'w')) +def addNamesAndCommandsDecode(names, commands, 
mode, expName) : + names.append("\""+expName+"\"") + + commands.append("\""+"./evaluate.sh {} bin/{} --silent".format(mode, expName)+"\"") ############################################################################### ############################################################################### -def nbMaxLongJobs() : - return 2 -############################################################################### - -############################################################################### -def launchTrainOar(device, mode, expName, arguments, nbHours, seed) : - bestEffort = getOarNbLongJobs() >= nbMaxLongJobs() - - command = "oarsub" - command += " -t besteffort" if bestEffort else "" - command += " -t idempotent" if bestEffort else "" - command += " -n train:%s"%expName - command += " -E %s.stderr"%expName - command += " -O %s.stdout"%expName - if device == "gpu" : - command += " -p \"gpu IS NOT NULL%s\""%getBestHostConstraint() - command += " -l walltime=%d:00:00"%nbHours - else : - command += " -p \"gpu IS NULL\"" - command += " -l /core=4,walltime=%d:00:00"%nbHours - command += " \'" + "./train.sh %s bin/%s %s --silent --seed %d"%(mode,expName,arguments,seed) + "\'" - - subprocess.Popen(command, shell=True).wait() -############################################################################### - -############################################################################### -def launchTrainSlurm(device, mode, expName, arguments, nbHours, seed) : - filename = "train.{}.slurm".format(expName) +def launchArray(names, commands, mode, jobName, device, nbHours, limit) : + filename = "{}.{}.slurm".format(mode,jobName) sFile = open(filename, "w") hostname = os.getenv("HOSTNAME") + commandList = " ".join(commands) + if hostname == "jean-zay1" : print("""#! 
/usr/bin/env bash -#SBATCH --job-name=train:{} -#SBATCH --output={}.stdout -#SBATCH --error={}.stderr +#SBATCH --array=0-{}%{} +#SBATCH --job-name={}:{} +#SBATCH --output=%A_%a.out +#SBATCH --error=%A_%a.err #SBATCH --open-mode=append #SBATCH --ntasks=1 #SBATCH --cpus-per-task=10 @@ -87,108 +60,21 @@ module purge module load gcc/9.1.0 module load python/3.7.5 -./train.sh {} bin/{} {} --silent --seed {} -""".format(expName, expName, expName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, mode, expName, arguments, seed), file=sFile) - sFile.close() - elif hostname == "sms.liscluster" : - print("""#! /usr/bin/env bash - -#SBATCH --job-name=train:{} -#SBATCH --output={}.stdout -#SBATCH --error={}.stderr -#SBATCH --open-mode=append -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=1 -#SBATCH --hint=nomultithread -#SBATCH --partition={} -#SBATCH --exclude=sensei1,lifnode1,asfalda1 -#SBATCH --time={}:00:00 - -module purge +names=({}) +commands=({}) -./train.sh {} bin/{} {} --silent --seed {} -""".format(expName, expName, expName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, mode, expName, arguments, seed), file=sFile) - sFile.close() - else : - print("ERROR : Unknown hostname \'%s\'"%hostname) - exit(1) - - subprocess.Popen("sbatch {}".format(filename), shell=True).wait() -############################################################################### - -############################################################################### -def launchEval(device, mode, expName, launcher, nbHours) : - if launcher == "bash" : - launchEvalBash(mode, expName) - elif launcher == "oar" : - launchEvalOar(device, mode, expName, nbHours) - elif launcher == "slurm" : - launchEvalSlurm(mode, expName, nbHours) - else : - printUsageAndExit() -############################################################################### - -############################################################################### -def launchEvalBash(mode, expName) : - 
subprocess.Popen("./evaluate.sh %s bin/%s --silent"%(mode,expName), - shell=True, stdout=open("%s.stdout"%expName,'a'), stderr=open("%s.stderr"%expName,'a')) -############################################################################### - -############################################################################### -def launchEvalOar(device, mode, expName, nbHours) : - bestEffort = getOarNbLongJobs() >= nbMaxLongJobs() and nbHours > 10 - - command = "oarsub" - command += " -t besteffort" if bestEffort else "" - command += " -t idempotent" if bestEffort else "" - command += " -n eval:%s"%expName - command += " -E %s.stderr"%expName - command += " -O %s.stdout"%expName - if device == "gpu" : - command += " -p \"gpu IS NOT NULL%s\""%getBestHostConstraint() - command += " -l walltime=%d:00:00"%nbHours - else : - command += " -p \"gpu IS NULL\"" - command += " -l /core=4,walltime=%d:00:00"%nbHours - command += " \"" + "./evaluate.sh %s bin/%s --silent"%(mode,expName) + "\"" - - subprocess.Popen(command, shell=True).wait() -############################################################################### - -############################################################################### -def launchEvalSlurm(mode, expName, nbHours) : - filename = "eval.{}.slurm".format(expName) - sFile = open(filename, "w") - - hostname = os.getenv("HOSTNAME") - - if hostname == "jean-zay1" : - print("""#! 
/usr/bin/env bash - -#SBATCH --job-name=eval:{} -#SBATCH --output={}.stdout -#SBATCH --error={}.stderr -#SBATCH --open-mode=append -#SBATCH --ntasks=1 -#SBATCH --cpus-per-task=10 -#SBATCH --gres=gpu:1 -#SBATCH --hint=nomultithread -#SBATCH --partition=gpu_p1 -#SBATCH --time={}:00:00 - -module purge -module load gcc/9.1.0 -module load python/3.7.5 - -./evaluate.sh {} bin/{} --silent -""".format(expName, expName, expName, nbHours, mode, expName), file=sFile) +mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" ${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" +mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" ${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" +${{commands[$SLURM_ARRAY_TASK_ID]}} +""".format(len(names)-1, limit, mode, jobName, "qos_gpu-t4" if nbHours > 20 else "qos_gpu-t3", nbHours, " ".join(names), " ".join(commands)), file=sFile) sFile.close() elif hostname == "sms.liscluster" : print("""#! /usr/bin/env bash -#SBATCH --job-name=eval:{} -#SBATCH --output={}.stdout -#SBATCH --error={}.stderr +#SBATCH --array=0-{}%{} +#SBATCH --job-name={}:{} +#SBATCH --output=%A_%a.out +#SBATCH --error=%A_%a.err #SBATCH --open-mode=append #SBATCH --ntasks=1 #SBATCH --cpus-per-task=1 @@ -199,122 +85,42 @@ module load python/3.7.5 module purge -./evaluate.sh {} bin/{} --silent -""".format(expName, expName, expName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, mode, expName), file=sFile) +names=({}) +commands=({}) + +mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".out" ${{names[$SLURM_ARRAY_TASK_ID]}}".stdout" +mv $SLURM_ARRAY_JOB_ID"_"$SLURM_ARRAY_TASK_ID".err" ${{names[$SLURM_ARRAY_TASK_ID]}}".stderr" +${{commands[$SLURM_ARRAY_TASK_ID]}} +""".format(len(names)-1, limit, mode, jobName, "cpu" if device == "cpu" else "gpu\n#SBATCH --gres=gpu", nbHours, " ".join(names), commandList), file=sFile) sFile.close() else : print("ERROR : Unknown hostname \'%s\'"%hostname) exit(1) - subprocess.Popen("sbatch {}".format(filename), shell=True).wait() 
############################################################################### -############################################################################### -def getOarNbLongJobs() : - return int(subprocess.Popen('oarstat -u | grep "Q=long" | wc -l', - shell=True, stdout=subprocess.PIPE).stdout.read()) -############################################################################### - -############################################################################### -def getOarNbGpuPerNode() : - l = subprocess.Popen("oarnodes | grep gpunum=. | grep -o 'host=[^,]*' | cut -f2 -d= | sort | uniq -c", shell=True, stdout=subprocess.PIPE).stdout.read().decode('utf8').split('\n') - - res = {} - for line in l : - splited = line.split() - if len(splited) != 2 : - continue - res[splited[1]] = int(splited[0]) - - return res -############################################################################### - -############################################################################### -def getOarNbUsedGpuPerNode() : - l = subprocess.Popen("oarstat -f | grep 'assigned_hostnames =\|propert\|wanted_resources' | grep -i 'gpu is not null' -B 2 | grep [^-]", shell=True, stdout=subprocess.PIPE).stdout.read().decode("utf8").split('\n') - - res = {} - - for i in range(len(l)//3) : - ressources = l[3*i] - hostname = l[3*i+1].split()[-1] - cores = 1 - gpunum = 1 - if "core=" in ressources : - coresStr="" - coresStrBase = ressources.split("core=")[-1] - for symbol in coresStrBase : - if symbol < '0' or symbol > '9' : - break - coresStr = coresStr + symbol - cores = int(coresStr) - if "gpunum=" in ressources : - gpunum = int(ressources.split("gpunum=")[-1].split(',')[0]) - - if hostname not in res : - res[hostname] = 0 - res[hostname] += cores * gpunum - - return res -############################################################################### - -############################################################################### -def getOarNotAliveNodes() : - res = 
subprocess.Popen("oarnodes | grep -B 2 'state : [^A]' | grep 'network_address' | sort --unique | awk '{print $3}'", shell=True, stdout=subprocess.PIPE).stdout.read().decode("utf8").split('\n') - return [node for node in res if len(node) > 0] -############################################################################### - -############################################################################### -def getOarNbFreeGpuPerNode() : - gpus = getOarNbGpuPerNode() - notAlive = getOarNotAliveNodes() - usedGpus = getOarNbUsedGpuPerNode() - - for gpu in gpus : - gpus[gpu] -= usedGpus[gpu] if gpu in usedGpus else 0 - - for host in notAlive : - gpus[host] = 0 - - return gpus -############################################################################### - -############################################################################### -def getBestHostConstraint() : - return " and host!='lifnode1' and host!='sensei1'" - freeGpus = getOarNbFreeGpuPerNode() - - if freeGpus["diflives1"] > 0 or freeGpus["lisnode2"] > 0 or freeGpus["lisnode3"] > 0 : - return " and host!='lifnode1' and host!='adnvideo1' and host!='asfalda1' and host!='see4c1' and host!='sensei1'" - return "" -############################################################################### - ############################################################################### if __name__ == "__main__" : - if len(sys.argv) < 5 : + if len(sys.argv) != 7 : printUsageAndExit() mode = sys.argv[1] - launcher = sys.argv[2] - device = sys.argv[3] - batchesDescription = sys.argv[4] - nbHours = 92 - - if len(sys.argv) > 5 : - if sys.argv[5] == "--time" : - if 6 not in range(5,len(sys.argv)) : - printUsageAndExit() - nbHours = int(sys.argv[6]) - else : - printUsageAndExit() + device = sys.argv[2] + batchesDescription = sys.argv[3] + nbHours = int(sys.argv[4]) + name = sys.argv[5] + limit = sys.argv[6] - if mode not in ["train","eval"] or launcher not in ["bash","oar","slurm"] or device not in ["cpu","gpu"] : + if mode not in 
["train","eval"] or device not in ["cpu","gpu"] : printUsageAndExit() desc = __import__(os.path.splitext(batchesDescription)[0]) + names = [] + commands = [] + for lang in desc.langs : for xp in desc.templatesExperiments : for i in desc.repRange : @@ -322,9 +128,10 @@ if __name__ == "__main__" : xp['expName'] = xp['expName'].split('.')[0]+"."+lang+"."+str(i) if mode == "train" : prepareExperiment(xp['lang'],xp['template'],xp['expName']) - launchTrain(device,xp['mode'],xp['expName'],xp['arguments'],launcher,nbHours,seed=100+i) + addNamesAndCommandsTrain(names, commands, xp['mode'],xp['expName'],xp['arguments'],seed=100+i) else : - launchEval(device,xp['mode'],xp['expName'],launcher,nbHours) + addNamesAndCommandsDecode(names, commands, xp['mode'],xp['expName']) + launchArray(names, commands, mode, name, device, nbHours, limit) ###############################################################################