# -*- coding: utf-8 -*-
"""
Utilities to generate Slurm job scripts that run an experiment.

.. moduleauthor:: Valentin Emiya
"""

import importlib
import os
from pathlib import Path
import stat
import sys


def generate_slurm_script(script_file_path, xp_var_name, task_ids=None,
                          n_simultaneous_jobs=10, slurm_walltime='02:00:00',
                          activate_env_command=None, use_gpu=False):
    """Generate a script to launch an experiment using Slurm.

    Each task is run as one element of a Slurm job array: a main script
    ``script_slurm.sh`` declares the array and one script ``run_<id>.sh`` is
    generated for each task id.

    The resulting scripts are written in the subfolder ``job_scripts`` of the
    experiment folder (any ``*.sh`` file already there is removed first), the
    folder ``logs`` is created in the experiment folder for the job outputs,
    and the command to launch the jobs with Slurm is displayed in the
    terminal.

    An example of a similar usage in the case of OAR (script
    :func:`yafe.utils.generate_oar_script`) is illustrated in
    :ref:`tutorial <tutorial_oar>`.

    Parameters
    ----------
    script_file_path : str
        File path to the script that defines the experiment.
    xp_var_name : str
        Name of the variable containing the experiment in the script.
    task_ids : list
        List of tasks ids to run.
        If ``task_ids`` is ``None``, the list of pending tasks of the
        experiment is used.
    n_simultaneous_jobs : int
        Limit written after ``%`` in the ``--array`` option of the main
        script, i.e., the maximum number of tasks of the job array that
        Slurm runs simultaneously.
    slurm_walltime : str
        Wall time for each Slurm job ('HH:MM:SS').
    activate_env_command : str or None
        Optional command that must be run to activate a Python virtual
        environment before launching the experiment.
        Typically, this is a command of the form
        ``source some_virtual_env/bin/activate`` when using virtualenv and
        ``source activate some_conda_env`` when using conda.
        If ``activate_env_command`` is ``None`` or empty, no virtual
        environment is activated.
    use_gpu : bool
        Flag specifying if a gpu resource is needed when running the
        experiment. This has not been implemented yet: the value is
        currently ignored.
    """
    script_file_path = Path(script_file_path)
    # Folder and module name of the user script. This folder is kept in its
    # own variable, distinct from the folder of the generated job scripts,
    # because it is the one the generated scripts must add to ``sys.path``
    # in order to import the experiment.
    xp_script_dir = script_file_path.parent
    script_name = script_file_path.stem

    # Import the user script to retrieve the experiment object
    if str(xp_script_dir) not in sys.path:
        sys.path.append(str(xp_script_dir))
    mod = importlib.import_module(script_name)
    xp = getattr(mod, xp_var_name)

    # Folder of the generated scripts, cleaned from previous generations
    job_dir = xp.xp_path / 'job_scripts'
    job_dir.mkdir(exist_ok=True)
    for old_script in job_dir.glob('*.sh'):
        old_script.unlink()

    if task_ids is None:
        task_ids = xp.get_pending_task_ids()
    # The ids are iterated twice below (array specification and per-task
    # scripts), so one-shot iterables are materialized first.
    task_ids = list(task_ids)

    log_dir = xp.xp_path / 'logs'
    log_dir.mkdir(exist_ok=True)

    # Main script: declares the job array and dispatches each array element
    # to its own ``run_<id>.sh`` script.
    # NOTE: no GPU-related option is written, ``use_gpu`` is not handled yet.
    main_lines = [
        '#!/bin/sh',
        '#SBATCH --job-name={}'.format(xp.name),
        '#SBATCH --array={}%{}'.format(
            ','.join(str(i) for i in task_ids), n_simultaneous_jobs),
        '#SBATCH --output={}/stdout_%A_%a.slurm'.format(log_dir),
        '#SBATCH --error={}/stderr_%A_%a.slurm'.format(log_dir),
        '#SBATCH --time={}'.format(slurm_walltime),
        'srun -N1 -n1 {}/run_$SLURM_ARRAY_TASK_ID.sh'.format(job_dir),
    ]
    script_path = job_dir / 'script_slurm.sh'
    _write_executable_script(script_path, '\n'.join(main_lines))

    # One script per array element, running a single task of the experiment
    for idt in task_ids:
        task_lines = ['#!/bin/sh']

        # activate the virtual env
        if activate_env_command:
            task_lines.append(activate_env_command)

        # python command
        task_lines.append(
            'echo "Running {}.launch_experiment(task_ids=[{}])"'.format(
                xp_var_name, idt))
        task_lines.append(
            'python -c "import sys; sys.path.append(\'{0}\'); '
            'from {1} import {2}; '
            '{2}.launch_experiment(task_ids=[{3}])"'.format(
                xp_script_dir, script_name, xp_var_name, idt))
        # propagate the exit status of the python command to Slurm
        task_lines.append('exit $?')

        _write_executable_script(job_dir / 'run_{}.sh'.format(idt),
                                 '\n'.join(task_lines))

    print('*' * 80)
    print('Submit the job array using:')
    print('sbatch {}'.format(str(script_path)))
    print('*' * 80)


def _write_executable_script(path, content):
    """Write ``content`` to file ``path`` and make it executable by its owner.
    """
    with path.open('w') as file:
        file.write(content)
    status = os.stat(path)
    os.chmod(path, status.st_mode | stat.S_IXUSR)