yafe_slurm.py 4.94 KB
Newer Older
valentin.emiya's avatar
valentin.emiya committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
"""

.. moduleauthor:: Valentin Emiya
"""

import importlib
import os
from pathlib import Path
import stat
import sys


def _write_executable_script(path, content):
    """Write ``content`` to ``path`` and make the file executable by its
    owner (other permission bits are left unchanged)."""
    with path.open('w') as file:
        file.write(content)
    path.chmod(path.stat().st_mode | stat.S_IXUSR)


def generate_slurm_script(script_file_path, xp_var_name, task_ids=None,
                          n_simultaneous_jobs=10, slurm_walltime='02:00:00',
                          activate_env_command=None, use_gpu=False):
    """Generate the scripts to launch an experiment using Slurm.

    Each task is run as one element of a Slurm job array: a main batch script
    ``script_slurm.sh`` declares the array and, for each array element, runs
    the matching per-task script ``run_<task_id>.sh``.

    The scripts are written in the sub-folder ``job_scripts`` of the
    experiment folder (any ``*.sh`` file already present in that sub-folder is
    removed first), Slurm output and error files are directed to the
    sub-folder ``logs``, and the command to submit the job array with Slurm
    is displayed in the terminal.

    The module defining the experiment is imported by this function, and its
    folder is appended to ``sys.path``.

    Parameters
    ----------
    script_file_path : str or Path
        File path to the script that defines the experiment.
    xp_var_name : str
        Name of the variable containing the experiment in the script.
    task_ids : list
        List of tasks ids to run.
        If ``task_ids`` is ``None``, the list of pending tasks of the
        experiment is used. If the resulting list is empty, a message is
        displayed and no script is generated.
    n_simultaneous_jobs : int
        Maximum number of array elements running simultaneously (value
        written after ``%`` in the ``--array`` option).
    slurm_walltime : str
        Wall time for each Slurm job ('HH:MM:SS').
    activate_env_command : str or None
        Optional command that must be run to activate a Python virtual
        environment before launching the experiment.
        Typically, this is a command of the form
        ``source some_virtual_env/bin/activate`` when using virtualenv and
        ``source activate some_conda_env`` when using conda.
        If ``activate_env_command`` is ``None`` or empty, no virtual
        environment is activated.
    use_gpu : bool
        Flag specifying if a gpu resource is needed when running the
        experiment. It is currently ignored: no GPU-related option is
        written in the generated scripts.
    """
    # Use an absolute path so that the generated scripts do not depend on the
    # directory from which the jobs are eventually run.
    script_file_path = Path(script_file_path).resolve()
    source_dir = script_file_path.parent
    script_name = script_file_path.stem

    # Import the experiment from the user script.
    source_dir_str = str(source_dir)
    if source_dir_str not in sys.path:
        sys.path.append(source_dir_str)
    mod = importlib.import_module(script_name)
    xp = getattr(mod, xp_var_name)

    # Folder of the generated scripts, kept distinct from ``source_dir``:
    # the per-task python command must add the folder of the user script to
    # ``sys.path``, not the folder of the job scripts.
    job_dir = xp.xp_path / 'job_scripts'
    job_dir.mkdir(parents=True, exist_ok=True)
    for old_script in job_dir.glob('*.sh'):
        old_script.unlink()

    if task_ids is None:
        task_ids = xp.get_pending_task_ids()
    task_ids = list(task_ids)
    if not task_ids:
        # An empty ``--array=`` specification would be rejected by sbatch.
        print('No task to run: no Slurm script has been generated.')
        return

    log_dir = xp.xp_path / 'logs'
    log_dir.mkdir(parents=True, exist_ok=True)

    # Generate the main batch script declaring the job array
    script = '#!/bin/sh\n'
    script += '#SBATCH --job-name={}\n'.format(xp.name)
    script += '#SBATCH --array={}%{}\n'.format(
        ','.join(str(i) for i in task_ids), n_simultaneous_jobs)
    # %A is the job array master id and %a the array element index
    script += '#SBATCH --output={}/stdout_%A_%a.slurm\n'.format(log_dir)
    script += '#SBATCH --error={}/stderr_%A_%a.slurm\n'.format(log_dir)
    script += '#SBATCH --time={}\n'.format(slurm_walltime)
    script += 'srun -N1 -n1 {}/run_$SLURM_ARRAY_TASK_ID.sh'.format(job_dir)

    script_path = job_dir / 'script_slurm.sh'
    _write_executable_script(script_path, script)

    # Generate one script per array element
    for idt in task_ids:
        script = '#!/bin/sh\n'

        # activate the virtual env
        if activate_env_command:
            script += '{}\n'.format(activate_env_command)

        # python command
        script += 'echo "Running {}.launch_experiment(task_ids=[{}])"\n'\
            .format(xp_var_name, idt)
        script += 'python -c "import sys; sys.path.append(\'{0}\'); ' \
            'from {1} import {2}; ' \
            '{2}.launch_experiment(task_ids=[{3}])"\n'.format(
                source_dir, script_name, xp_var_name, idt)
        # propagate the exit status of the python command
        script += 'exit $?'

        _write_executable_script(job_dir / 'run_{}.sh'.format(idt), script)

    print('*' * 80)
    print('Submit the job array using:')
    print('sbatch {}'.format(str(script_path)))
    print('*' * 80)