Source code for coexist.schedulers

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File   : schedulers.py
# License: GNU v3.0
# Author : Andrei Leonard Nicusan <a.l.nicusan@bham.ac.uk>
# Date   : 04.04.2021


import  os
import  sys
from    abc         import  ABC, abstractmethod




class Scheduler(ABC):
    '''An abstract class defining the interface that concrete schedulers need
    to implement.

    A scheduler, in general, is a program that defines a way to split parallel
    workloads in some computing environment. More specifically, it allows
    calling a Python script with some command-line arguments in parallel; in
    the simplest case:

        $> ``python3 some_script.py arg1 arg2 arg3``

    Here, `python3` is the "scheduler", simply creating a new OS process in
    which a Python interpreter executes `some_script.py`. This is implemented
    in the ``LocalScheduler`` class.

    In a more complex, multi-cluster environment managed by SLURM:

        $> ``sbatch job_submission.sh some_script.py arg1 arg2 arg3``

    Here, the `sbatch job_submission.sh` is the scheduling part, and the
    `job_submission.sh` SLURM script must be generated beforehand. This is
    implemented in the ``SlurmScheduler`` class.

    **Subclassing:**

    If you want to implement a concrete scheduler for another system, subclass
    `Scheduler` and implement the `schedule(dirpath, index)` method (`dirpath`
    is the directory where the computation is run, e.g. "access_seed123" and
    `index` is the job number), which will be called when launching each
    simulation to:

    - Generate any files needed to schedule a Python script's execution.
    - Return a list of the system commands to be prepended to the Python
      scripts (e.g. ["python3"] and ["sbatch", "job_submission.sh"] in the
      examples above).
    '''

    @abstractmethod
    def schedule(self, dirpath, index):
        '''Prepare the launch of a single job and return its command prefix.

        Concrete subclasses must override this method; `Scheduler` itself
        cannot be instantiated because the method is abstract.

        Parameters
        ----------
        dirpath
            The directory where the computation is run.

        index
            The job number.

        Returns
        -------
        list
            The system commands to be prepended to the Python script and its
            command-line arguments.
        '''




class LocalScheduler(Scheduler):
    '''Schedule parallel workloads on the local / shared-memory machine.

    By default, it will use the ``sys.executable`` Python interpreter (i.e. the
    one used to execute the current code); you can set it to a different
    name, e.g. ``coexist.schedulers.LocalScheduler(["python3"])``.

    Parameters
    ----------
    python_executable : list[str] or str, default [sys.executable]
        The command (and any leading arguments) used to launch a Python
        script. A single string or path is treated as a one-element command,
        so ``LocalScheduler("python3")`` is equivalent to
        ``LocalScheduler(["python3"])``.
    '''

    def __init__(self, python_executable = [sys.executable]):
        # A bare string is iterable, so `list("python3")` would silently
        # split it into single characters; wrap it into a one-element command
        if isinstance(python_executable, (str, os.PathLike)):
            python_executable = [os.fspath(python_executable)]

        # Always copy, so neither the shared default list nor the caller's
        # own list is aliased by this instance
        self.python_executable = list(python_executable)


    def schedule(self, dirpath, index):
        '''Return the command prefix used to launch a script locally.

        Nothing needs to be generated on the local machine, so `dirpath` and
        `index` are accepted only to satisfy the `Scheduler` interface.

        Returns
        -------
        list
            A fresh copy of the configured command, so callers may extend it
            (e.g. with the script path and its arguments) without changing
            this scheduler.
        '''
        return list(self.python_executable)


    def __repr__(self):
        return f"LocalScheduler(python_executable={self.python_executable})"




class SlurmScheduler(Scheduler):
    '''Launch simulations on a SLURM distributed cluster using ``sbatch``.

    First a bash script must be defined for launching each simulation job; this
    class generates this script, but some details must be defined by you; they
    are specified as class parameters, see examples below.

    Parameters
    ----------
    time : str
        The time allocated for *a single simulation*, given as a string, e.g.
        "1:0:0". Will be added as "#SBATCH --time 1:0:0".

    qos : str, optional
        The "#SBATCH --qos bbdefault" ``sbatch`` command.

    mail_type : str, default "FAIL"
        The "#SBATCH --mail-type FAIL" ``sbatch`` command.

    ntasks : str, default "1"
        The "#SBATCH --ntasks 1" ``sbatch`` command.

    mem : str, optional
        The "#SBATCH --mem 4G" ``sbatch`` command.

    commands : str or list[str], default "set -e" (plus a trailing newline)
        Any other *non-SLURM* commands to run in the job submission script
        before executing the simulation; this is normally the setup work, e.g.
        loading necessary modules, environments, etc. A string is written
        verbatim; each item of a list is written on its own line.

    interpreter : str, default os.path.split(sys.executable)[1]
        Name of the Python interpreter which will be used to execute the
        simulation script; this is normally set to the name of the executable
        used to run ACCES, e.g. "/usr/bin/python3" -> "python3".

    script : str, default "access_slurm_submission.sh"
        File name of the job submission script, generated inside the
        directory given to `schedule` if it does not already exist there.

    **kwargs : other keyword arguments
        Other "#SBATCH" commands to include at the top of the job submission
        script; e.g. ``constraint = "cascadelake"`` is transformed into
        ``"#SBATCH --constraint cascadelake"``; ``mem_per_cpu = "4"`` is
        transformed into ``"#SBATCH --mem-per-cpu 4"``.

    Notes
    -----
    "#SBATCH --wait" is always added to the generated script.

    Examples
    --------

    >>> from coexist.schedulers import SlurmScheduler
    >>> scheduler = SlurmScheduler(
    ...     "10:0:0",          # Time allocated for a single simulation
    ...     commands = """
    ...         # Commands to add in the sbatch script after `#`
    ...         set -e
    ...         module purge; module load bluebear
    ...         module load BEAR-Python-DataScience
    ...     """,
    ...     qos = "bbdefault",
    ...     constraint = "cascadelake",   # Any other #SBATCH --<CMD> = "VAL"
    ... )
    '''

    def __init__(
        self,
        time,
        qos = None,
        mail_type = "FAIL",
        ntasks = "1",
        mem = None,
        commands = "set -e\n",
        interpreter = os.path.split(sys.executable)[1],
        script = "access_slurm_submission.sh",
        **kwargs,
    ):
        self.time = str(time)
        self.commands = commands
        self.interpreter = str(interpreter)

        # Optional settings stay `None` (rather than the string "None") so
        # that `generate` can leave the corresponding #SBATCH line out
        self.qos = str(qos) if qos is not None else None
        self.mail_type = str(mail_type) if mail_type is not None else None

        self.ntasks = str(ntasks) if ntasks is not None else None
        self.mem = str(mem) if mem is not None else None

        self.script = script
        self.kwargs = kwargs


    def generate(self, scriptpath):
        '''Write the SLURM job submission bash script to `scriptpath`.

        The script consists of the "#SBATCH" header, the user-defined setup
        `commands` and a final line running `interpreter` with all the
        command-line arguments the script receives. An existing file at
        `scriptpath` is overwritten.

        Parameters
        ----------
        scriptpath : str
            Path of the file to write.
        '''
        lines = ["#!/bin/bash\n", f"#SBATCH --time {self.time}\n"]

        # Named options, in a fixed order; unset ones are omitted
        named_options = (
            ("qos", self.qos),
            ("mail-type", self.mail_type),
            ("ntasks", self.ntasks),
            ("mem", self.mem),
        )
        for flag, value in named_options:
            if value is not None:
                lines.append(f"#SBATCH --{flag} {value}\n")

        # Extra options: Python-style `mem_per_cpu` becomes `--mem-per-cpu`
        for key, val in self.kwargs.items():
            lines.append(f"#SBATCH --{key.replace('_', '-')} {val}\n")

        lines.append("#SBATCH --wait\n")
        lines.append("\n\n")

        if isinstance(self.commands, str):
            lines.append(self.commands)
        else:
            # Small convenience: if the strings in the list of commands don't
            # end with a '\n', append it
            for cmd in self.commands:
                lines.append(cmd if cmd.endswith("\n") else cmd + "\n")

        lines.append((
            "\n\n# Run a single function evaluation with all command-line "
            "arguments redirected to Python\n"
        ))

        # Quoted "$@" forwards every argument unchanged; an unquoted `$*`
        # would word-split arguments containing whitespace (e.g. paths)
        lines.append(f'{self.interpreter} "$@"\n')

        # Only touch the file once the whole content was built successfully
        with open(scriptpath, "w") as f:
            f.writelines(lines)


    def schedule(self, dirpath, index):
        '''Prepare `dirpath` for a SLURM job and return the ``sbatch`` command.

        The job submission script is generated inside `dirpath` only if it
        does not exist yet, and an "outputs" subdirectory is created for the
        SLURM output files.

        Parameters
        ----------
        dirpath : str
            Existing directory where the computation is run.

        index
            The job number, used in the output file name and the job name.

        Returns
        -------
        list[str]
            The ``sbatch`` command, including the path to the job submission
            script, to be prepended to the Python script and its arguments.

        Raises
        ------
        FileNotFoundError
            If `dirpath` is not an existing directory.
        '''
        # Check directory exists
        if not os.path.isdir(dirpath):
            raise FileNotFoundError(
                f"The given `dirpath` = '{dirpath}' does not exist."
            )

        # Generate SLURM launch script if it doesn't exist
        scriptpath = os.path.join(dirpath, self.script)
        if not os.path.isfile(scriptpath):
            self.generate(scriptpath)

        # Create the outputs directory if needed; `exist_ok` avoids the race
        # between checking for the directory and creating it
        outputdir = os.path.join(dirpath, "outputs")
        os.makedirs(outputdir, exist_ok = True)

        outputpath = os.path.join(outputdir, f"output.{index}.slurm-%j.out")
        return [
            "sbatch",
            f"--output={outputpath}",
            f"--job-name=aid{index}",
            scriptpath,
        ]


    def __repr__(self):
        # Return pretty string representation of an arbitrary object: one
        # "name = value" line per public, non-callable attribute
        docs = []
        for attr in dir(self):
            if not attr.startswith("_"):
                memb = getattr(self, attr)
                if not callable(memb):
                    docs.append(f"{attr} = {memb}")

        name = self.__class__.__name__
        underline = "-" * len(name)
        return f"{name}\n{underline}\n" + "\n".join(docs)