import click

from colossalai.context import Config

from .run import launch_multi_processes

@click.command(help="Launch distributed training on a single node or multiple nodes",
               context_settings=dict(ignore_unknown_options=True))
@click.option("-H", "-host", "--host", type=str, default=None, help="comma-separated list of machines to launch")
@click.option("--hostfile",
              type=str,
              default=None,
              help="Hostfile path that defines the device pool available to the job (e.g. worker-name:number of slots)")
@click.option(
    "--include",
    type=str,
    default=None,
    help=
    "Specify computing devices to use during execution. String format is NODE_SPEC@NODE_SPEC where NODE_SPEC=<worker-name>:<list-of-slots>"
)
@click.option(
    "--exclude",
    type=str,
    default=None,
    help=
    "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include."
)
@click.option("--num_nodes", type=int, default=-1, help="Total number of worker nodes to use.")
@click.option("--nproc_per_node", type=int, default=-1, help="Number of GPUs to use on each node.")
@click.option("--master_port",
              type=int,
              default=29500,
              help="(optional) Port used by PyTorch distributed for communication during distributed training.")
@click.option("--master_addr",
              type=str,
              default="127.0.0.1",
              help="(optional) IP address of node 0; it will be inferred via 'hostname -I' if not specified.")
@click.option(
    "--launcher",
    type=click.Choice(['torch', 'openmpi', 'slurm'], case_sensitive=False),
    default="torch",
    help="(optional) choose launcher backend for multi-node training. Options currently include torch, OpenMPI, SLURM.")
@click.option("--launcher_args",
              type=str,
              default=None,
              help="(optional) pass launcher-specific arguments as a single quoted argument.")
@click.argument("user_script", type=str)
@click.argument("user_args", nargs=-1)
def run(host: str, hostfile: str, num_nodes: int, nproc_per_node: int, include: str, exclude: str, master_addr: str,
        master_port: int, launcher: str, launcher_args: str, user_script: str, user_args: tuple):
    """
    Launch multiple processes on a single node or on multiple nodes via the command line.

    Usage::

        # run on the current node with all available GPUs
        colossalai run train.py

        # run with only 2 GPUs on the current node
        colossalai run --nproc_per_node 2 train.py

        # run on two nodes
        colossalai run --host <host1>,<host2> train.py
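
        # limit execution to part of the device pool (an illustrative sketch;
        # the slot list shown is hypothetical, see --include for the exact
        # NODE_SPEC=<worker-name>:<list-of-slots> syntax)
        colossalai run --hostfile <file_path> --include <host1>:0,1 train.py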

        # run with hostfile
        colossalai run --hostfile <file_path> train.py
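
        # run with a hostfile using 2 GPUs per node (a sketch; set
        # --nproc_per_node to however many GPUs each node actually has)
        colossalai run --hostfile <file_path> --nproc_per_node 2 train.py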
    """
    # gather all CLI parameters captured by click into a single config object
    args_dict = locals()
    args = Config(args_dict)
    # click passes variadic arguments as a tuple; convert it to a mutable list
    args.user_args = list(args.user_args)
    launch_multi_processes(args)