#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import os

# Set CUDA_DEVICE_MAX_CONNECTIONS=1 to ensure that, when overlapping communication and computation,
# the order of kernel launches on the GPU is the same as on the CPU, so that communication is launched first.
# see https://github.com/NVIDIA/Megatron-LM/issues/533
# https://forums.developer.nvidia.com/t/how-many-streams-maximum-number-of-streams/6571/16
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"

import torch
import torch.distributed as dist

from colossalai.accelerator import get_accelerator
from colossalai.logging import get_dist_logger
from colossalai.utils import set_seed


def launch(
    rank: int,
    world_size: int,
    host: str,
    port: int,
    backend: str = "nccl",
    local_rank: int = None,
    seed: int = 1024,
    verbose: bool = True,
):
    """Initialize the distributed environment for the current process: create the default
    process group, set the device of the current accelerator, and seed every process.

    Args:
        rank (int): Rank for the default process group.
        world_size (int): World size of the default process group.
        host (str): The master address for distributed training.
        port (int): The master port for distributed training.
        backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl``.
            Note that the backend actually used is the communication backend of the current accelerator.
        local_rank (int, optional): Rank of the process on its node, used to set the default device.
            If None, the default device ordinal is calculated automatically. Defaults to None.
        seed (int, optional): Random seed for every process. Defaults to 1024.
        verbose (bool, optional): Whether to print logs. Defaults to True.
    """
    cur_accelerator = get_accelerator()

    backend = cur_accelerator.communication_backend

    # init default process group
    if ":" in host:  # IPv6
        init_method = f"tcp://[{host}]:{port}"
    else:  # IPv4
        init_method = f"tcp://{host}:{port}"
    dist.init_process_group(rank=rank, world_size=world_size, backend=backend, init_method=init_method)

    # set cuda device
    # if local rank is not given, calculate automatically
    if cur_accelerator.support_set_device:
        cur_accelerator.set_device(local_rank)

    set_seed(seed)

    try:
        torch._dynamo.config.optimize_ddp = world_size > 1
    except AttributeError:
        pass

    if verbose:
        logger = get_dist_logger()
        logger.info(f"Distributed environment is initialized, world size: {dist.get_world_size()}", ranks=[0])
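
# ---------------------------------------------------------------------------------------------
# Example (illustrative sketch, not part of the library API): `launch()` can also be driven
# manually, e.g. from torch.multiprocessing.spawn on a single node. The `_worker` helper, the
# address 127.0.0.1 and the port 29500 are assumptions made purely for illustration.
#
#     import torch.multiprocessing as mp
#
#     def _worker(local_rank: int, world_size: int):
#         # on a single node the global rank equals the local rank
#         launch(rank=local_rank, world_size=world_size, host="127.0.0.1", port=29500,
#                local_rank=local_rank)
#         ...  # build the model and run training here
#
#     if __name__ == "__main__":
#         nprocs = torch.cuda.device_count()
#         mp.spawn(_worker, args=(nprocs,), nprocs=nprocs)
# ---------------------------------------------------------------------------------------------
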
""" try: rank = int(os.environ["SLURM_PROCID"]) world_size = int(os.environ["SLURM_NPROCS"]) except KeyError as e: raise RuntimeError( f"Could not find {e} in the SLURM environment, visit https://www.colossalai.org/ for more information on launching with SLURM" ) launch( rank=rank, world_size=world_size, host=host, port=port, backend=backend, seed=seed, verbose=verbose, ) def launch_from_openmpi( host: str, port: int, backend: str = "nccl", seed: int = 1024, verbose: bool = True, ): """A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables set by OpenMPI Args: config (Union[str, dict, Config]): Config file or config file path are both acceptable host (str): The master address for distributed training port (str): The master port for distributed training backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl`` seed (int, optional): Specified random seed for every process. Defaults to 1024. verbose (bool, optional): Whether to print logs. Defaults to True. """ try: rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) local_rank = int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) world_size = int(os.environ["OMPI_COMM_WORLD_SIZE"]) except KeyError as e: raise RuntimeError( f"Could not find {e} in the OpenMPI environment, visit https://www.colossalai.org/ for more information on launching with OpenMPI" ) launch( local_rank=local_rank, rank=rank, world_size=world_size, host=host, port=port, backend=backend, seed=seed, verbose=verbose, ) def launch_from_torch(backend: str = "nccl", seed: int = 1024, verbose: bool = True): """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size from the environment variables set by PyTorch Args: config (Union[str, dict, Config]): Config file or config file path are both acceptable backend (str, optional): Backend for ``torch.distributed``, defaults to ``nccl`` seed (int, optional): Specified random seed for every process. Defaults to 1024. verbose (bool, optional): Whether to print logs. Defaults to True. """ try: rank = int(os.environ["RANK"]) local_rank = int(os.environ["LOCAL_RANK"]) world_size = int(os.environ["WORLD_SIZE"]) host = os.environ["MASTER_ADDR"] port = int(os.environ["MASTER_PORT"]) except KeyError as e: raise RuntimeError( f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch" ) launch( local_rank=local_rank, rank=rank, world_size=world_size, host=host, port=port, backend=backend, seed=seed, verbose=verbose, )