from typing import List import torch.distributed as dist from torch.distributed import ProcessGroup class ProcessGroupManager: """ ProcessGroupManager is used to manage the process groups in the cluster. There are some terms used in this class: - pg: the short name for process group - pg_name: the name of the process group - pg_size: the world size of the process group - rank: the rank of the current process in the process group - world_size: the total number of processes in the process group """ def __init__(self): self.pg_store = dict() def create_process_group(self, name: str, ranks: List[int], backend: str = "nccl") -> ProcessGroup: """ Get a process group by name. If the process group does not exist, it will be created. Args: name (str): name of the process group ranks (List[int]): ranks of the process group backend (str, optional): backend of the process group. Defaults to 'nccl'. Returns: ProcessGroup: the process group """ if name not in self.pg_store: pg = dist.new_group(ranks=ranks, backend=backend) self.pg_store[name] = pg return pg else: raise ValueError(f"Process group {name} already exists.") def get(self, name: str) -> ProcessGroup: """ Get a process group by name. Args: name (str): name of the process group Returns: ProcessGroup: the process group """ if name in self.pg_store: return self.pg_store[name] else: raise ValueError(f"Process group {name} does not exist.") def destroy(self, name: str) -> None: """ Destroy a process group by name. Args: name (str): name of the process group """ if name in self.pg_store: dist.destroy_process_group(self.pg_store[name]) del self.pg_store[name] else: raise ValueError(f"Process group {name} does not exist.") def destroy_all(self) -> None: """ Destroy all process groups. """ for name in self.pg_store: dist.destroy_process_group(self.pg_store[name]) self.pg_store.clear()