from typing import List, Optional, Union import torch import torch.distributed as dist from colossalai.legacy.global_variables import tensor_parallel_env as env from colossalai.legacy.nn.layer.utils import divide from colossalai.legacy.tensor import ColoTensorSpec, ProcessGroup from colossalai.tensor import ColoTensor GeneralTensor = Union[ColoTensor, torch.Tensor] Number = Union[int, float] def convert_to_colo_tensor(tensor: Optional[GeneralTensor], pg: ProcessGroup) -> Optional[ColoTensor]: if tensor is not None and not isinstance(tensor, ColoTensor): tensor = ColoTensor.from_torch_tensor(tensor, ColoTensorSpec(pg)) return tensor def set_parallel_input(input_parallel: bool): env.parallel_input_1d = input_parallel def get_parallel_input(): return env.parallel_input_1d def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank): index_f = rank * per_partition_vocab_size index_l = index_f + per_partition_vocab_size return index_f, index_l def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size): per_partition_vocab_size = divide(global_vocab_size, world_size) return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank) def _reduce(input_, pg: ProcessGroup): # skip if only one rank involved if pg.tp_world_size() == 1: return input_ assert input_.device.type == "cuda" group = pg.tp_process_group() dist.all_reduce(input_, group=group) return input_ def _split(input_, pg: ProcessGroup, dim=-1): # skip if only one rank involved world_size = pg.tp_world_size() if world_size == 1: return input_ # Split along last dimension. dim_size = input_.size(dim) assert dim_size % world_size == 0, ( f"The dimension to split ({dim_size}) is not a multiple of world size ({world_size}), " f"cannot split tensor evenly" ) tensor_list = torch.split(input_, dim_size // world_size, dim=dim) rank = pg.tp_local_rank() output = tensor_list[rank].contiguous() return output def _gather(input_, pg: ProcessGroup, dim=-1): # skip if only one rank involved world_size = pg.tp_world_size() if world_size == 1: return input_ # all gather rank = pg.tp_local_rank() tensor_list = [torch.empty_like(input_) for _ in range(world_size)] tensor_list[rank] = input_ assert input_.device.type == "cuda" group = pg.tp_process_group() torch.distributed.all_gather(tensor_list, input_, group=group) # concat output = torch.cat(tensor_list, dim=dim).contiguous() return output class _ReduceGrad(torch.autograd.Function): """ Pass the input to the model parallel region. Args: input_: input matrix. process_group: parallel mode. """ @staticmethod def symbolic(graph, input_): return input_ @staticmethod def forward(ctx, input_, process_group): ctx.mode = process_group return input_ @staticmethod def backward(ctx, grad_output): return _reduce(grad_output, ctx.mode), None class _ReduceInput(torch.autograd.Function): """ All-reduce the input from the model parallel region. Args: input_: input matrix. process_group: parallel mode. """ @staticmethod def symbolic(graph, input_): return _reduce(input_) @staticmethod def forward(ctx, input_, process_group): return _reduce(input_, process_group) @staticmethod def backward(ctx, grad_output): return grad_output, None class _SplitForwardGatherBackward(torch.autograd.Function): """ Split the input and keep only the corresponding chuck to the rank. Args: input_: input matrix. process_group: parallel mode. dim: dimension """ @staticmethod def symbolic(graph, input_): return _split(input_) @staticmethod def forward(ctx, input_, process_group, dim): ctx.mode = process_group ctx.dim = dim return _split(input_, process_group, dim) @staticmethod def backward(ctx, grad_output): return _gather(grad_output, ctx.mode, ctx.dim), None, None class _GatherForwardSplitBackward(torch.autograd.Function): """Gather the input from model parallel region and concatenate. Args: input_: input matrix. process_group: parallel mode. dim: dimension """ @staticmethod def symbolic(graph, input_): return _gather(input_) @staticmethod def forward(ctx, input_, process_group, dim): ctx.mode = process_group ctx.dim = dim return _gather(input_, process_group, dim) @staticmethod def backward(ctx, grad_output): return _split(grad_output, ctx.mode, ctx.dim), None, None def reduce_grad(input_, process_group): return _ReduceGrad.apply(input_, process_group) def reduce_input(input_, process_group): return _ReduceInput.apply(input_, process_group) def split_forward_gather_backward(input_, process_group, dim): return _SplitForwardGatherBackward.apply(input_, process_group, dim) def gather_forward_split_backward(input_, process_group, dim): return _GatherForwardSplitBackward.apply(input_, process_group, dim) def _all_to_all(x: torch.Tensor, pg: ProcessGroup, scatter_dim: int, gather_dim: int) -> torch.Tensor: world_size = pg.tp_world_size() if world_size == 1: return x # TODO: enabling mpi backend to support CPU all_to_all assert x.device.type == "cuda", f"Currently, the collective function dual_all_to_all only supports nccl backend" shapes = list(x.size()) shapes[scatter_dim] = shapes[scatter_dim] // world_size scatter_list = [each.contiguous() for each in torch.tensor_split(x, world_size, scatter_dim)] gather_list = [torch.empty(*shapes, dtype=x.dtype, device=x.device) for _ in range(world_size)] torch.distributed.all_to_all(gather_list, scatter_list, group=pg.tp_process_group()) return torch.cat(gather_list, dim=gather_dim).contiguous() class _DualAllToAll(torch.autograd.Function): @staticmethod def forward(ctx, x, pg, scatter_dim, gather_dim): ctx.scatter_dim = scatter_dim ctx.gather_dim = gather_dim ctx.pg = pg return _all_to_all(x, pg, scatter_dim, gather_dim) @staticmethod def backward(ctx, grad): return _all_to_all(grad, ctx.pg, ctx.gather_dim, ctx.scatter_dim), None, None, None def dual_all_to_all(x, pg, scatter_dim: int, gather_dim: int): return _DualAllToAll.apply(x, pg, scatter_dim, gather_dim) # table wise embedding shard def _all_to_all_for_tablewise( x: torch.Tensor, pg: ProcessGroup, scatter_strides: List[int], gather_strides: List[int], forward=True ) -> torch.Tensor: world_size = pg.tp_world_size() rank = pg.tp_local_rank() if world_size == 1: return x assert x.device.type == "cuda", f"Currently, the collective function dual_all_to_all only supports nccl backend" if forward: scatter_list = list(x.split(scatter_strides, 0)) gather_list = [ torch.empty(scatter_strides[rank], gather_strides[i], dtype=x.dtype, device=x.device) for i in range(world_size) ] torch.distributed.all_to_all(gather_list, scatter_list, group=pg.tp_process_group()) return torch.cat(gather_list, 1).contiguous() else: # split on dim 1, lose contiguity scatter_list = [each.contiguous() for each in x.split(scatter_strides, 1)] gather_list = [ torch.empty(gather_strides[i], scatter_strides[rank], dtype=x.dtype, device=x.device) for i in range(world_size) ] torch.distributed.all_to_all(gather_list, scatter_list, group=pg.tp_process_group()) return torch.cat(gather_list, 0).contiguous() class _DualAllToAllForTablewise(torch.autograd.Function): @staticmethod def forward(ctx, x, pg, scatter_strides, gather_strides): ctx.pg = pg ctx.scatter_strides = scatter_strides ctx.gather_strides = gather_strides return _all_to_all_for_tablewise(x, pg, scatter_strides, gather_strides, forward=True) @staticmethod def backward(ctx, grad): return ( _all_to_all_for_tablewise(grad, ctx.pg, ctx.gather_strides, ctx.scatter_strides, forward=False), None, None, None, ) def dual_all_to_all_tablewise(x, pg, scatter_strides, gather_strides): return _DualAllToAllForTablewise.apply(x, pg, scatter_strides, gather_strides)