mirror of https://github.com/InternLM/InternLM

move all2all to utils

parent 07c98c4a39
commit fdd60691d3
@@ -4,7 +4,7 @@ https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/moe/experts.py
 Git commit hash: f3943cf9109226ed3ecf2d5dbb639a11cd925555
 We retain the following license from the original files:
 """
-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Callable, Dict, Optional, Tuple
 
 import torch
 import torch.distributed as dist
@@ -16,6 +16,7 @@ from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
 
 from .base_moe import BaseMoELayer
+from .utils import _AllToAll
 
 # global llm logger
 logger = get_logger(__file__)
@@ -59,30 +60,6 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor:
     return gumbel(shape)
 
 
-# Based on https://github.com/pytorch/pytorch/pull/40762
-class _AllToAll(torch.autograd.Function):
-    """
-    All to all communication
-    """
-
-    @staticmethod
-    def forward(
-        ctx: Any,
-        # TODO: replace with DS process group
-        group: torch.distributed.ProcessGroup,
-        inputs: Tensor,
-    ) -> Tensor:  # type: ignore
-        ctx.group = group
-        inputs = inputs.contiguous()
-        output = torch.empty_like(inputs)
-        dist.all_to_all_single(output, inputs, group=group)
-        return output
-
-    @staticmethod
-    def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor]:
-        return (None, _AllToAll.apply(ctx.group, *grad_output))
-
-
 # einsum rewrites are on par or more performant
 # switch can be bubbled up in future
 USE_EINSUM = True
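The class removed here (and re-added unchanged in the new .utils module below) wraps dist.all_to_all_single in an autograd.Function whose backward pass is simply another all-to-all on the incoming gradient. A minimal standalone sketch of why that is valid, assuming torch.distributed is already initialised and every rank contributes equal-sized splits; the helper name round_trip_check is hypothetical and not part of this commit:

import torch
import torch.distributed as dist


def round_trip_check(x: torch.Tensor, group=None) -> bool:
    # With equal splits, all_to_all_single permutes row-blocks across ranks:
    # rank i's j-th block lands on rank j as its i-th block. Applying the same
    # collective twice therefore restores the original tensor, which is why
    # _AllToAll.backward can reuse _AllToAll.apply on the gradient.
    out = torch.empty_like(x)
    dist.all_to_all_single(out, x.contiguous(), group=group)
    back = torch.empty_like(out)
    dist.all_to_all_single(back, out, group=group)
    return torch.equal(back, x)

In the MoE layer this property means the gradient of a token dispatch is just the reverse exchange over the same process group.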
@@ -0,0 +1,29 @@
+from typing import Any, Tuple
+
+import torch
+import torch.distributed as dist
+from torch import Tensor
+
+
+# Based on https://github.com/pytorch/pytorch/pull/40762
+class _AllToAll(torch.autograd.Function):
+    """
+    All to all communication
+    """
+
+    @staticmethod
+    def forward(
+        ctx: Any,
+        # TODO: replace with DS process group
+        group: torch.distributed.ProcessGroup,
+        inputs: Tensor,
+    ) -> Tensor:  # type: ignore
+        ctx.group = group
+        inputs = inputs.contiguous()
+        output = torch.empty_like(inputs)
+        dist.all_to_all_single(output, inputs, group=group)
+        return output
+
+    @staticmethod
+    def backward(ctx: Any, *grad_output: Tensor) -> Tuple[None, Tensor]:
+        return (None, _AllToAll.apply(ctx.group, *grad_output))
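For context, a sketch of how the relocated _AllToAll is typically driven from an MoE layer after this change: routed tokens are exchanged across the expert-parallel group before the local experts run, then exchanged back afterwards. The function and argument names below (dispatch_to_experts, experts, ep_group) are illustrative assumptions rather than code from this commit; only `from .utils import _AllToAll` matches the diff:

import torch
import torch.distributed as dist

from .utils import _AllToAll  # as added by this commit


def dispatch_to_experts(dispatched_input: torch.Tensor, experts, ep_group: dist.ProcessGroup) -> torch.Tensor:
    # Send each rank's routed tokens to the ranks that own the target experts.
    expert_input = _AllToAll.apply(ep_group, dispatched_input)
    # Run the local experts on the tokens received from every rank.
    expert_output = experts(expert_input)
    # Reverse the exchange so every token's output returns to its source rank.
    return _AllToAll.apply(ep_group, expert_output)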