remove some unnecessary code

2023-08-02 18:24:03 +08:00 · 2023-08-02 18:24:03 +08:00 · 3dabb6d308
parent 3b1c04d7be
commit 3dabb6d308
3 changed files with 14 additions and 145 deletions
--- a/internlm/model/linear.py
+++ b/internlm/model/linear.py
@ -6,11 +6,12 @@ from typing import Optional
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.distributed import ProcessGroup
+from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
+from flash_attn.utils.distributed import reduce_scatter, all_reduce

 from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.utils import all_reduce, fused_dense_func_torch, reduce_scatter
+from internlm.model.utils import fused_dense_func_torch


 class ScaleColumnParallelLinear(nn.Linear):
@ -109,23 +110,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
        )


-class ColumnParallelLinear(nn.Linear):
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-        process_group: ProcessGroup,
-        bias: bool = True,
-        sequence_parallel=True,
-        device=None,
-        dtype=None,
-    ) -> None:
-        world_size = torch.distributed.get_world_size(process_group)
-        if out_features % world_size != 0:
-            raise ValueError(f"out_features ({out_features}) must be divisible by " f"world_size ({world_size})")
-        super().__init__(in_features, out_features // world_size, bias=bias, device=device, dtype=dtype)
-        self.process_group = process_group
-        self.sequence_parallel = sequence_parallel
+class ColumnParallelLinearTorch(ColumnParallelLinear):

    def forward(self, x):
        # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
@ -137,25 +122,7 @@ class ColumnParallelLinear(nn.Linear):
        )


-class RowParallelLinear(nn.Linear):
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-        process_group: ProcessGroup,
-        bias: bool = True,
-        sequence_parallel=True,
-        device=None,
-        dtype=None,
-    ) -> None:
-        world_size = torch.distributed.get_world_size(process_group)
-        rank = torch.distributed.get_rank(process_group)
-        if in_features % world_size != 0:
-            raise ValueError(f"in_features ({in_features}) must be divisible by " f"world_size ({world_size})")
-        # Only rank 0 will have bias
-        super().__init__(in_features // world_size, out_features, bias=bias and rank == 0, device=device, dtype=dtype)
-        self.process_group = process_group
-        self.sequence_parallel = sequence_parallel
+class RowParallelLinearTorch(RowParallelLinear):

    def forward(self, x):
        """
@ -198,7 +165,7 @@ class FeedForward(nn.Module):

        hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of)

-        self.w1 = ColumnParallelLinear(
+        self.w1 = ColumnParallelLinearTorch(
            in_features,
            hidden_features,
            process_group,
@ -207,10 +174,10 @@ class FeedForward(nn.Module):
            device=device,
            dtype=dtype,
        )
-        self.w2 = ColumnParallelLinear(
+        self.w2 = ColumnParallelLinearTorch(
            in_features, hidden_features, process_group, bias, sequence_parallel=False, device=device, dtype=dtype
        )
-        self.w3 = RowParallelLinear(
+        self.w3 = RowParallelLinearTorch(
            hidden_features,
            out_features,
            process_group,
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@ -17,7 +17,7 @@ from torch import nn
 from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
 from internlm.core.context import global_context as gpc
 from internlm.model.embedding import RotaryEmbedding
-from internlm.model.linear import ColumnParallelLinear, RowParallelLinear
+from internlm.model.linear import ColumnParallelLinearTorch, RowParallelLinearTorch


 class MHA(nn.Module):
@ -78,7 +78,7 @@ class MHA(nn.Module):
            self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, scale_base=rotary_emb_scale_base, device=device)

        # notice here should change bias=True
-        self.Wqkv = ColumnParallelLinear(
+        self.Wqkv = ColumnParallelLinearTorch(
            embed_dim,
            3 * embed_dim,
            process_group,
@ -95,7 +95,7 @@ class MHA(nn.Module):
        )

        # output projection always have the bias (for now)
-        self.out_proj = RowParallelLinear(
+        self.out_proj = RowParallelLinearTorch(
            embed_dim, embed_dim, process_group, sequence_parallel=sequence_parallel, **factory_kwargs
        )
        # need to assign tp attribute so that internlm know it is tensor parallel module
--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@ -7,8 +7,9 @@ import torch
 import torch.nn.functional as F
 from flash_attn.ops.fused_dense import FusedDenseFunc
 from torch import Tensor
-from torch.cuda.amp import custom_bwd, custom_fwd
+from torch.cuda.amp import custom_bwd
 from torch.distributed import ProcessGroup
+from flash_attn.utils.distributed import all_gather_raw, reduce_scatter_raw, all_reduce_raw

 from internlm.core.context import global_context as gpc

@ -80,67 +81,6 @@ def gather_forward_split_backward(input_, parallel_mode, dim):
    return _GatherForwardSplitBackward.apply(input_, parallel_mode, dim)


-# the following communicators are adapted from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/distributed.py
-def all_gather_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
-    world_size = torch.distributed.get_world_size(process_group)
-    output = torch.empty(world_size * input_.shape[0], *input_.shape[1:], dtype=input_.dtype, device=input_.device)
-    handle = torch.distributed.all_gather_into_tensor(
-        output, input_.contiguous(), group=process_group, async_op=async_op
-    )
-    return output, handle
-
-
-def reduce_scatter_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
-    world_size = torch.distributed.get_world_size(process_group)
-    assert input_.shape[0] % world_size == 0
-    output = torch.empty(input_.shape[0] // world_size, *input_.shape[1:], dtype=input_.dtype, device=input_.device)
-    handle = torch.distributed.reduce_scatter_tensor(
-        output, input_.contiguous(), group=process_group, async_op=async_op
-    )
-    return output, handle
-
-
-def all_reduce_raw(input_: Tensor, process_group: ProcessGroup, async_op: bool = False):
-    input_ = input_.contiguous()
-    handle = torch.distributed.all_reduce(input_, group=process_group, async_op=async_op)
-    return input_, handle
-
-
-class ReduceScatterFunc(torch.autograd.Function):
-    """Reduce scatter the input from the sequence parallel region and concatenate."""
-
-    @staticmethod
-    def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
-        ctx.process_group = process_group
-        output, _ = reduce_scatter_raw(input_, process_group)
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output: Tensor):
-        grad_input, _ = all_gather_raw(grad_output, ctx.process_group)
-        return grad_input, None
-
-
-class AllReduceFunc(torch.autograd.Function):
-    """Gather the input from sequence parallel region and concatenate."""
-
-    @staticmethod
-    def forward(ctx, input_: Tensor, process_group: ProcessGroup) -> Tensor:
-        ctx.process_group = process_group
-        output, _ = all_reduce_raw(input_, process_group)
-        return output
-
-    @staticmethod
-    def backward(ctx, grad_output: Tensor):
-        return grad_output, None
-
-
-# Supports autograd, but does not support async
-reduce_scatter = ReduceScatterFunc.apply
-# Supports autograd, but does not support async
-all_reduce = AllReduceFunc.apply
-
-
 def linear_bias_wgrad_torch(input, grad_output, has_d_bias):
    assert input.dtype == grad_output.dtype
    grad_weight = torch.matmul(grad_output.t(), input)
@ -149,45 +89,7 @@ def linear_bias_wgrad_torch(input, grad_output, has_d_bias):


 # adpated from https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/ops/fused_dense.py
-class FusedDenseFuncTorch(torch.autograd.Function):
-    @staticmethod
-    @custom_fwd
-    def forward(ctx, x, weight, bias, return_residual=False, process_group=None, sequence_parallel=True):
-        """
-        If process_group is not None and sequence_parallel=True, we're doing Tensor Parallel
-        with sequence parallelism: we do an all_gather_raw of x before doing the matmul.
-        """
-        ctx.compute_weight_gradient = weight.requires_grad
-        ctx.return_residual = return_residual
-        ctx.process_group = process_group
-        ctx.sequence_parallel = sequence_parallel
-
-        if torch.is_autocast_enabled():
-            x = x.to(dtype=torch.get_autocast_gpu_dtype())
-        x = x.contiguous()
-        if process_group is not None and sequence_parallel:
-            # We want to kick off the all_gather early, before weight dtype conversion
-            total_x, handle_x = all_gather_raw(x, process_group, async_op=True)
-        else:
-            total_x = x
-
-        if torch.is_autocast_enabled():
-            weight = weight.to(dtype=torch.get_autocast_gpu_dtype())
-            bias = bias.to(dtype=torch.get_autocast_gpu_dtype()) if bias is not None else None
-        weight = weight.contiguous()
-        if process_group is not None and sequence_parallel:
-            handle_x.wait()
-        batch_shape, n = total_x.shape[:-1], total_x.shape[-1]
-        batch_dim = batch_shape.numel()
-        # https://github.com/pytorch/pytorch/blob/5b51849b48a7dbccd297286cc0110def4706f9e7/aten/src/ATen/native/cuda/Blas.cpp#L174
-        if min(batch_dim, n, *weight.shape) > 65535 * 32:
-            raise RuntimeError("fused_dense only supports matrix dims <= 2M")
-        output = F.linear(total_x, weight, bias)
-        if ctx.compute_weight_gradient:
-            ctx.save_for_backward(x, weight)
-        else:
-            ctx.save_for_backward(weight)
-        return output if not return_residual else (output, x)
+class FusedDenseFuncTorch(FusedDenseFunc):

    @staticmethod
    @custom_bwd