mirror of https://github.com/InternLM/InternLM
Add shared embedding weight support for MoE
parent eeef07934a
commit bf6dbf07fa
@@ -129,6 +129,7 @@ model = dict(
    checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
    num_attention_heads=NUM_ATTENTION_HEAD,
    embed_split_hidden=True,
    tie_embeddings_and_output_weights=False,
    vocab_size=VOCAB_SIZE,
    embed_grad_scale=1,
    parallel_output=True,
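To actually enable the new path, a user flips this flag in the model config. A minimal sketch, assuming the surrounding fields keep the values shown in the hunk above; note that the model-level hunk further down asserts that weight tying cannot be combined with embed_split_hidden=True:

    model = dict(
        checkpoint=False,
        num_attention_heads=NUM_ATTENTION_HEAD,
        embed_split_hidden=False,  # required: tying is rejected when embed_split_hidden=True
        tie_embeddings_and_output_weights=True,  # share the embedding matrix with the output head
        vocab_size=VOCAB_SIZE,
        embed_grad_scale=1,
        parallel_output=True,
    )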
@@ -48,6 +48,9 @@ class ParallelMode(Enum):
    # expert data parallel
    EXPERT_DATA = "expert_data"

    # embedding share
    EMBEDDING = "embedding"

    # dummy mode, only used during mode construction
    DUMMY = "dummy"
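Once the pipeline initializer below registers this mode, other components can look the group up through the global context. A minimal sketch, assuming gpc has already been initialized by the launcher:

    from internlm.core.context import global_context as gpc
    from internlm.core.context.process_group_initializer import ParallelMode

    # process group containing the first and last stage of the current pipeline
    embedding_group = gpc.get_group(ParallelMode.EMBEDDING)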
@@ -236,8 +239,8 @@ class Initializer_Pipeline(ProcessGroupInitializer):
        process_group = None
        cpu_group = None
        group_world_size = None
        mode = ParallelMode.PIPELINE

        groups = []
        for i in range(self.data_parallel_size):
            for j in range(self.pipeline_stage_size):
                ranks = list(
@@ -265,7 +268,37 @@ class Initializer_Pipeline(ProcessGroupInitializer):
                    cpu_group = group_cpu
                    ranks_in_group = ranks

        return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
                groups.append(
                    (local_rank, group_world_size, process_group, cpu_group, ranks_in_group, ParallelMode.PIPELINE)
                )

                # create embedding communication group
                if len(ranks) > 1:
                    embedding_ranks = [ranks[0], ranks[-1]]
                else:
                    embedding_ranks = ranks
                embed_group = dist.new_group(embedding_ranks, timeout=LLM_NCCL_TIMEOUT)
                if use_cpu:
                    group_cpu = (
                        dist.new_group(embedding_ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
                        if dist.get_backend() != "gloo"
                        else embed_group
                    )
                else:
                    group_cpu = None

                if self.rank in ranks:
                    local_rank = ranks.index(self.rank)
                    group_world_size = len(embedding_ranks)
                    process_group = embed_group
                    cpu_group = group_cpu
                    ranks_in_group = embedding_ranks

                groups.append(
                    (local_rank, group_world_size, process_group, cpu_group, ranks_in_group, ParallelMode.EMBEDDING)
                )

        return groups


class Initializer_Tensor(ProcessGroupInitializer):
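For intuition, the embedding group simply pairs the first and last rank of each pipeline group. A small standalone sketch of that selection logic (the helper name and example rank lists are purely illustrative):

    def embedding_ranks_of(pipeline_ranks):
        # the first and last pipeline stages share the embedding weight,
        # so only those two ranks need to communicate
        if len(pipeline_ranks) > 1:
            return [pipeline_ranks[0], pipeline_ranks[-1]]
        return pipeline_ranks

    assert embedding_ranks_of([0, 2, 4, 6]) == [0, 6]
    assert embedding_ranks_of([1]) == [1]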
@@ -9,6 +9,7 @@ import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from internlm.core.context import global_context as gpc
from internlm.core.context.process_group_initializer import ParallelMode


class BaseGradientHandler(ABC):
@@ -74,3 +75,26 @@ class PipelineSharedModuleGradientHandler(BaseGradientHandler):
                    dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                        buf.copy_(synced)


class EmbeddingSharedModuleGradientHandler(BaseGradientHandler):
    """A helper class to handle all-reduce operations in embedding share groups.
    An all-reduce collective communication is performed in
    :func:`handle_gradient` between the first pipeline stage and the last pipeline stage.
    For better performance, it bucketizes the gradients of all parameters that are
    the same type to improve the efficiency of communication.

    Args:
        model (Module): Model where the gradients accumulate.
        optimizer (Optimizer): Optimizer for updating the parameters.
    """

    def handle_gradient(self):
        """A method running an all-reduce operation in the embedding share group."""
        if gpc.is_pipeline_first_stage() or gpc.is_pipeline_last_stage():
            weight = self._model.model.shared_embedding_weight()
            grad = weight.grad
            # enabling ZeRO can leave grad as None
            if grad is None:
                grad = torch.zeros_like(weight)
            torch.distributed.all_reduce(grad, group=gpc.get_group(parallel_mode=ParallelMode.EMBEDDING))
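Conceptually the handler reduces to this pattern: the first and last pipeline stages each hold a copy of the embedding matrix, and their gradients are summed over the EMBEDDING group so both copies see the same update. A minimal standalone sketch with plain torch.distributed (the helper name and the explicit group argument are assumptions for illustration):

    import torch
    import torch.distributed as dist

    def sync_tied_embedding_grad(weight: torch.nn.Parameter, embedding_group: dist.ProcessGroup) -> None:
        # Both stages must join the collective; if ZeRO has already cleared .grad,
        # contribute a zero tensor so the all-reduce stays matched across ranks.
        grad = weight.grad if weight.grad is not None else torch.zeros_like(weight)
        dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=embedding_group)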
@@ -14,7 +14,10 @@ from torch.utils.data import DataLoader
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.engine import Engine
from internlm.core.gradient_handler import PipelineSharedModuleGradientHandler
from internlm.core.gradient_handler import (
    EmbeddingSharedModuleGradientHandler,
    PipelineSharedModuleGradientHandler,
)
from internlm.core.scheduler import (
    InterleavedPipelineScheduler,
    NonPipelineScheduler,
@@ -68,8 +71,12 @@ def initialize_trainer(
    assert isinstance(optimizer, BaseOptimizer), "optimizer must be instance of BaseOptimizer"

    # gradient handler, only support PipelineSharedModuleGradientHandler now
    # TODO: can refactor code here
    if gpc.is_using_pp():
        gpc.config.gradient_handler = [dict(type="PipelineSharedModuleGradientHandler")]
        gpc.config.gradient_handler = [
            dict(type="PipelineSharedModuleGradientHandler"),
            dict(type="EmbeddingSharedModuleGradientHandler"),
        ]
    gradient_handler_cfg = gpc.config.get("gradient_handler", [])
    gradient_handlers = []
    assert isinstance(gradient_handler_cfg, list), f"gradient_handler must be list but got {type(gradient_handler_cfg)}"
@@ -77,6 +84,14 @@ def initialize_trainer(
        if isinstance(config, dict) and config.get("type") == "PipelineSharedModuleGradientHandler":
            handler = PipelineSharedModuleGradientHandler(model=model, optimizer=optimizer)
            gradient_handlers.append(handler)
        if (
            isinstance(config, dict)
            and config.get("type") == "EmbeddingSharedModuleGradientHandler"
            and gpc.config.model.get("tie_embeddings_and_output_weights", False)
            and gpc.pipeline_parallel_size > 1
        ):
            handler = EmbeddingSharedModuleGradientHandler(model=model, optimizer=optimizer)
            gradient_handlers.append(handler)

    # initialize scheduler for trainer
    scheduler = None
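With pipeline parallelism enabled, the gradient-handler config therefore carries both entries, and the embedding handler is only instantiated when weight tying is on and there is more than one pipeline stage. A sketch of the resulting configuration:

    gradient_handler = [
        dict(type="PipelineSharedModuleGradientHandler"),
        # only instantiated when tie_embeddings_and_output_weights=True and pipeline_parallel_size > 1
        dict(type="EmbeddingSharedModuleGradientHandler"),
    ]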
@@ -293,6 +293,8 @@ def args_sanity_check():
        model._add_item("moe_use_residual", False)
    if "moe_gate_k" not in model:
        model._add_item("moe_gate_k", 2)
    if "tie_embeddings_and_output_weights" not in model:
        model._add_item("tie_embeddings_and_output_weights", False)
    assert not (
        gpc.config.model.num_experts > 1 and gpc.config.parallel.zero1.fsdp
    ), "FSDP does not support num_experts > 1"
@@ -40,22 +40,30 @@ class ScaleColumnParallelLinear(nn.Linear):
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
        weight_scale: int = 1,
        skip_weight_alloction: bool = False,
    ) -> None:
        world_size = torch.distributed.get_world_size(process_group)
        if out_features % world_size != 0:
            raise ValueError(f"out_features ({out_features}) must be divisible by " f"world_size ({world_size})")
        super().__init__(in_features, out_features // world_size, bias=bias, device=device, dtype=dtype)
        if skip_weight_alloction:
            del self.weight
            self.register_parameter("weight", None)
        self.process_group = process_group
        self.weight_scale = weight_scale

    def forward(self, input):  # pylint: disable=W0622
    def forward(self, input, shared_weight: Optional[torch.Tensor] = None):  # pylint: disable=W0622
        # If self.sequence_parallel is True, we're doing Tensor Parallel with sequence parallelism:
        # we do an all_gather of x before doing the matmul.
        # If not, then the input is already gathered.
        if shared_weight is None:
            if self.weight is None:
                raise RuntimeError("weight was not given in forward pass " "and skip_weight_allocation is True.")
            shared_weight = self.weight
        if self.weight_scale != 1:
            weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach()
            weight = shared_weight * self.weight_scale + (1 - self.weight_scale) * shared_weight.detach()
        else:
            weight = self.weight
            weight = shared_weight
        return fused_dense_func_torch(
            input,
            weight,
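The new forward signature lets the caller supply the embedding matrix in place of the head's own (possibly unallocated) weight. A minimal usage sketch; HIDDEN_SIZE, VOCAB_SIZE, hidden_states and embedding are placeholders, not names taken from this diff:

    # the head is built without its own weight when embeddings are tied
    head = ScaleColumnParallelLinear(
        in_features=HIDDEN_SIZE,
        out_features=VOCAB_SIZE,
        process_group=gpc.get_group(ParallelMode.TENSOR),
        bias=False,
        skip_weight_alloction=True,
    )
    # at forward time, reuse the embedding matrix as the output projection
    logits = head(hidden_states, shared_weight=embedding.word_embeddings.weight)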
@@ -91,7 +99,11 @@ class RewardModelLinear(ScaleColumnParallelLinear):
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
        weight_scale: int = 1,
        skip_weight_alloction: bool = False,
    ) -> None:
        # TODO: RewardModelLinear is not used for now
        assert not skip_weight_alloction, "shared weight not support here for now"

        super().__init__(in_features, out_features, process_group, bias, device, dtype, weight_scale)
        torch.distributed.broadcast(self.weight, gpc.get_ranks_in_group(ParallelMode.TENSOR)[0], process_group)
        if bias:
@@ -102,7 +114,10 @@ class RewardModelLinear(ScaleColumnParallelLinear):
        # we do an all_gather of x before doing the matmul.
        # If not, then the input is already gathered.
        if self.weight_scale != 1:
            weight = self.weight * self.weight_scale + (1 - self.weight_scale) * self.weight.detach()
            weight = (
                self.weight * self.weight_scale
                + (1 - self.weight_scale) * self.weight.detach()  # pylint: disable=not-callable
            )
        else:
            weight = self.weight
        return fused_dense_func_torch(
@@ -331,6 +331,7 @@ class PackedFlashInternLm1D(nn.Module):
        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
            (https://arxiv.org/abs/2201.05596) layer.
        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
    """

    def __init__(
@@ -370,9 +371,15 @@ class PackedFlashInternLm1D(nn.Module):
        moe_drop_tokens: bool = True,
        moe_use_rts: bool = True,
        moe_use_residual: bool = False,
        tie_embeddings_and_output_weights: bool = True,
    ):
        super().__init__()

        assert not (
            embed_split_hidden and tie_embeddings_and_output_weights
        ), "shared embedding weights is not supported when embed_split_hidden is True."
        self.tie_embeddings_and_output_weights = tie_embeddings_and_output_weights

        checkpoint_layer_num = int(num_layers * checkpoint)

        if is_reward:
@@ -446,6 +453,7 @@
            device=device,
            dtype=dtype,
            weight_scale=embed_grad_scale,
            skip_weight_alloction=self.tie_embeddings_and_output_weights,
        )
        for _, param in self.head.named_parameters():
            normal_(std=0.0052)(param)
@@ -453,6 +461,9 @@
                setattr(param, IS_TENSOR_PARALLEL, True)
        self.parallel_output = parallel_output

        if self.tie_embeddings_and_output_weights:
            self.initialize_word_embeddings(hidden_size, vocab_size, dtype, device)

    def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None):
        # attention_mask: compute attention on the places where the value is 1
        # old condition may fail when using shared embedding
@@ -491,12 +502,79 @@
        if hasattr(self, "norm"):
            hidden_states = self.norm(hidden_states.float())
        if hasattr(self, "head"):
            hidden_states = self.head(hidden_states)
            if self.tie_embeddings_and_output_weights:
                hidden_states = self.head(hidden_states, self.shared_embedding_weight())
            else:
                hidden_states = self.head(hidden_states)

        if not self.parallel_output:
            hidden_states = gather_forward_split_backward(hidden_states, ParallelMode.TENSOR, dim=-1)
        return hidden_states, moe_losses

    def shared_embedding_weight(self):
        if not self.tie_embeddings_and_output_weights:
            raise Exception(
                "shared_embedding_weight() called for last stage, but share_embeddings_and_output_weights is false"
            )
        assert isinstance(self.embedding, ParallelGPT2Embeddings)

        return self.embedding.word_embeddings.weight

    # TODO: refactor code
    def initialize_word_embeddings(
        self,
        hidden_size: int = 768,
        vocab_size: int = 50304,
        dtype: torch.dtype = torch.float,
        device: Optional[torch.device] = None,
    ):
        if not self.tie_embeddings_and_output_weights:
            raise Exception("initialize_word_embeddings() was called but " "tie_embeddings_and_output_weights is false")

        # This function just initializes the word embeddings in the final stage
        # when we are using pipeline parallelism. Nothing to do if we aren't
        # using pipeline parallelism.
        if gpc.get_world_size(ParallelMode.PIPELINE) == 1:
            return

        # Parameters are shared between the word embeddings layers, and the
        # heads at the end of the model. In a pipelined setup with more than
        # one stage, the initial embedding layer and the head are on different
        # workers, so we do the following:
        # 1. Create a second copy of word_embeddings on the last stage, with
        #    initial parameters of 0.0.
        # 2. Do an all-reduce between the first and last stage to ensure that
        #    the two copies of word_embeddings start off with the same
        #    parameter values.
        # 3. In the training loop, perform an all-reduce between the grads of
        #    the two word_embeddings layers to ensure that every applied weight
        #    update is the same on both stages.
        if gpc.is_pipeline_last_stage():
            assert not gpc.is_pipeline_first_stage()
            # set word_embeddings weights to 0 here, then copy first
            # stage's weights using all_reduce below.
            self.embedding = ParallelGPT2Embeddings(
                embed_dim=hidden_size,
                vocab_size=vocab_size,
                max_position_embeddings=-1,
                process_group=gpc.get_group(ParallelMode.TENSOR),
                padding_idx=None,
                sequence_parallel=gpc.config.parallel.sequence_parallel,
                device=device,
                dtype=dtype,
            )
            for _, param in self.embedding.named_parameters():
                if gpc.get_world_size(ParallelMode.TENSOR) > 1:
                    setattr(param, IS_TENSOR_PARALLEL, True)
            self.shared_embedding_weight().data.fill_(0)

        # Ensure that first and last stages have the same initial parameter
        # values.
        if gpc.is_pipeline_first_stage() or gpc.is_pipeline_last_stage():
            torch.distributed.all_reduce(
                self.shared_embedding_weight().data, group=gpc.get_group(ParallelMode.EMBEDDING)
            )


def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"), **kwargs):
    """
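The three-step scheme described in the comment above can be exercised in isolation: zero-initialize the last stage's copy, all-reduce the weights once at start-up, then all-reduce the gradients every step. A minimal two-process sketch using plain torch.distributed on CPU (process-group setup, tensor sizes and the toy update are assumptions for illustration):

    import os
    import torch
    import torch.distributed as dist
    import torch.multiprocessing as mp

    def run(rank: int, world_size: int) -> None:
        os.environ["MASTER_ADDR"] = "127.0.0.1"
        os.environ["MASTER_PORT"] = "29511"
        dist.init_process_group("gloo", rank=rank, world_size=world_size)

        # rank 0 plays the first pipeline stage, rank 1 the last stage
        # step 1: the last stage's copy starts at zero
        weight = torch.randn(4, 2) if rank == 0 else torch.zeros(4, 2)

        # step 2: all-reduce the weights so both copies start identical
        dist.all_reduce(weight)

        # step 3 (every training step): all-reduce the gradients so both
        # copies receive exactly the same update
        grad = torch.full_like(weight, float(rank + 1))
        dist.all_reduce(grad)
        weight -= 0.1 * grad

        dist.destroy_process_group()

    if __name__ == "__main__":
        mp.spawn(run, args=(2,), nprocs=2)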
@@ -572,6 +650,7 @@ def build_model_with_moe_cfg(
    moe_drop_tokens: bool = True,
    moe_use_rts: bool = True,
    moe_use_residual: bool = False,
    tie_embeddings_and_output_weights=False,
):
    """
    Build model with config.
@@ -613,6 +692,7 @@
        moe_use_rts (bool, optional): default=True, whether to use Random Token Selection.
        moe_use_residual (bool, optional): default=False, make this MoE layer a Residual MoE
            (https://arxiv.org/abs/2201.05596) layer.
        tie_embeddings_and_output_weights: embedding and output layer share the same weight.
    """

    cfg = dict(
@@ -646,6 +726,7 @@
        moe_drop_tokens=moe_drop_tokens,
        moe_use_rts=moe_use_rts,
        moe_use_residual=moe_use_residual,
        tie_embeddings_and_output_weights=tie_embeddings_and_output_weights,
    )

    return _build_generic_model_1d(num_layers=num_layers, num_chunks=num_chunks, **cfg)
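Putting the pieces together, a caller threads the new flag through the builder. A minimal sketch; every argument except tie_embeddings_and_output_weights is a placeholder assumed from the surrounding config, not taken from this diff:

    model = build_model_with_moe_cfg(
        num_layers=NUM_LAYER,                    # assumed builder kwarg
        hidden_size=HIDDEN_SIZE,                 # assumed builder kwarg
        vocab_size=VOCAB_SIZE,                   # assumed builder kwarg
        num_attention_heads=NUM_ATTENTION_HEAD,  # assumed builder kwarg
        embed_split_hidden=False,                # tying is rejected when embed_split_hidden=True
        tie_embeddings_and_output_weights=True,
    )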