[pipeline] Bert pipeline for shardformer and its tests (#4197)

* add pipeline forward * complete pipeline forward check * fix bert forward without pipeline * fix comments * discard useless line * add todo * clean prints * fix distribute layers
2023-07-10 13:58:58 +08:00 · 2023-07-10 13:58:58 +08:00 · 1094e0f0d3
parent 890774b2fb
commit 1094e0f0d3
5 changed files with 259 additions and 11 deletions
--- a/colossalai/shardformer/policies/base_policy.py
+++ b/colossalai/shardformer/policies/base_policy.py
@ -191,7 +191,7 @@ class Policy(ABC):
        # deal with the rest layers
        if remainder > 0:
-            start_position = num_layers // 2 - remainder // 2
+            start_position = num_stages // 2 - remainder // 2
            for i in range(start_position, start_position + remainder):
                layers_per_stage[i] += 1
        return layers_per_stage
--- a/colossalai/shardformer/policies/bert.py
+++ b/colossalai/shardformer/policies/bert.py
@ -13,6 +13,8 @@ from transformers.modeling_outputs import (
    CausalLMOutputWithCrossAttentions,
 )
 from transformers.models.bert.modeling_bert import (
    BertForMaskedLM,
    BertForNextSentencePrediction,
    BertForPreTraining,
    BertForPreTrainingOutput,
    BertLMHeadModel,
@ -135,7 +137,6 @@ class BertPolicy(Policy):
            ],
                                                        policy=policy,
                                                        target_key=BertLayer)
            # handle embedding layer
            self.append_or_create_submodule_replacement(
                description=[SubModuleReplacementDescription(
@ -144,6 +145,7 @@ class BertPolicy(Policy):
                )],
                policy=policy,
                target_key=BertEmbeddings)
        return policy
    def add_lm_head_policy(self, base_policy):
@ -177,6 +179,15 @@ class BertModelPolicy(BertPolicy):
    def __init__(self) -> None:
        super().__init__()
    def module_policy(self):
        module_policy = super().module_policy()
        from transformers.models.bert.modeling_bert import BertModel
        if self.pipeline_stage_manager:
            # set None as default
            module_policy[BertModel] = ModulePolicyDescription(
                method_replacement={'forward': partial(bert_model_forward, stage_manager=self.pipeline_stage_manager)})
        return module_policy
    def get_held_layers(self) -> List[Module]:
        """Get pipeline layers for current stage."""
        module = self.model
@ -444,6 +455,13 @@ def bert_model_forward(
            raise ValueError("You have to specify either input_ids or inputs_embeds")
        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
    else:
        input_shape = hidden_states.size()[:-1]
        batch_size, seq_length = input_shape
@ -466,14 +484,6 @@ def bert_model_forward(
    if attention_mask is None:
        attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
    if token_type_ids is None:
        if hasattr(self.embeddings, "token_type_ids"):
            buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
            buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
            token_type_ids = buffered_token_type_ids_expanded
        else:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
    # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
    # ourselves in which case we just need to make it broadcastable to all heads.
    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
@ -778,3 +788,131 @@ def bert_lmhead_forward(self: BertLMHeadModel,
        hidden_states = outputs.get('hidden_states')
        # intermediate stage always return dict
        return {'hidden_states': hidden_states}
 def bert_for_masked_lm_forward(
    self: BertForMaskedLM,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    encoder_hidden_states: Optional[torch.Tensor] = None,
    encoder_attention_mask: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    hidden_states: Optional[torch.Tensor] = None,
    stage_manager: Optional[PipelineStageManager] = None,
 ):
    #-> Union[Tuple[torch.Tensor], MaskedLMOutput]:
    r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
    pass
 def bert_for_next_sentence_prediction_forward(
    self: BertForNextSentencePrediction,
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    token_type_ids: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    inputs_embeds: Optional[torch.Tensor] = None,
    labels: Optional[torch.Tensor] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    hidden_states: Optional[torch.Tensor] = None,
    stage_manager: Optional[PipelineStageManager] = None,
    **kwargs,
 ):
    #-> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]:
    r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see `input_ids` docstring). Indices should be in `[0, 1]`:
            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.
        Returns:
        Example:
        ```python
        >>> from transformers import AutoTokenizer, BertForNextSentencePrediction
        >>> import torch
        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        >>> model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        """
    if "next_sentence_label" in kwargs:
        warnings.warn(
            "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
            " `labels` instead.",
            FutureWarning,
        )
        labels = kwargs.pop("next_sentence_label")
    if output_attentions:
        logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
        output_attentions = False
    if output_hidden_states:
        logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
        output_hidden_states = False
    if return_dict:
        logger.warning_once('return_dict is not supported for pipeline models at the moment')
        return_dict = False
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    outputs = bert_model_forward(
        self.bert,
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    if stage_manager.is_last_stage():
        pooled_output = outputs[1]
        seq_relationship_scores = self.cls(pooled_output)
        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
        if not return_dict:
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
    else:
        hidden_states = outputs.get('hidden_states')
        # intermediate stage always return dict
        return {'hidden_states': hidden_states}
--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@ -1,3 +1,4 @@
 from types import MethodType
 from typing import Any, Callable, Dict, List, Union
 import torch.nn as nn
@ -134,7 +135,8 @@ class ModelSharder(object):
    def _replace_method(self, module: nn.Module, method_replacement: Dict[str, Callable]):
        for method_name, new_method in method_replacement.items():
            # bind the new method to the module
-            setattr(module, method_name, new_method.__get__(module, module.__class__))
+            bound_method = MethodType(new_method, module)
            setattr(module, method_name, bound_method)
    def _replace_sub_module(
        self,
--- a/tests/test_shardformer/test_model/_utils.py
+++ b/tests/test_shardformer/test_model/_utils.py
@ -2,6 +2,7 @@ import copy
 from contextlib import nullcontext
 from colossalai.lazy import LazyInitContext
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.shardformer import ShardConfig, ShardFormer
@ -21,6 +22,28 @@ def build_model(model_fn, enable_fused_normalization=True, enable_tensor_paralle
    return org_model.cuda(), sharded_model.cuda()
 def build_pipeline_model(model_fn,
                         stage_manager=None,
                         enable_fused_normalization=False,
                         enable_tensor_parallelism=False,
                         use_lazy_init: bool = False):
    ctx = LazyInitContext() if use_lazy_init else nullcontext()
    with ctx:
        # create new model
        org_model = model_fn()
        model_copy = copy.deepcopy(org_model)
    if use_lazy_init:
        ctx.materialize(org_model)
    # shard model
    shard_config = ShardConfig(enable_fused_normalization=enable_fused_normalization,
                               enable_tensor_parallelism=enable_tensor_parallelism,
                               pipeline_stage_manager=stage_manager)
    shard_former = ShardFormer(shard_config=shard_config)
    sharded_model, shared_params = shard_former.optimize(model_copy)
    return org_model.cuda(), sharded_model.cuda()
 def run_forward(original_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
    # prepare input
    data = data_gen_fn()
--- a/tests/test_shardformer/test_model/test_shard_bert_pipeline.py
+++ b/tests/test_shardformer/test_model/test_shard_bert_pipeline.py
@ -0,0 +1,85 @@
 import pytest
 import torch
 import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.logging import disable_existing_loggers
 from colossalai.pipeline.stage_manager import PipelineStageManager
 from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
 from colossalai.testing import (
    assert_hf_output_close,
    clear_cache_before_run,
    parameterize,
    rerun_if_address_is_in_use,
    spawn,
 )
 from tests.kit.model_zoo import model_zoo
 from tests.test_shardformer.test_model._utils import build_model, build_pipeline_model, run_forward
 def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
    # check forward
    pass
@parameterize('enable_fused_normalization', [False])
@parameterize('enable_tensor_parallelism', [False])
@parameterize('use_lazy_init', [False])
 #TODO: merge this into test_shard_bert
 def run_bert_test(enable_fused_normalization, enable_tensor_parallelism, use_lazy_init):
    DP_DIM, PP_DIM = 0, 1
    DP_SIZE, PP_SIZE = 2, 2
    RANK_TO_COORDINATE = {
        0: (0, 0),
        1: (0, 1),
        2: (1, 0),
        3: (1, 1),
    }
    PP_RANKS_IN_GROUP = {
        0: [0, 1],
        1: [0, 1],
        2: [2, 3],
        3: [2, 3],
    }
    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
    sub_model_zoo = model_zoo.get_sub_registry('transformers_bert')
    x = torch.randint(0, 1000, (2, 3)).cuda()
    hidden_states = torch.randint(0, 1000, (2, 3, 128)).to(torch.float32).cuda()
    for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
        if name == 'transformers_bert':
            org_model, sharded_model = build_pipeline_model(model_fn, stage_manager, enable_fused_normalization,
                                                            enable_tensor_parallelism, use_lazy_init)
            if stage_manager.stage == 0:
                attention_mask = torch.ones_like(x).cuda()
                output = sharded_model(input_ids=x, attention_mask=attention_mask, stage_manager=stage_manager)
                # print(output['hidden_states'].shape)
                assert output['hidden_states'].shape == (2, 3, 128)
            else:
                attention_mask = torch.ones((2, 3)).cuda()
                output = sharded_model(hidden_states=hidden_states,
                                       attention_mask=attention_mask,
                                       stage_manager=stage_manager)
                # print(output[0].shape)
                assert output[0].shape == (2, 3, 128)
    torch.cuda.empty_cache()
 def check_bert(rank, world_size, port):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_bert_test()
@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
 def test_bert():
    spawn(check_bert, 4)
 if __name__ == "__main__":
    test_bert()