From 9a290ab01333d63a331d43825acffdf114f30725 Mon Sep 17 00:00:00 2001
From: GuangyaoZhang
Date: Fri, 14 Jun 2024 08:09:24 +0000
Subject: [PATCH] fix precommit

---
 colossalai/shardformer/layer/__init__.py      |  2 +-
 colossalai/shardformer/layer/normalization.py |  1 -
 colossalai/shardformer/modeling/command.py    | 22 ++-----
 colossalai/shardformer/policies/command.py    |  8 ++-
 diff.output                                   | 59 -------------------
 tests/kit/model_zoo/transformers/command.py   |  2 -
 .../test_model/test_shard_command.py          | 27 ++++++++-
 7 files changed, 35 insertions(+), 86 deletions(-)
 delete mode 100644 diff.output

diff --git a/colossalai/shardformer/layer/__init__.py b/colossalai/shardformer/layer/__init__.py
index 8c70a26b7..33e500034 100644
--- a/colossalai/shardformer/layer/__init__.py
+++ b/colossalai/shardformer/layer/__init__.py
@@ -4,7 +4,7 @@ from .dropout import DropoutForParallelInput, DropoutForReplicatedInput
 from .embedding import Embedding1D, PaddingEmbedding, VocabParallelEmbedding1D
 from .linear import Linear1D_Col, Linear1D_Row, PaddingLMHead, VocabParallelLMHead1D
 from .loss import cross_entropy_1d
-from .normalization import FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm, CohereLayerNorm, FusedCohereLayerNorm
+from .normalization import CohereLayerNorm, FusedCohereLayerNorm, FusedLayerNorm, FusedRMSNorm, LayerNorm, RMSNorm
 from .parallel_module import ParallelModule
 from .qkv_fused_linear import FusedLinear1D_Col, GPT2FusedLinearConv1D_Col, GPT2FusedLinearConv1D_Row
 
diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
index 1f30c7741..34a126904 100644
--- a/colossalai/shardformer/layer/normalization.py
+++ b/colossalai/shardformer/layer/normalization.py
@@ -250,7 +250,6 @@ class FusedLayerNorm(BaseLayerNorm):
         return layernorm
 
 
-
 class CohereLayerNorm(BaseLayerNorm):
     r"""
     This is a wrapper around the transformers.models.cohere.CohereLayerNorm. It is meant to be used only with the from_native_module interface.
diff --git a/colossalai/shardformer/modeling/command.py b/colossalai/shardformer/modeling/command.py
index d0e6ed0a6..85cf551b6 100644
--- a/colossalai/shardformer/modeling/command.py
+++ b/colossalai/shardformer/modeling/command.py
@@ -3,22 +3,12 @@ import warnings
 from typing import List, Optional, Tuple, Union
 
 import torch
-import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import CrossEntropyLoss
 from transformers.cache_utils import Cache, DynamicCache
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
-)
-from transformers.models.cohere.modeling_cohere import (
-    CohereForCausalLM,
-    CohereModel,
-    StaticCache,
-    repeat_kv,
-)
+from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.models.cohere.modeling_cohere import CohereForCausalLM, CohereModel, StaticCache, repeat_kv
 from transformers.utils import logging
 
 from colossalai.pipeline.stage_manager import PipelineStageManager
@@ -343,10 +333,9 @@ class CommandPipelineForwards:
         hidden_states = outputs.get("hidden_states")
         return {"hidden_states": hidden_states}
 
+
 def get_command_flash_attention_forward(shard_config, sp_mode, sp_group, sp_size):
-    from transformers.models.cohere.modeling_cohere import CohereAttention, apply_rotary_pos_emb
-    from transformers.models.cohere.modeling_cohere import repeat_kv
-
+    from transformers.models.cohere.modeling_cohere import CohereAttention, apply_rotary_pos_emb, repeat_kv
 
     def forward(
         self: CohereAttention,
@@ -728,7 +717,6 @@ def get_command_seq_parallel_attention_forward(sp_mode, sp_size, sp_group):
         else:
             attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
 
-
         attn_output = self.o_proj(attn_output)
 
         if not output_attentions:
diff --git a/colossalai/shardformer/policies/command.py b/colossalai/shardformer/policies/command.py
index 01fff3aa4..6c4785912 100644
--- a/colossalai/shardformer/policies/command.py
+++ b/colossalai/shardformer/policies/command.py
@@ -7,12 +7,12 @@ from torch import Tensor
 from torch.nn import Module
 
 from colossalai.shardformer.layer import (
+    CohereLayerNorm,
     FusedCohereLayerNorm,
     Linear1D_Col,
     Linear1D_Row,
     PaddingEmbedding,
     PaddingLMHead,
-    CohereLayerNorm,
     VocabParallelEmbedding1D,
     VocabParallelLMHead1D,
 )
@@ -383,7 +383,9 @@ class CommandForCausalLMPolicy(CommandPolicy):
         if self.pipeline_stage_manager:
             # set None as default
             self.set_pipeline_forward(
-                model_cls=CohereForCausalLM, new_forward=CommandPipelineForwards.command_for_causal_lm_forward, policy=policy
+                model_cls=CohereForCausalLM,
+                new_forward=CommandPipelineForwards.command_for_causal_lm_forward,
+                policy=policy,
             )
 
         return policy
@@ -410,4 +412,4 @@ class CommandForCausalLMPolicy(CommandPolicy):
                     self.pipeline_stage_manager.num_stages - 1: self.model.lm_head.weight,
                 }
             ]
-        return []
\ No newline at end of file
+        return []
diff --git a/diff.output b/diff.output
deleted file mode 100644
index 638edfee8..000000000
--- a/diff.output
+++ /dev/null
@@ -1,59 +0,0 @@
-diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
-index 5aa21260..01453a05 100644
---- a/colossalai/shardformer/layer/normalization.py
-+++ b/colossalai/shardformer/layer/normalization.py
-@@ -165,7 +165,7 @@ class LayerNorm(BaseLayerNorm):
-         Raises:
-             AssertionError: If the provided module is not an instance of nn.LayerNorm.
- """ -- assert isinstance(module, nn.LayerNorm), "Only support conversion from nn.LayerNorm." -+ # assert isinstance(module, nn.LayerNorm), "Only support conversion from nn.LayerNorm." - - LazyInitContext.materialize(module) - -@@ -174,7 +174,7 @@ class LayerNorm(BaseLayerNorm): - # aggregation of these gradients is necessary during backpropagation. - # Therefore, we annotate these parameters in advance to indicate the need for gradient aggregation. - SeqParallelUtils.marked_as_sp_partial_derived_param(module.weight) -- SeqParallelUtils.marked_as_sp_partial_derived_param(module.bias) -+ # SeqParallelUtils.marked_as_sp_partial_derived_param(module.bias) - - return module - -@@ -209,9 +209,12 @@ class FusedLayerNorm(BaseLayerNorm): - - LazyInitContext.materialize(module) - # get the attributes of the module -- normalized_shape = module.normalized_shape -- eps = module.eps -- elementwise_affine = module.elementwise_affine -+ # normalized_shape = module.normalized_shape -+ # eps = module.eps -+ # elementwise_affine = module.elementwise_affine -+ normalized_shape = module.weight.size(0) -+ eps = module.variance_epsilon -+ elementwise_affine = True - dtype = module.weight.dtype - device = module.weight.device - -@@ -244,7 +247,7 @@ class FusedLayerNorm(BaseLayerNorm): - # aggregation of these gradients is necessary during backpropagation. - # Therefore, we annotate these parameters in advance to indicate the need for gradient aggregation. - SeqParallelUtils.marked_as_sp_partial_derived_param(layernorm.weight) -- SeqParallelUtils.marked_as_sp_partial_derived_param(layernorm.bias) -+ # SeqParallelUtils.marked_as_sp_partial_derived_param(layernorm.bias) - - return layernorm - -diff --git a/tests/test_shardformer/test_model/test_shard_command.py b/tests/test_shardformer/test_model/test_shard_command.py -index 6075f836..a7166e38 100644 ---- a/tests/test_shardformer/test_model/test_shard_command.py -+++ b/tests/test_shardformer/test_model/test_shard_command.py -@@ -210,6 +210,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, - ], - ) - def run_command_test(test_config): -+ print(test_config) - sub_model_zoo = model_zoo.get_sub_registry("transformers_command", "transformers_command_for_casual_lm") - - for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): diff --git a/tests/kit/model_zoo/transformers/command.py b/tests/kit/model_zoo/transformers/command.py index 6b15792b4..a8b8842c5 100644 --- a/tests/kit/model_zoo/transformers/command.py +++ b/tests/kit/model_zoo/transformers/command.py @@ -16,8 +16,6 @@ if HAS_COMMAND: # =============================== def data_gen(): - - input_ids = torch.Tensor( [ [1, 15043, 29892, 590, 11203, 338, 274, 1082, 1, 15043, 29892, 590, 11203, 338, 274, 1082], diff --git a/tests/test_shardformer/test_model/test_shard_command.py b/tests/test_shardformer/test_model/test_shard_command.py index c4b640d97..32c67d60e 100644 --- a/tests/test_shardformer/test_model/test_shard_command.py +++ b/tests/test_shardformer/test_model/test_shard_command.py @@ -79,10 +79,24 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, else: atol, rtol = 5e-3, 5e-3 row_layer_grads = get_grad_tensors_for_check( - command_model, shard_command_model, row_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=0, verbose=False + command_model, + shard_command_model, + row_layer_for_check, + tp_group, + atol=atol, + rtol=rtol, + dim=0, + verbose=False, ) col_layer_grads = get_grad_tensors_for_check( - 
+            command_model,
+            shard_command_model,
+            col_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=1,
+            verbose=False,
         )
         norm_layer_grads = get_grad_tensors_for_check(
             command_model,
@@ -121,7 +135,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         else:
             atol, rtol = 5e-3, 5e-3
         check_weight(
-            command_model, shard_command_model, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False
+            command_model,
+            shard_command_model,
+            col_layer_for_check,
+            tp_group,
+            atol=atol,
+            rtol=rtol,
+            dim=1,
+            verbose=False,
         )
 
     # check grads