From 049121d19d7ead4a4dcbeb091df9ff87ba991a63 Mon Sep 17 00:00:00 2001
From: digger yu
Date: Tue, 5 Mar 2024 21:48:46 +0800
Subject: [PATCH] [hotfix] fix typo change enabel to enable under
 colossalai/shardformer/ (#5317)

---
 colossalai/shardformer/layer/_operation.py                | 4 ++--
 colossalai/shardformer/layer/normalization.py             | 8 ++++----
 colossalai/shardformer/modeling/bloom.py                  | 2 +-
 .../shardformer/modeling/chatglm2_6b/modeling_chatglm.py  | 6 +++---
 colossalai/shardformer/modeling/gptj.py                   | 4 ++--
 colossalai/shardformer/modeling/llama.py                  | 4 ++--
 colossalai/shardformer/modeling/opt.py                    | 2 +-
 colossalai/shardformer/modeling/t5.py                     | 2 +-
 8 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/colossalai/shardformer/layer/_operation.py b/colossalai/shardformer/layer/_operation.py
index d4960c7e4..241770901 100644
--- a/colossalai/shardformer/layer/_operation.py
+++ b/colossalai/shardformer/layer/_operation.py
@@ -173,7 +173,7 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
     Args:
         input_ (`torch.Tensor`): The input tensor from sequence parallel region.
         process_group (`torch.distributed.ProcessGroup`): The process group used for collective communication.
-        overlap (`bool`): Whther to overlap the all_gather op and gradient calculate in backward.
+        overlap (`bool`): Whether to overlap the all_gather op and gradient calculate in backward.

     """

@@ -534,7 +534,7 @@ class HookParameter(torch.autograd.Function):
         return grad_output, None, None


-def hook_paramter_in_backward(input, weight=None, bias=None):
+def hook_parameter_in_backward(input, weight=None, bias=None):
     return HookParameter.apply(input, weight, bias)


diff --git a/colossalai/shardformer/layer/normalization.py b/colossalai/shardformer/layer/normalization.py
index 4aa281290..43dd153af 100644
--- a/colossalai/shardformer/layer/normalization.py
+++ b/colossalai/shardformer/layer/normalization.py
@@ -7,7 +7,7 @@ import torch.nn as nn

 from colossalai.lazy import LazyInitContext

-from ._operation import hook_paramter_in_backward
+from ._operation import hook_parameter_in_backward
 from .utils import SeqParallelUtils

 __all__ = ["FusedLayerNorm", "FusedRMSNorm", "LayerNorm", "RMSNorm", "BaseLayerNorm"]
@@ -29,7 +29,7 @@ try:

         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight, self.bias)
+            output = hook_parameter_in_backward(output, self.weight, self.bias)
             return output

     class FusedRMSNormWithHook(ApexFusedRMSNorm):
@@ -38,7 +38,7 @@ try:

         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight)
+            output = hook_parameter_in_backward(output, self.weight)
             return output

 except ImportError:
@@ -79,7 +79,7 @@ if EnableFastLayerNorm:

         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight, self.bias)
+            output = hook_parameter_in_backward(output, self.weight, self.bias)
             return output


diff --git a/colossalai/shardformer/modeling/bloom.py b/colossalai/shardformer/modeling/bloom.py
index cd8a02330..d94c30d29 100644
--- a/colossalai/shardformer/modeling/bloom.py
+++ b/colossalai/shardformer/modeling/bloom.py
@@ -699,7 +699,7 @@ class BloomPipelineForwards:
             return {"hidden_states": hidden_states}


-def get_bloom_flash_attention_forward(enabel_jit_fused=False):
+def get_bloom_flash_attention_forward(enable_jit_fused=False):
     try:
         from xformers.ops import memory_efficient_attention as me_attention
     except:
diff --git a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py
index 71aa2296e..bf581300a 100644
--- a/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py
+++ b/colossalai/shardformer/modeling/chatglm2_6b/modeling_chatglm.py
@@ -181,7 +181,7 @@ class RotaryEmbedding(nn.Module):

         cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

-        # this is to mimic the behaviour of complex32, else we will get different results
+        # this is to mimic the behavior of complex32, else we will get different results
         if dtype in (torch.float16, torch.bfloat16, torch.int8):
             cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
         return cache
@@ -290,7 +290,7 @@ class CoreAttention(torch.nn.Module):
             # [sk, b, np, hn] -> [sk, b * np, hn]
             key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)

-            # preallocting input tensor: [b * np, sq, sk]
+            # preallocating input tensor: [b * np, sq, sk]
             matmul_input_buffer = torch.empty(
                 output_size[0] * output_size[1],
                 output_size[2],
@@ -1289,7 +1289,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         if has_default_max_length and generation_config.max_new_tokens is None:
             warnings.warn(
                 f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
-                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+                "This behavior is deprecated and will be removed from the config in v5 of Transformers -- we"
                 " recommend using `max_new_tokens` to control the maximum length of the generation.",
                 UserWarning,
             )
diff --git a/colossalai/shardformer/modeling/gptj.py b/colossalai/shardformer/modeling/gptj.py
index 22b0f7a90..1990d7df3 100644
--- a/colossalai/shardformer/modeling/gptj.py
+++ b/colossalai/shardformer/modeling/gptj.py
@@ -122,7 +122,7 @@ class GPTJPipelineForwards:
         # head_mask has shape n_layer x batch x num_attention_heads x N x N
         head_mask = self.get_head_mask(head_mask, self.config.n_layer)

-        # position id to be asssigned not just for the first stage for attn input
+        # position id to be assigned not just for the first stage for attn input
         if position_ids is not None:
             position_ids = position_ids.view(-1, seq_length)
         else:
@@ -593,7 +593,7 @@ def get_gptj_flash_attention_forward():
         # key = key.permute(0, 2, 1, 3)
         # query = query.permute(0, 2, 1, 3)

-        key = key.to(dtype=value.dtype)  # fp16 compatability
+        key = key.to(dtype=value.dtype)  # fp16 compatibility
         query = query.to(dtype=value.dtype)

         if layer_past is not None:
diff --git a/colossalai/shardformer/modeling/llama.py b/colossalai/shardformer/modeling/llama.py
index 92c709218..f20ceb2d6 100644
--- a/colossalai/shardformer/modeling/llama.py
+++ b/colossalai/shardformer/modeling/llama.py
@@ -225,13 +225,13 @@ class LlamaPipelineForwards:
         >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

-        >>> prompt = "Hey, are you consciours? Can you talk to me?"
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
         logger = logging.get_logger(__name__)
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
diff --git a/colossalai/shardformer/modeling/opt.py b/colossalai/shardformer/modeling/opt.py
index 7f6cbbbcf..d0e267eac 100644
--- a/colossalai/shardformer/modeling/opt.py
+++ b/colossalai/shardformer/modeling/opt.py
@@ -123,7 +123,7 @@ class OPTPipelineForwards:

         else:
             if hidden_states is None:
-                raise ValueError("hidden_states shouln't be None for intermediate stages.")
+                raise ValueError("hidden_states shouldn't be None for intermediate stages.")
             input_shape = hidden_states.size()[:-1]
             batch_size, seq_length = input_shape[0], input_shape[1]
             device = hidden_states.device
diff --git a/colossalai/shardformer/modeling/t5.py b/colossalai/shardformer/modeling/t5.py
index dcb178520..9c5ce3fb6 100644
--- a/colossalai/shardformer/modeling/t5.py
+++ b/colossalai/shardformer/modeling/t5.py
@@ -77,7 +77,7 @@ class T5PipelineForwards:
         if in_decoder != (stage >= decoder_starting_stage):
             raise ValueError("Config in T5Stack is not aligned with pipeline setting.")

-        # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embedds
+        # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embeds
         # at_last_stage: current stage is the last stage of encoder/decoder, making outputs the same form as huggingface
         at_first_stage = (stage == 0) or (stage == decoder_starting_stage)
         at_last_stage = (stage == decoder_starting_stage - 1) or (stage == stage_manager.num_stages - 1)