mirror of https://github.com/hpcaitech/ColossalAI
[hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317)
parent 16c96d4d8c
commit 049121d19d

@@ -173,7 +173,7 @@ class _LinearWithGatherForwardReduceScatterBackward(torch.autograd.Function):
     Args:
         input_ (`torch.Tensor`): The input tensor from sequence parallel region.
         process_group (`torch.distributed.ProcessGroup`): The process group used for collective communication.
-        overlap (`bool`): Whther to overlap the all_gather op and gradient calculate in backward.
+        overlap (`bool`): Whether to overlap the all_gather op and gradient calculate in backward.

     """

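The docstring above belongs to the gather-forward / reduce-scatter-backward linear op used for sequence parallelism; `overlap` pipelines the all_gather with the gradient computation in backward. For orientation only, here is a minimal, non-overlapping sketch of that communication pattern using plain torch.distributed calls (an illustration under stated assumptions, not the ColossalAI implementation):

import torch
import torch.distributed as dist

class _GatherForwardReduceScatterBackward(torch.autograd.Function):
    """Illustrative sketch: all-gather sequence shards in forward,
    reduce-scatter the gradient back to each rank in backward."""

    @staticmethod
    def forward(ctx, input_, process_group, dim):
        ctx.process_group, ctx.dim = process_group, dim
        world_size = dist.get_world_size(process_group)
        shards = [torch.empty_like(input_) for _ in range(world_size)]
        dist.all_gather(shards, input_.contiguous(), group=process_group)
        return torch.cat(shards, dim=dim)

    @staticmethod
    def backward(ctx, grad_output):
        world_size = dist.get_world_size(ctx.process_group)
        grad_chunks = [c.contiguous() for c in grad_output.chunk(world_size, dim=ctx.dim)]
        grad_input = torch.empty_like(grad_chunks[0])
        # Summing each rank's slice of the gathered gradient is exactly a reduce-scatter.
        dist.reduce_scatter(grad_input, grad_chunks, group=ctx.process_group)
        return grad_input, None, None

# Usage (inside a script launched with torchrun, after dist.init_process_group):
# full_seq = _GatherForwardReduceScatterBackward.apply(local_shard, dist.group.WORLD, 0)
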
@@ -534,7 +534,7 @@ class HookParameter(torch.autograd.Function):
         return grad_output, None, None


-def hook_paramter_in_backward(input, weight=None, bias=None):
+def hook_parameter_in_backward(input, weight=None, bias=None):
     return HookParameter.apply(input, weight, bias)

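The helper renamed above is a thin pass-through: it routes an activation through an autograd function whose extra inputs are the layer's weight and bias, so those parameters appear as inputs of that op in the activation's backward graph. A minimal sketch in the same spirit (assumed shape for illustration, not the ColossalAI implementation):

import torch

class _HookParameter(torch.autograd.Function):
    """Illustrative pass-through that ties weight/bias into the autograd
    graph of `input` without changing the value flowing forward."""

    @staticmethod
    def forward(ctx, input, weight=None, bias=None):
        return input

    @staticmethod
    def backward(ctx, grad_output):
        # The activation gradient passes straight through; this op itself
        # contributes no gradient to weight or bias.
        return grad_output, None, None

def hook_parameter_in_backward(input, weight=None, bias=None):
    return _HookParameter.apply(input, weight, bias)

# Usage sketch: out = hook_parameter_in_backward(norm_out, norm.weight, norm.bias)
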
@@ -7,7 +7,7 @@ import torch.nn as nn

 from colossalai.lazy import LazyInitContext

-from ._operation import hook_paramter_in_backward
+from ._operation import hook_parameter_in_backward
 from .utils import SeqParallelUtils

 __all__ = ["FusedLayerNorm", "FusedRMSNorm", "LayerNorm", "RMSNorm", "BaseLayerNorm"]
@@ -29,7 +29,7 @@ try:

         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight, self.bias)
+            output = hook_parameter_in_backward(output, self.weight, self.bias)
             return output

     class FusedRMSNormWithHook(ApexFusedRMSNorm):
@@ -38,7 +38,7 @@ try:

         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight)
+            output = hook_parameter_in_backward(output, self.weight)
             return output

 except ImportError:
@@ -79,7 +79,7 @@ if EnableFastLayerNorm:

         def forward(self, input):
             output = super().forward(input)
-            output = hook_paramter_in_backward(output, self.weight, self.bias)
+            output = hook_parameter_in_backward(output, self.weight, self.bias)
             return output


@@ -699,7 +699,7 @@ class BloomPipelineForwards:
         return {"hidden_states": hidden_states}


-def get_bloom_flash_attention_forward(enabel_jit_fused=False):
+def get_bloom_flash_attention_forward(enable_jit_fused=False):
     try:
         from xformers.ops import memory_efficient_attention as me_attention
     except:
@@ -181,7 +181,7 @@ class RotaryEmbedding(nn.Module):

         cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)

-        # this is to mimic the behaviour of complex32, else we will get different results
+        # this is to mimic the behavior of complex32, else we will get different results
         if dtype in (torch.float16, torch.bfloat16, torch.int8):
             cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
         return cache
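The hunk above is the tail of ChatGLM's rotary-embedding cache construction: a cos/sin table is stacked and then downcast when the model runs in reduced precision. A self-contained sketch of that cache with the same dtype handling (the function name and the `base` default are assumptions for illustration):

import torch

def build_rotary_cache(seq_len: int, dim: int, dtype: torch.dtype, base: int = 10000):
    # One inverse frequency per pair of channels: theta_i = 1 / base^(2i / dim)
    theta = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    # Outer product of positions and frequencies -> [seq_len, dim // 2]
    idx_theta = torch.outer(torch.arange(seq_len).float(), theta)
    cache = torch.stack([torch.cos(idx_theta), torch.sin(idx_theta)], dim=-1)
    # Mimic complex32 behavior: keep the cache in half precision for
    # fp16/bf16/int8 models so results match the reference implementation.
    if dtype in (torch.float16, torch.bfloat16, torch.int8):
        cache = cache.bfloat16() if dtype == torch.bfloat16 else cache.half()
    return cache

print(build_rotary_cache(8, 64, torch.float16).shape)  # torch.Size([8, 32, 2])
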
@@ -290,7 +290,7 @@ class CoreAttention(torch.nn.Module):
         # [sk, b, np, hn] -> [sk, b * np, hn]
         key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)

-        # preallocting input tensor: [b * np, sq, sk]
+        # preallocating input tensor: [b * np, sq, sk]
         matmul_input_buffer = torch.empty(
             output_size[0] * output_size[1],
             output_size[2],
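The comment fixed above refers to the Megatron-style score computation: a [b * np, sq, sk] buffer is preallocated and handed to torch.baddbmm as its additive input with beta=0.0, so its uninitialized contents are ignored and only the scaled Q @ K^T product remains. A standalone sketch with toy sizes (the shapes and the 1/sqrt(hn) scaling are assumptions for illustration, not the exact model code):

import torch

# Toy sizes: b = batch, np = heads, sq/sk = query/key length, hn = head dim
b, np_, sq, sk, hn = 2, 4, 16, 16, 64
query_layer = torch.randn(sq, b * np_, hn)   # [sq, b * np, hn]
key_layer = torch.randn(sk, b * np_, hn)     # [sk, b * np, hn], as after the .view above

# Preallocate the [b * np, sq, sk] buffer named in the comment.
matmul_input_buffer = torch.empty(b * np_, sq, sk, dtype=query_layer.dtype)

# Raw attention scores: beta=0.0 discards the buffer contents, alpha scales Q @ K^T.
matmul_result = torch.baddbmm(
    matmul_input_buffer,
    query_layer.transpose(0, 1),                 # [b * np, sq, hn]
    key_layer.transpose(0, 1).transpose(1, 2),   # [b * np, hn, sk]
    beta=0.0,
    alpha=1.0 / hn**0.5,
)
print(matmul_result.shape)  # torch.Size([8, 16, 16])
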
@@ -1289,7 +1289,7 @@ class ChatGLMForConditionalGeneration(ChatGLMPreTrainedModel):
         if has_default_max_length and generation_config.max_new_tokens is None:
             warnings.warn(
                 f"Using `max_length`'s default ({generation_config.max_length}) to control the generation length. "
-                "This behaviour is deprecated and will be removed from the config in v5 of Transformers -- we"
+                "This behavior is deprecated and will be removed from the config in v5 of Transformers -- we"
                 " recommend using `max_new_tokens` to control the maximum length of the generation.",
                 UserWarning,
             )
@@ -122,7 +122,7 @@ class GPTJPipelineForwards:
         # head_mask has shape n_layer x batch x num_attention_heads x N x N
         head_mask = self.get_head_mask(head_mask, self.config.n_layer)

-        # position id to be asssigned not just for the first stage for attn input
+        # position id to be assigned not just for the first stage for attn input
         if position_ids is not None:
             position_ids = position_ids.view(-1, seq_length)
         else:
@@ -593,7 +593,7 @@ def get_gptj_flash_attention_forward():

         # key = key.permute(0, 2, 1, 3)
         # query = query.permute(0, 2, 1, 3)
-        key = key.to(dtype=value.dtype)  # fp16 compatability
+        key = key.to(dtype=value.dtype)  # fp16 compatibility
         query = query.to(dtype=value.dtype)

         if layer_past is not None:
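The cast above aligns key and query with value's dtype before attention; fused attention kernels generally expect query, key and value to share a dtype. A tiny, self-contained illustration of the same alignment (toy shapes, plain matmul attention standing in for the fused kernel; not the GPT-J module):

import torch

value = torch.randn(1, 4, 8, 16, dtype=torch.float16)
key = torch.randn(1, 4, 8, 16)     # fp32, e.g. left over from an earlier step
query = torch.randn(1, 4, 8, 16)

key = key.to(dtype=value.dtype)    # fp16 compatibility
query = query.to(dtype=value.dtype)

# With mismatched dtypes the matmul below (like a fused attention kernel) would
# raise a dtype error; after the cast everything stays in fp16.
scores = query @ key.transpose(-2, -1) / key.size(-1) ** 0.5
out = torch.softmax(scores, dim=-1) @ value
print(out.dtype)  # torch.float16
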
@@ -225,13 +225,13 @@ class LlamaPipelineForwards:
         >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

-        >>> prompt = "Hey, are you consciours? Can you talk to me?"
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")

         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
         logger = logging.get_logger(__name__)
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -123,7 +123,7 @@ class OPTPipelineForwards:

         else:
             if hidden_states is None:
-                raise ValueError("hidden_states shouln't be None for intermediate stages.")
+                raise ValueError("hidden_states shouldn't be None for intermediate stages.")
             input_shape = hidden_states.size()[:-1]
             batch_size, seq_length = input_shape[0], input_shape[1]
             device = hidden_states.device
@@ -77,7 +77,7 @@ class T5PipelineForwards:
         if in_decoder != (stage >= decoder_starting_stage):
             raise ValueError("Config in T5Stack is not aligned with pipeline setting.")

-        # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embedds
+        # at_first_stage: current stage is the first stage of encoder/decoder, taking input_ids/input_embeds
         # at_last_stage: current stage is the last stage of encoder/decoder, making outputs the same form as huggingface
         at_first_stage = (stage == 0) or (stage == decoder_starting_stage)
         at_last_stage = (stage == decoder_starting_stage - 1) or (stage == stage_manager.num_stages - 1)