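"""ShardFormer policy that adapts a Hugging Face Llama model for ColossalAI's
no-padding inference: fused MLP/attention modules, inference-only forwards, and
an optional Triton RMSNorm kernel."""
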
from functools import partial

import torch
from torch.nn import Parameter
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, LlamaRMSNorm

from colossalai.inference.modeling.models.nopadding_llama import (
    NopadLlamaAttention,
    NopadLlamaMLP,
    llama_causal_lm_forward,
    llama_decoder_layer_forward,
    llama_model_forward,
)
from colossalai.inference.utils import init_to_get_rotary
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy
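
# The fused Triton RMSNorm kernel is optional: if triton is missing, we fall
# back to the stock LlamaRMSNorm.forward.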
try:
    from colossalai.kernel.triton import rms_layernorm

    HAS_TRITON_RMSNORM = True
except ImportError:
    print("You should install triton from https://github.com/openai/triton")
    HAS_TRITON_RMSNORM = False


def get_triton_rmsnorm_forward():
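    """Return a Triton-backed drop-in for ``LlamaRMSNorm.forward``, or ``None``
    when the kernel is unavailable."""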
    if HAS_TRITON_RMSNORM:

        def _triton_rmsnorm_forward(
            self: LlamaRMSNorm, hidden_states: torch.Tensor, norm_output: torch.Tensor, residual: torch.Tensor = None
        ):
            # norm_output is a preallocated output buffer; the optional residual
            # is handled inside the kernel rather than added in a separate op.
            return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output, residual)

        return _triton_rmsnorm_forward
    else:
        return None


class NoPaddingLlamaModelInferPolicy(LlamaForCausalLMPolicy):
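    """Policy that rewires a HF ``LlamaForCausalLM`` for ColossalAI's no-padding
    inference by swapping in fused submodules and inference-only forwards."""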

    def __init__(self) -> None:
        super().__init__()

    def module_policy(self):
        policy = super().module_policy()

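        # Store lm_head.weight pre-transposed so the replaced forward can use it
        # directly as the right operand of a matmul at each decoding step.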
        decoder_attribute_replacement = {
            "lm_head.weight": Parameter(self.model.lm_head.weight.transpose(0, 1), requires_grad=False),
        }
        policy[LlamaForCausalLM] = ModulePolicyDescription(
            attribute_replacement=decoder_attribute_replacement,
        )

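        # Replace each decoder layer's MLP and self-attention with the fused
        # no-padding implementations.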
        policy[LlamaDecoderLayer] = ModulePolicyDescription(
            sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="mlp",
                    target_module=NopadLlamaMLP,
                ),
                SubModuleReplacementDescription(
                    suffix="self_attn",
                    target_module=NopadLlamaAttention,
                ),
            ]
        )

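        # Switch the shard config into inference mode before patching forwards.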
        self.shard_config._infer()

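        # Route the causal LM, base model, and decoder-layer forwards to their
        # no-padding counterparts.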
        infer_forward = llama_causal_lm_forward
        method_replacement = {"forward": partial(infer_forward)}
        self.append_or_create_method_replacement(
            description=method_replacement, policy=policy, target_key=LlamaForCausalLM
        )

        infer_forward = llama_model_forward
        method_replacement = {"forward": partial(infer_forward)}
        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=LlamaModel)

        infer_forward = llama_decoder_layer_forward
        method_replacement = {"forward": partial(infer_forward)}
        self.append_or_create_method_replacement(
            description=method_replacement, policy=policy, target_key=LlamaDecoderLayer
        )

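        # Only patch LlamaRMSNorm when the Triton kernel imported successfully.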
        infer_forward = None
        if HAS_TRITON_RMSNORM:
            infer_forward = get_triton_rmsnorm_forward()

        if infer_forward is not None:
            method_replacement = {"forward": partial(infer_forward)}
            self.append_or_create_method_replacement(
                description=method_replacement, policy=policy, target_key=LlamaRMSNorm
            )

        return policy

    def postprocess(self):
        # Precompute the rotary-embedding caches on the wrapped LlamaModel
        # before inference starts.
        init_to_get_rotary(self.model.model)
        return self.model
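

# A minimal usage sketch, not part of the original module. It assumes your
# ColossalAI build exposes InferenceEngine with a `model_policy` argument;
# verify the signature against your installed version before relying on it.
#
#   from transformers import AutoTokenizer, LlamaForCausalLM
#   from colossalai.inference.config import InferenceConfig
#   from colossalai.inference.core.engine import InferenceEngine
#
#   model = LlamaForCausalLM.from_pretrained("<llama-checkpoint>")
#   tokenizer = AutoTokenizer.from_pretrained("<llama-checkpoint>")
#   engine = InferenceEngine(
#       model, tokenizer, InferenceConfig(), model_policy=NoPaddingLlamaModelInferPolicy()
#   )
#   outputs = engine.generate(prompts=["An example prompt"])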