from functools import partial import torch from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, LlamaRMSNorm from colossalai.inference.modeling.models.padding_llama import ( PadLlamaAttention, llama_causal_lm_forward, llama_decoder_layer_forward, llama_model_forward, ) from colossalai.inference.utils import init_to_get_rotary from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription # import colossalai from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy try: from colossalai.kernel.triton import rms_layernorm HAS_TRITON_RMSNORM = True except: print("you should install triton from https://github.com/openai/triton") HAS_TRITON_RMSNORM = False def get_triton_rmsnorm_forward(): if HAS_TRITON_RMSNORM: def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor, norm_outpu: torch.Tensor): return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_outpu) return _triton_rmsnorm_forward else: return None class PaddingLlamaModelInferPolicy(LlamaForCausalLMPolicy): def __init__(self) -> None: super().__init__() def module_policy(self): policy = super().module_policy() policy[LlamaDecoderLayer] = ModulePolicyDescription( sub_module_replacement=[ SubModuleReplacementDescription( suffix="self_attn", target_module=PadLlamaAttention, ), ] ) self.shard_config._infer() infer_forward = llama_causal_lm_forward method_replacement = {"forward": partial(infer_forward)} self.append_or_create_method_replacement( description=method_replacement, policy=policy, target_key=LlamaForCausalLM ) infer_forward = llama_model_forward method_replacement = {"forward": partial(infer_forward)} self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=LlamaModel) infer_forward = llama_decoder_layer_forward method_replacement = {"forward": partial(infer_forward)} self.append_or_create_method_replacement( description=method_replacement, policy=policy, target_key=LlamaDecoderLayer ) infer_forward = None if HAS_TRITON_RMSNORM: infer_forward = get_triton_rmsnorm_forward() if infer_forward is not None: method_replacement = {"forward": partial(infer_forward)} self.append_or_create_method_replacement( description=method_replacement, policy=policy, target_key=LlamaRMSNorm ) return policy def postprocess(self): init_to_get_rotary(self.model.model) return self.model