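"""ShardFormer policy that adapts a Hugging Face Llama model for ColossalAI's
no-padding inference: fused MLP/attention modules, inference-only forwards, and
an optional Triton RMSNorm kernel."""
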
from functools import partial

import torch
from torch.nn import Parameter
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, LlamaRMSNorm

from colossalai.inference.modeling.models.nopadding_llama import (
    NopadLlamaAttention,
    NopadLlamaMLP,
    llama_causal_lm_forward,
    llama_decoder_layer_forward,
    llama_model_forward,
)
from colossalai.inference.utils import init_to_get_rotary
from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy
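
# The fused Triton RMSNorm kernel is optional: if triton is missing, we fall
# back to the stock LlamaRMSNorm.forward.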
try:
    from colossalai.kernel.triton import rms_layernorm

    HAS_TRITON_RMSNORM = True
except ImportError:
    print("You should install triton from https://github.com/openai/triton")
    HAS_TRITON_RMSNORM = False


def get_triton_rmsnorm_forward():
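    """Return a Triton-backed drop-in for ``LlamaRMSNorm.forward``, or ``None``
    when the kernel is unavailable."""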
    if HAS_TRITON_RMSNORM:

        def _triton_rmsnorm_forward(
            self: LlamaRMSNorm, hidden_states: torch.Tensor, norm_output: torch.Tensor, residual: torch.Tensor = None
        ):
            # norm_output is a preallocated output buffer; the optional residual
            # is handled inside the kernel rather than added in a separate op.
            return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon, norm_output, residual)

        return _triton_rmsnorm_forward
    else:
        return None


class NoPaddingLlamaModelInferPolicy(LlamaForCausalLMPolicy):
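    """Policy that rewires a HF ``LlamaForCausalLM`` for ColossalAI's no-padding
    inference by swapping in fused submodules and inference-only forwards."""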

    def __init__(self) -> None:
        super().__init__()

    def module_policy(self):
        policy = super().module_policy()

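        # Store lm_head.weight pre-transposed so the replaced forward can use it
        # directly as the right operand of a matmul at each decoding step.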
        decoder_attribute_replacement = {
            "lm_head.weight": Parameter(self.model.lm_head.weight.transpose(0, 1), requires_grad=False),
        }
        policy[LlamaForCausalLM] = ModulePolicyDescription(
            attribute_replacement=decoder_attribute_replacement,
        )

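        # Replace each decoder layer's MLP and self-attention with the fused
        # no-padding implementations.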
        policy[LlamaDecoderLayer] = ModulePolicyDescription(
            sub_module_replacement=[
                SubModuleReplacementDescription(
                    suffix="mlp",
                    target_module=NopadLlamaMLP,
                ),
                SubModuleReplacementDescription(
                    suffix="self_attn",
                    target_module=NopadLlamaAttention,
                ),
            ]
        )

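        # Switch the shard config into inference mode before patching forwards.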
        self.shard_config._infer()

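        # Route the causal LM, base model, and decoder-layer forwards to their
        # no-padding counterparts.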
        infer_forward = llama_causal_lm_forward
        method_replacement = {"forward": partial(infer_forward)}
        self.append_or_create_method_replacement(
            description=method_replacement, policy=policy, target_key=LlamaForCausalLM
        )

        infer_forward = llama_model_forward
        method_replacement = {"forward": partial(infer_forward)}
        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=LlamaModel)

        infer_forward = llama_decoder_layer_forward
        method_replacement = {"forward": partial(infer_forward)}
        self.append_or_create_method_replacement(
            description=method_replacement, policy=policy, target_key=LlamaDecoderLayer
        )

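        # Only patch LlamaRMSNorm when the Triton kernel imported successfully.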
        infer_forward = None
        if HAS_TRITON_RMSNORM:
            infer_forward = get_triton_rmsnorm_forward()

        if infer_forward is not None:
            method_replacement = {"forward": partial(infer_forward)}
            self.append_or_create_method_replacement(
                description=method_replacement, policy=policy, target_key=LlamaRMSNorm
            )

        return policy

    def postprocess(self):
        # Precompute the rotary-embedding caches on the wrapped LlamaModel
        # before inference starts.
        init_to_get_rotary(self.model.model)
        return self.model
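

# A minimal usage sketch, not part of the original module. It assumes your
# ColossalAI build exposes InferenceEngine with a `model_policy` argument;
# verify the signature against your installed version before relying on it.
#
#   from transformers import AutoTokenizer, LlamaForCausalLM
#   from colossalai.inference.config import InferenceConfig
#   from colossalai.inference.core.engine import InferenceEngine
#
#   model = LlamaForCausalLM.from_pretrained("<llama-checkpoint>")
#   tokenizer = AutoTokenizer.from_pretrained("<llama-checkpoint>")
#   engine = InferenceEngine(
#       model, tokenizer, InferenceConfig(), model_policy=NoPaddingLlamaModelInferPolicy()
#   )
#   outputs = engine.generate(prompts=["An example prompt"])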