ColossalAI/colossalai/legacy/inference/hybridengine/polices/llama.py

from functools import partial
from typing import List

import torch
from torch.nn import Module
from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaModel,
    LlamaRMSNorm,
)

from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription

# import colossalai
from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy

from ..modeling._utils import init_to_get_rotary
from ..modeling.llama import LlamaInferenceForwards

# Optional dependency: the Triton RMSNorm kernel is used only when it can be imported.
# HAS_TRITON_RMSNORM records whether `rmsnorm_forward` is available in this module.
try:
    from colossalai.kernel.triton import rmsnorm_forward

    HAS_TRITON_RMSNORM = True
except Exception:
    # `Exception` (not a bare `except`) keeps the best-effort fallback for any
    # import-time failure while letting KeyboardInterrupt / SystemExit propagate.
    print("you should install triton from https://github.com/openai/triton")
    HAS_TRITON_RMSNORM = False


def get_triton_rmsnorm_forward():
    """Build a replacement ``forward`` for ``LlamaRMSNorm`` backed by the Triton kernel.

    Returns:
        A function ``(self, hidden_states)`` that calls ``rmsnorm_forward`` with the
        module's ``weight.data`` and ``variance_epsilon``, or ``None`` when
        ``HAS_TRITON_RMSNORM`` is false.
    """
    if not HAS_TRITON_RMSNORM:
        return None

    def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor):
        return rmsnorm_forward(hidden_states, self.weight.data, self.variance_epsilon)

    return _triton_rmsnorm_forward


class LlamaModelInferPolicy(LlamaForCausalLMPolicy):
    """Policy that extends ``LlamaForCausalLMPolicy`` with inference-specific replacements.

    On top of the parent policy it registers the ``LlamaInferenceForwards`` forwards,
    optionally swaps the decoder linears for GPTQ quantized modules, and optionally
    replaces the ``LlamaRMSNorm`` forward with the Triton kernel wrapper.
    """

    def __init__(self) -> None:
        super().__init__()

    def module_policy(self):
        """Assemble and return the module replacement policy.

        Steps, in order:
          1. Start from the parent's ``module_policy()``.
          2. If ``shard_config.inference_gptq`` is set, overwrite the
             ``LlamaDecoderLayer`` entry with GPTQ quantized linear replacements.
          3. Call ``shard_config._infer()``.
          4. Register inference forwards for ``LlamaModel``, ``LlamaDecoderLayer``
             and ``LlamaAttention``.
          5. If a pipeline stage manager is set, install the causal-LM pipeline forward.
          6. If the Triton RMSNorm kernel is available, replace ``LlamaRMSNorm.forward``.
        """
        policy = super().module_policy()

        if self.shard_config.inference_gptq:
            from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear

            model_config = self.model.config
            tp_size = self.shard_config.tensor_parallel_size

            # (sub-module suffix, quantized linear class), in registration order.
            quant_linear_targets = (
                ("self_attn.q_proj", ColCaiQuantLinear),
                ("self_attn.k_proj", ColCaiQuantLinear),
                ("self_attn.v_proj", ColCaiQuantLinear),
                ("self_attn.o_proj", RowCaiQuantLinear),
                ("mlp.gate_proj", ColCaiQuantLinear),
                ("mlp.up_proj", ColCaiQuantLinear),
                ("mlp.down_proj", RowCaiQuantLinear),
            )

            # This assignment replaces whatever the parent policy had for the decoder layer.
            policy[LlamaDecoderLayer] = ModulePolicyDescription(
                attribute_replacement={
                    "self_attn.hidden_size": model_config.hidden_size // tp_size,
                    "self_attn.num_heads": model_config.num_attention_heads // tp_size,
                },
                sub_module_replacement=[
                    SubModuleReplacementDescription(
                        suffix=suffix,
                        target_module=quant_cls,
                        kwargs={"split_num": 1},
                    )
                    for suffix, quant_cls in quant_linear_targets
                ],
            )

        self.shard_config._infer()

        # (module class, inference forward) pairs, registered in this order.
        forward_overrides = (
            (LlamaModel, LlamaInferenceForwards.llama_model_forward),
            (LlamaDecoderLayer, LlamaInferenceForwards.llama_decoder_layer_forward),
            (LlamaAttention, LlamaInferenceForwards.llama_flash_attn_kvcache_forward),
        )
        for target_cls, new_forward in forward_overrides:
            self.append_or_create_method_replacement(
                description={"forward": partial(new_forward)},
                policy=policy,
                target_key=target_cls,
            )

        if self.pipeline_stage_manager:
            self.set_pipeline_forward(
                model_cls=LlamaForCausalLM,
                new_forward=LlamaInferenceForwards.llama_causal_lm_forward,
                policy=policy,
            )

        # The Triton RMSNorm forward is only registered when the kernel was importable.
        rmsnorm_impl = get_triton_rmsnorm_forward() if HAS_TRITON_RMSNORM else None
        if rmsnorm_impl is not None:
            self.append_or_create_method_replacement(
                description={"forward": partial(rmsnorm_impl)},
                policy=policy,
                target_key=LlamaRMSNorm,
            )

        return policy

    def postprocess(self):
        """Run ``init_to_get_rotary`` on the inner ``self.model.model`` and return ``self.model``."""
        model = self.model
        init_to_get_rotary(model.model)
        return model

    def get_held_layers(self) -> List[Module]:
        """Get pipeline layers for current stage.

        Returns the parent's held layers, with ``self.model.lm_head`` appended when
        the pipeline stage manager reports this is the first stage.
        """
        layers = super().get_held_layers()
        if self.pipeline_stage_manager.is_first_stage():
            layers.append(self.model.lm_head)
        return layers
[Pipeline inference] Combine kvcache with pipeline inference (#4938) * merge kvcache with pipeline inference and refactor the code structure * support ppsize > 2 * refactor pipeline code * do pre-commit * modify benchmark * fix bench mark * polish code * add docstring and update readme * refactor the code * fix some logic bug of ppinfer * polish readme * fix typo * skip infer test 2023-10-27 08:19:54 +00:00			`from functools import partial`
			`from typing import List`

			`import torch`
			`from torch.nn import Module`
			`from transformers.models.llama.modeling_llama import (`
			`LlamaAttention,`
			`LlamaDecoderLayer,`
			`LlamaForCausalLM,`
			`LlamaModel,`
			`LlamaRMSNorm,`
			`)`

			`from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription`

			`# import colossalai`
			`from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy`

			`from ..modeling._utils import init_to_get_rotary`
[Pipeline Inference] Merge pp with tp (#4993) * refactor pipeline into new CaiInferEngine * updata llama modeling forward * merge tp with pp * update docstring * optimize test workflow and example * fix typo * add assert and todo 2023-11-01 04:46:21 +00:00			`from ..modeling.llama import LlamaInferenceForwards`
[Pipeline inference] Combine kvcache with pipeline inference (#4938) * merge kvcache with pipeline inference and refactor the code structure * support ppsize > 2 * refactor pipeline code * do pre-commit * modify benchmark * fix bench mark * polish code * add docstring and update readme * refactor the code * fix some logic bug of ppinfer * polish readme * fix typo * skip infer test 2023-10-27 08:19:54 +00:00
			`try:`
			`from colossalai.kernel.triton import rmsnorm_forward`

			`HAS_TRITON_RMSNORM = True`
			`except:`
			`print("you should install triton from https://github.com/openai/triton")`
			`HAS_TRITON_RMSNORM = False`


			`def get_triton_rmsnorm_forward():`
			`if HAS_TRITON_RMSNORM:`

			`def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor):`
			`return rmsnorm_forward(hidden_states, self.weight.data, self.variance_epsilon)`

			`return _triton_rmsnorm_forward`
			`else:`
			`return None`


			`class LlamaModelInferPolicy(LlamaForCausalLMPolicy):`
			`def __init__(self) -> None:`
			`super().__init__()`

			`def module_policy(self):`
			`policy = super().module_policy()`
[inference] Refactor inference architecture (#5057) * [inference] support only TP (#4998) * support only tp * enable tp * add support for bloom (#5008) * [refactor] refactor gptq and smoothquant llama (#5012) * refactor gptq and smoothquant llama * fix import error * fix linear import torch-int * fix smoothquant llama import error * fix import accelerate error * fix bug * fix import smooth cuda * fix smoothcuda * [Inference Refactor] Merge chatglm2 with pp and tp (#5023) merge chatglm with pp and tp * [Refactor] remove useless inference code (#5022) * remove useless code * fix quant model * fix test import bug * mv original inference legacy * fix chatglm2 * [Refactor] refactor policy search and quant type controlling in inference (#5035) * [Refactor] refactor policy search and quant type controling in inference * [inference] update readme (#5051) * update readme * update readme * fix architecture * fix table * fix table * [inference] udpate example (#5053) * udpate example * fix run.sh * fix rebase bug * fix some errors * update readme * add some features * update interface * update readme * update benchmark * add requirements-infer --------- Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com> Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com> 2023-11-19 13:05:05 +00:00
			`if self.shard_config.inference_gptq:`
[Pipeline inference] Combine kvcache with pipeline inference (#4938) * merge kvcache with pipeline inference and refactor the code structure * support ppsize > 2 * refactor pipeline code * do pre-commit * modify benchmark * fix bench mark * polish code * add docstring and update readme * refactor the code * fix some logic bug of ppinfer * polish readme * fix typo * skip infer test 2023-10-27 08:19:54 +00:00			`from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear`

			`decoder_attribute_replacement = {`
			`"self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,`
			`"self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,`
			`}`
			`policy[LlamaDecoderLayer] = ModulePolicyDescription(`
			`attribute_replacement=decoder_attribute_replacement,`
			`sub_module_replacement=[`
			`SubModuleReplacementDescription(`
			`suffix="self_attn.q_proj",`
			`target_module=ColCaiQuantLinear,`
			`kwargs={"split_num": 1},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="self_attn.k_proj",`
			`target_module=ColCaiQuantLinear,`
			`kwargs={"split_num": 1},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="self_attn.v_proj",`
			`target_module=ColCaiQuantLinear,`
			`kwargs={"split_num": 1},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="self_attn.o_proj",`
			`target_module=RowCaiQuantLinear,`
			`kwargs={"split_num": 1},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="mlp.gate_proj",`
			`target_module=ColCaiQuantLinear,`
			`kwargs={"split_num": 1},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="mlp.up_proj",`
			`target_module=ColCaiQuantLinear,`
			`kwargs={"split_num": 1},`
			`),`
			`SubModuleReplacementDescription(`
			`suffix="mlp.down_proj",`
			`target_module=RowCaiQuantLinear,`
			`kwargs={"split_num": 1},`
			`),`
			`],`
			`)`

			`self.shard_config._infer()`

			`infer_forward = LlamaInferenceForwards.llama_model_forward`
			`method_replacement = {"forward": partial(infer_forward)}`
			`self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=LlamaModel)`

			`infer_forward = LlamaInferenceForwards.llama_decoder_layer_forward`
			`method_replacement = {"forward": partial(infer_forward)}`
			`self.append_or_create_method_replacement(`
			`description=method_replacement, policy=policy, target_key=LlamaDecoderLayer`
			`)`

			`infer_forward = LlamaInferenceForwards.llama_flash_attn_kvcache_forward`
			`method_replacement = {"forward": partial(infer_forward)}`
			`self.append_or_create_method_replacement(`
			`description=method_replacement, policy=policy, target_key=LlamaAttention`
			`)`

			`if self.pipeline_stage_manager:`
			`# set None as default`
			`self.set_pipeline_forward(`
			`model_cls=LlamaForCausalLM, new_forward=LlamaInferenceForwards.llama_causal_lm_forward, policy=policy`
			`)`
			`infer_forward = None`
			`if HAS_TRITON_RMSNORM:`
			`infer_forward = get_triton_rmsnorm_forward()`

			`if infer_forward is not None:`
			`method_replacement = {"forward": partial(infer_forward)}`
			`self.append_or_create_method_replacement(`
			`description=method_replacement, policy=policy, target_key=LlamaRMSNorm`
			`)`

			`return policy`

			`def postprocess(self):`
			`init_to_get_rotary(self.model.model)`
			`return self.model`

			`def get_held_layers(self) -> List[Module]:`
			`"""Get pipeline layers for current stage."""`
			`stage_manager = self.pipeline_stage_manager`
			`held_layers = super().get_held_layers()`
			`if stage_manager.is_first_stage():`
			`held_layers.append(self.model.lm_head)`
			`return held_layers`