mirror of https://github.com/hpcaitech/ColossalAI
[refactor] refactor gptq and smoothquant llama (#5012)
* refactor gptq and smoothquant llama
* fix import error
* fix linear import torch-int
* fix smoothquant llama import error
* fix import accelerate error
* fix bug
* fix import smooth cuda
* fix smooth cuda
parent 48d0a58d10
commit 450115bd0f
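With this change both quantized LLaMA paths go through the same engine entry point: the quantized checkpoint is loaded by its own frontend and handed to CaiInferEngine together with quant="gptq" or quant="smoothquant". A minimal usage sketch of the GPTQ path, adapted from the test added in this commit — the checkpoint path, parallel sizes and token ids are illustrative, and the script is assumed to run on every rank of a tp_size * pp_size process group (e.g. launched with torchrun):

import torch
from auto_gptq import AutoGPTQForCausalLM

import colossalai
from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy

# distributed init is assumed to have happened on all 4 ranks (tp_size=2 * pp_size=2)
colossalai.launch_from_torch(config={})

# load a GPTQ-quantized LLaMA checkpoint (placeholder path);
# with quant="gptq" the engine unwraps the inner transformers model (model.model) itself
model = AutoGPTQForCausalLM.from_quantized(
    "/path/to/llama-gptq", inject_fused_attention=False, device=torch.cuda.current_device()
)

engine = CaiInferEngine(
    tp_size=2,
    pp_size=2,
    model=model,
    model_policy=LlamaModelInferPolicy(),
    max_batch_size=4,
    max_input_len=32,
    max_output_len=32,
    quant="gptq",  # or "smoothquant" together with a SmoothLlamaForCausalLM checkpoint
)

inputs = {
    "input_ids": torch.tensor([[15496, 11, 616, 3290, 318, 13779]], dtype=torch.int64).cuda(),
    "attention_mask": torch.ones((1, 6), dtype=torch.int64).cuda(),
}
# on rank 0, output[0] holds max_output_len generated tokens (see the tests added below)
output = engine.inference(inputs)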
@@ -14,10 +14,7 @@ from ..tensor_parallel.kvcache_manager import MemoryManager
 
 PP_AXIS, TP_AXIS = 0, 1
 
-_supported_models = [
-    "LlamaForCausalLM",
-    "BloomForCausalLM",
-]
+_supported_models = ["LlamaForCausalLM", "BloomForCausalLM", "LlamaGPTQForCausalLM", "SmoothLlamaForCausalLM"]
 
 
 class CaiInferEngine:
@@ -70,12 +67,21 @@ class CaiInferEngine:
         max_batch_size: int = 4,
         max_input_len: int = 32,
         max_output_len: int = 32,
+        quant: str = None,
         verbose: bool = False,
         # TODO: implement early_stopping, and various gerneration options
         early_stopping: bool = False,
         do_sample: bool = False,
         num_beams: int = 1,
     ) -> None:
+        if quant == "gptq":
+            from ..quant.gptq import GPTQManager
+
+            self.gptq_manager = GPTQManager(model.quantize_config, max_input_len=max_input_len)
+            model = model.model
+        elif quant == "smoothquant":
+            model = model.model
+
         assert model.__class__.__name__ in _supported_models, f"Model {model.__class__.__name__} is not supported."
         assert (
             tp_size * pp_size == dist.get_world_size()
@@ -85,9 +91,14 @@ class CaiInferEngine:
 
         assert max_batch_size <= 64, "Max batch size exceeds the constraint"
         assert max_input_len + max_output_len <= 4096, "Max length exceeds the constraint"
+        assert quant in ["smoothquant", "gptq", None], "quant should be one of 'smoothquant', 'gptq'"
         self.pp_size = pp_size
         self.tp_size = tp_size
+        self.quant = quant
 
+        if quant == "smoothquant" and dtype != "fp32":
+            dtype = "fp32"
+            print("Warning: smoothquant only support fp32 and int8 mix precision. set dtype to fp32")
 
         if dtype == "fp16":
             self.dtype = torch.float16
@@ -118,6 +129,8 @@ class CaiInferEngine:
         self.schedule = GenerateSchedule(stage_manager, self.mb_manager, verbose)
 
         self.model = self._shardformer(model, model_policy, stage_manager, pg_mesh.get_group_along_axis(TP_AXIS))
+        if quant == "gptq":
+            self.gptq_manager.post_init_gptq_buffer(self.model)
 
     def inference(self, input_list):
         """
@@ -149,6 +162,7 @@ class CaiInferEngine:
             enable_flash_attention=False,
             enable_jit_fused=False,
             enable_sequence_parallelism=False,
+            quant=self.quant,
         )
         shardformer = ShardFormer(shard_config=shardconfig)
         shard_model, _ = shardformer.optimize(model, model_policy)
@@ -158,7 +172,7 @@ class CaiInferEngine:
         max_total_token_num = max_batch_size * (max_input_len + max_output_len)
         if model.config.model_type == "llama":
             head_dim = model.config.hidden_size // model.config.num_attention_heads
-            head_num = model.config.num_attention_heads // self.tp_size
+            head_num = model.config.num_key_value_heads // self.tp_size
             num_hidden_layers = (
                 model.config.num_hidden_layers
                 if hasattr(model.config, "num_hidden_layers")
@@ -171,5 +185,8 @@ class CaiInferEngine:
             num_hidden_layers = model.config.n_layer
         layer_num = num_hidden_layers // self.pp_size
 
-        cache_manager = MemoryManager(max_total_token_num, self.dtype, head_num, head_dim, layer_num)
+        if self.quant == "smoothquant":
+            cache_manager = MemoryManager(max_total_token_num, torch.int8, head_num, head_dim, layer_num)
+        else:
+            cache_manager = MemoryManager(max_total_token_num, self.dtype, head_num, head_dim, layer_num)
         return cache_manager
@@ -1,3 +1,4 @@
+from .bloom import BloomInferenceForwards
 from .llama import LlamaInferenceForwards
 
-__all__ = ["LlamaInferenceForwards"]
+__all__ = ["LlamaInferenceForwards", "BloomInferenceForwards"]
@@ -45,14 +45,15 @@ class LlamaModelInferPolicy(LlamaForCausalLMPolicy):
 
     def module_policy(self):
         policy = super().module_policy()
-
-        if self.shard_config.inference_gptq:
+        decoder_attribute_replacement = {
+            "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+            "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+            "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
+            // self.shard_config.tensor_parallel_size,
+        }
+        if self.shard_config.quant == "gptq":
             from colossalai.inference.quant.gptq.cai_gptq import ColCaiQuantLinear, RowCaiQuantLinear
 
-            decoder_attribute_replacement = {
-                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
-                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
-            }
             policy[LlamaDecoderLayer] = ModulePolicyDescription(
                 attribute_replacement=decoder_attribute_replacement,
                 sub_module_replacement=[
@@ -94,6 +95,55 @@ class LlamaModelInferPolicy(LlamaForCausalLMPolicy):
                 ],
             )
 
+        elif self.shard_config.quant == "smoothquant":
+            from colossalai.inference.quant.smoothquant.models.llama import LlamaSmoothquantDecoderLayer
+            from colossalai.inference.quant.smoothquant.models.parallel_linear import (
+                ColW8A8BFP32OFP32Linear,
+                RowW8A8B8O8Linear,
+                RowW8A8BFP32O32LinearSiLU,
+                RowW8A8BFP32OFP32Linear,
+            )
+
+            policy[LlamaSmoothquantDecoderLayer] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=RowW8A8B8O8Linear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=RowW8A8B8O8Linear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=RowW8A8B8O8Linear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.o_proj",
+                        target_module=ColW8A8BFP32OFP32Linear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.gate_proj",
+                        target_module=RowW8A8BFP32O32LinearSiLU,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.up_proj",
+                        target_module=RowW8A8BFP32OFP32Linear,
+                        kwargs={"split_num": 1},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.down_proj",
+                        target_module=ColW8A8BFP32OFP32Linear,
+                        kwargs={"split_num": 1},
+                    ),
+                ],
+            )
         self.shard_config._infer()
 
         infer_forward = LlamaInferenceForwards.llama_model_forward
@@ -0,0 +1 @@
+from .smoothquant.models.llama import SmoothLlamaForCausalLM
@@ -2,3 +2,4 @@ from .cai_gptq import HAS_AUTO_GPTQ
 
 if HAS_AUTO_GPTQ:
     from .cai_gptq import CaiGPTQLinearOp, CaiQuantLinear
+from .gptq_manager import GPTQManager
@@ -0,0 +1,61 @@
+import torch
+
+
+class GPTQManager:
+    def __init__(self, quant_config, max_input_len: int = 1):
+        self.max_dq_buffer_size = 1
+        self.max_inner_outer_dim = 1
+        self.bits = quant_config.bits
+        self.use_act_order = quant_config.desc_act
+        self.max_input_len = 1
+        self.gptq_temp_state_buffer = None
+        self.gptq_temp_dq_buffer = None
+        self.quant_config = quant_config
+
+    def post_init_gptq_buffer(self, model: torch.nn.Module) -> None:
+        from .cai_gptq import CaiQuantLinear
+
+        HAS_GPTQ_CUDA = False
+        try:
+            from colossalai.kernel.op_builder.gptq import GPTQBuilder
+
+            gptq_cuda = GPTQBuilder().load()
+            HAS_GPTQ_CUDA = True
+        except ImportError:
+            warnings.warn("CUDA gptq is not installed")
+            HAS_GPTQ_CUDA = False
+
+        for name, submodule in model.named_modules():
+            if isinstance(submodule, CaiQuantLinear):
+                self.max_dq_buffer_size = max(self.max_dq_buffer_size, submodule.qweight.numel() * 8)
+
+                if self.use_act_order:
+                    self.max_inner_outer_dim = max(
+                        self.max_inner_outer_dim, submodule.infeatures, submodule.outfeatures
+                    )
+                self.bits = submodule.bits
+        if not (HAS_GPTQ_CUDA and self.bits == 4):
+            return
+
+        max_input_len = 1
+        if self.use_act_order:
+            max_input_len = self.max_input_len
+        # The temp_state buffer is required to reorder X in the act-order case.
+        # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
+        self.gptq_temp_state_buffer = torch.zeros(
+            (max_input_len, self.max_inner_outer_dim), dtype=torch.float16, device=torch.cuda.current_device()
+        )
+        self.gptq_temp_dq_buffer = torch.zeros(
+            (1, self.max_dq_buffer_size), dtype=torch.float16, device=torch.cuda.current_device()
+        )
+
+        gptq_cuda.prepare_buffers(
+            torch.device(torch.cuda.current_device()), self.gptq_temp_state_buffer, self.gptq_temp_dq_buffer
+        )
+        # Using the default from exllama repo here.
+        matmul_recons_thd = 8
+        matmul_fused_remap = False
+        matmul_no_half2 = False
+        gptq_cuda.set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
+
+        torch.cuda.empty_cache()
@@ -4,9 +4,7 @@ try:
     HAS_TORCH_INT = True
 except ImportError:
     HAS_TORCH_INT = False
-    raise ImportError(
-        "Not install torch_int. Please install torch_int from https://github.com/Guangxuan-Xiao/torch-int"
-    )
+    print("Not install torch_int. Please install torch_int from https://github.com/Guangxuan-Xiao/torch-int")
 
 if HAS_TORCH_INT:
     from .llama import LLamaSmoothquantAttention, LlamaSmoothquantMLP
@@ -9,7 +9,6 @@ from functools import partial
 from os.path import isdir, isfile, join
 from typing import Dict, List, Optional, Union
 
-import accelerate
 import numpy as np
 import torch
 import torch.nn as nn
@@ -24,6 +23,15 @@ from transformers.utils.hub import PushToHubMixin, cached_file
 from colossalai.inference.tensor_parallel.batch_infer_state import BatchInferState
 from colossalai.inference.tensor_parallel.kvcache_manager import MemoryManager
 
+try:
+    import accelerate
+
+    HAS_ACCELERATE = True
+except ImportError:
+    HAS_ACCELERATE = False
+    print("accelerate is not installed.")
+
+
 SUPPORTED_MODELS = ["llama"]
 
 
@@ -1,17 +1,25 @@
 # modified from torch-int: https://github.com/Guangxuan-Xiao/torch-int/blob/main/torch_int/nn/linear.py
 
 import torch
-from torch_int._CUDA import linear_a8_w8_b8_o8, linear_a8_w8_bfp32_ofp32
-from torch_int.functional.quantization import quantize_per_tensor_absmax
+
+try:
+    from torch_int._CUDA import linear_a8_w8_b8_o8, linear_a8_w8_bfp32_ofp32
+    from torch_int.functional.quantization import quantize_per_tensor_absmax
+
+    HAS_TORCH_INT = True
+except ImportError:
+    HAS_TORCH_INT = False
+    print("Not install torch_int. Please install torch_int from https://github.com/Guangxuan-Xiao/torch-int")
+
 
 try:
     from colossalai.kernel.op_builder.smoothquant import SmoothquantBuilder
 
     smoothquant_cuda = SmoothquantBuilder().load()
     HAS_SMOOTHQUANT_CUDA = True
-except ImportError:
+except:
     HAS_SMOOTHQUANT_CUDA = False
-    raise ImportError("CUDA smoothquant linear is not installed")
+    print("CUDA smoothquant linear is not installed")
 
 
 class W8A8BFP32O32LinearSiLU(torch.nn.Module):
@@ -138,21 +146,23 @@ class W8A8BFP32OFP32Linear(torch.nn.Module):
         )
         self.register_buffer(
             "bias",
-            torch.zeros(self.out_features, dtype=torch.float32, requires_grad=False),
+            torch.zeros((1, self.out_features), dtype=torch.float32, requires_grad=False),
         )
         self.register_buffer("a", torch.tensor(alpha))
 
     def _apply(self, fn):
         # prevent the bias from being converted to half
         super()._apply(fn)
-        self.bias = self.bias.to(torch.float32)
+        if self.bias is not None:
+            self.bias = self.bias.to(torch.float32)
         return self
 
     def to(self, *args, **kwargs):
         super().to(*args, **kwargs)
         self.weight = self.weight.to(*args, **kwargs)
-        self.bias = self.bias.to(*args, **kwargs)
-        self.bias = self.bias.to(torch.float32)
+        if self.bias is not None:
+            self.bias = self.bias.to(*args, **kwargs)
+            self.bias = self.bias.to(torch.float32)
         return self
 
     @torch.no_grad()
@@ -8,7 +8,6 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch_int.nn.bmm import BMM_S8T_S8N_F32T, BMM_S8T_S8N_S8T
 from transformers import PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutputWithPast
 from transformers.models.llama.configuration_llama import LlamaConfig
@@ -18,7 +17,6 @@ from transformers.models.llama.modeling_llama import (
     LlamaDecoderLayer,
     LlamaMLP,
     LlamaRotaryEmbedding,
-    repeat_kv,
     rotate_half,
 )
 from transformers.utils import add_start_docstrings_to_model_forward
@@ -31,10 +29,31 @@ from colossalai.kernel.triton import (
     smooth_token_attention_fwd,
 )
 
+try:
+    from torch_int.nn.bmm import BMM_S8T_S8N_F32T, BMM_S8T_S8N_S8T
+
+    HAS_TORCH_INT = True
+except ImportError:
+    HAS_TORCH_INT = False
+    print("Not install torch_int. Please install torch_int from https://github.com/Guangxuan-Xiao/torch-int")
+
+
 from .base_model import BaseSmoothForCausalLM
 from .linear import W8A8B8O8Linear, W8A8BFP32O32LinearSiLU, W8A8BFP32OFP32Linear
 
 
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
 class LLamaSmoothquantAttention(nn.Module):
     def __init__(
         self,
@@ -116,7 +135,6 @@ class LLamaSmoothquantAttention(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        rotary_emb: Tuple[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
@@ -131,8 +149,7 @@ class LLamaSmoothquantAttention(nn.Module):
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
 
-        cos = rotary_emb[0]
-        sin = rotary_emb[1]
+        cos, sin = infer_state.position_cos, infer_state.position_sin
 
         int8_rotary_embedding_fwd(
             query_states.view(-1, self.num_heads, self.head_dim),
@@ -348,7 +365,6 @@ class LlamaSmoothquantDecoderLayer(nn.Module):
     def forward(
         self,
        hidden_states: torch.Tensor,
-        rotary_emb: Tuple[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
@@ -378,7 +394,6 @@ class LlamaSmoothquantDecoderLayer(nn.Module):
         # Self Attention
         hidden_states, self_attn_weights, present_key_value = self.self_attn(
             hidden_states=hidden_states,
-            rotary_emb=rotary_emb,
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_value=past_key_value,
@@ -650,15 +665,15 @@ def llama_model_forward(
         raise NotImplementedError("not implement gradient_checkpointing and training options ")
 
     if past_key_values_length == 0:
-        position_cos = torch.index_select(self._cos_cached, 0, position_ids.view(-1)).view(
+        infer_state.position_cos = torch.index_select(self._cos_cached, 0, position_ids.view(-1)).view(
             position_ids.view(-1).shape[0], -1
         )
-        position_sin = torch.index_select(self._sin_cached, 0, position_ids.view(-1)).view(
+        infer_state.position_sin = torch.index_select(self._sin_cached, 0, position_ids.view(-1)).view(
             position_ids.view(-1).shape[0], -1
         )
     else:
-        position_cos = torch.index_select(self._cos_cached, 0, position_ids.view(-1)).view(batch_size, -1)
-        position_sin = torch.index_select(self._sin_cached, 0, position_ids.view(-1)).view(batch_size, -1)
+        infer_state.position_cos = torch.index_select(self._cos_cached, 0, position_ids.view(-1)).view(batch_size, -1)
+        infer_state.position_sin = torch.index_select(self._sin_cached, 0, position_ids.view(-1)).view(batch_size, -1)
 
     # decoder layers
     all_hidden_states = () if output_hidden_states else None
@@ -673,7 +688,6 @@ def llama_model_forward(
 
         layer_outputs = decoder_layer(
             hidden_states,
-            rotary_emb=(position_cos, position_sin),
             attention_mask=attention_mask,
             position_ids=position_ids,
             past_key_value=past_key_value,
@@ -0,0 +1,264 @@
+from typing import List, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.distributed import ProcessGroup
+
+from colossalai.lazy import LazyInitContext
+from colossalai.shardformer.layer import ParallelModule
+
+from .linear import W8A8B8O8Linear, W8A8BFP32O32LinearSiLU, W8A8BFP32OFP32Linear
+
+
+def split_row_copy(smooth_linear, para_linear, tp_size=1, tp_rank=0, split_num=1):
+    qweights = smooth_linear.weight.split(smooth_linear.out_features // split_num, dim=0)
+    if smooth_linear.bias is not None:
+        bias = smooth_linear.bias.split(smooth_linear.out_features // split_num, dim=0)
+
+    smooth_split_out_features = para_linear.out_features // split_num
+
+    for i in range(split_num):
+        para_linear.weight[i * smooth_split_out_features : (i + 1) * smooth_split_out_features, :] = qweights[i][
+            tp_rank * smooth_split_out_features : (tp_rank + 1) * smooth_split_out_features, :
+        ]
+
+        if para_linear.bias is not None:
+            para_linear.bias[:, i * smooth_split_out_features : (i + 1) * smooth_split_out_features] = bias[i][
+                :, tp_rank * smooth_split_out_features : (tp_rank + 1) * smooth_split_out_features
+            ]
+
+
+def split_column_copy(smooth_linear, para_linear, tp_rank=0, split_num=1):
+    qweights = smooth_linear.weight.split(smooth_linear.in_features // split_num, dim=-1)
+
+    smooth_split_in_features = para_linear.in_features // split_num
+
+    for i in range(split_num):
+        para_linear.weight[:, i * smooth_split_in_features : (i + 1) * smooth_split_in_features] = qweights[i][
+            :, tp_rank * smooth_split_in_features : (tp_rank + 1) * smooth_split_in_features
+        ]
+
+    if smooth_linear.bias is not None:
+        para_linear.bias.copy_(smooth_linear.bias)
+
+
+class RowW8A8B8O8Linear(W8A8B8O8Linear, ParallelModule):
+    def __init__(self, in_features, out_features, alpha=1.0, beta=1.0):
+        super().__init__(in_features, out_features, alpha, beta)
+        self.process_group = None
+        self.tp_size = 1
+        self.tp_rank = 0
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+    ) -> ParallelModule:
+        LazyInitContext.materialize(module)
+        # get the attributes
+        out_features = module.out_features
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}."
+            process_group = process_group[0]
+
+        tp_size = dist.get_world_size(process_group)
+        tp_rank = dist.get_rank(process_group)
+
+        if out_features < tp_size:
+            return module
+
+        if out_features % tp_size != 0:
+            raise ValueError(
+                f"The size of out_features:{out_features} is not integer multiples of tensor parallel size: {tp_size}!"
+            )
+        linear_1d = RowW8A8B8O8Linear(module.in_features, module.out_features // tp_size)
+        linear_1d.tp_size = tp_size
+        linear_1d.tp_rank = tp_rank
+        linear_1d.process_group = process_group
+        linear_1d.a = module.a.clone().detach()
+        linear_1d.b = module.b.clone().detach()
+        split_row_copy(module, linear_1d, tp_rank=tp_rank, **kwargs)
+        return linear_1d
+
+
+class ColW8A8B8O8Linear(W8A8B8O8Linear, ParallelModule):
+    def __init__(self, in_features, out_features, alpha=1.0, beta=1.0):
+        super().__init__(in_features, out_features, alpha, beta)
+        self.process_group = None
+        self.tp_size = 1
+        self.tp_rank = 0
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+    ) -> ParallelModule:
+        LazyInitContext.materialize(module)
+        # get the attributes
+        in_features = module.in_features
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}."
+            process_group = process_group[0]
+
+        tp_size = dist.get_world_size(process_group)
+        tp_rank = dist.get_rank(process_group)
+
+        if in_features < tp_size:
+            return module
+
+        if in_features % tp_size != 0:
+            raise ValueError(
+                f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!"
+            )
+        linear_1d = ColW8A8B8O8Linear(module.in_features // tp_size, module.out_features)
+        linear_1d.tp_size = tp_size
+        linear_1d.tp_rank = tp_rank
+        linear_1d.process_group = process_group
+        linear_1d.a = torch.tensor(module.a)
+        linear_1d.b = torch.tensor(module.b)
+
+        split_column_copy(module, linear_1d, tp_rank=tp_rank, **kwargs)
+        if linear_1d.bias is not None:
+            linear_1d.bias = linear_1d.bias // tp_size
+
+        return linear_1d
+
+    @torch.no_grad()
+    def forward(self, x):
+        output = super().forward(x)
+        if self.tp_size > 1:
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=self.process_group)
+        return output
+
+
+class RowW8A8BFP32O32LinearSiLU(W8A8BFP32O32LinearSiLU, ParallelModule):
+    def __init__(self, in_features, out_features, alpha=1.0, beta=1.0):
+        super().__init__(in_features, out_features, alpha, beta)
+        self.process_group = None
+        self.tp_size = 1
+        self.tp_rank = 0
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+    ) -> ParallelModule:
+        LazyInitContext.materialize(module)
+        # get the attributes
+        out_features = module.out_features
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}."
+            process_group = process_group[0]
+
+        tp_size = dist.get_world_size(process_group)
+        tp_rank = dist.get_rank(process_group)
+
+        if out_features < tp_size:
+            return module
+
+        if out_features % tp_size != 0:
+            raise ValueError(
+                f"The size of out_features:{out_features} is not integer multiples of tensor parallel size: {tp_size}!"
+            )
+        linear_1d = RowW8A8BFP32O32LinearSiLU(module.in_features, module.out_features // tp_size)
+        linear_1d.tp_size = tp_size
+        linear_1d.tp_rank = tp_rank
+        linear_1d.process_group = process_group
+        linear_1d.a = module.a.clone().detach()
+
+        split_row_copy(module, linear_1d, tp_rank=tp_rank, **kwargs)
+        return linear_1d
+
+
+class RowW8A8BFP32OFP32Linear(W8A8BFP32OFP32Linear, ParallelModule):
+    def __init__(self, in_features, out_features, alpha=1.0, beta=1.0):
+        super().__init__(in_features, out_features, alpha, beta)
+        self.process_group = None
+        self.tp_size = 1
+        self.tp_rank = 0
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+    ) -> ParallelModule:
+        LazyInitContext.materialize(module)
+        # get the attributes
+        out_features = module.out_features
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}."
+            process_group = process_group[0]
+
+        tp_size = dist.get_world_size(process_group)
+        tp_rank = dist.get_rank(process_group)
+
+        if out_features < tp_size:
+            return module
+
+        if out_features % tp_size != 0:
+            raise ValueError(
+                f"The size of out_features:{out_features} is not integer multiples of tensor parallel size: {tp_size}!"
+            )
+        linear_1d = RowW8A8BFP32OFP32Linear(module.in_features, module.out_features // tp_size)
+        linear_1d.tp_size = tp_size
+        linear_1d.tp_rank = tp_rank
+        linear_1d.process_group = process_group
+        linear_1d.a = module.a.clone().detach()
+
+        split_row_copy(module, linear_1d, tp_rank=tp_rank, **kwargs)
+        return linear_1d
+
+
+class ColW8A8BFP32OFP32Linear(W8A8BFP32OFP32Linear, ParallelModule):
+    def __init__(self, in_features, out_features, alpha=1.0, beta=1.0):
+        super().__init__(in_features, out_features, alpha, beta)
+        self.process_group = None
+        self.tp_size = 1
+        self.tp_rank = 0
+
+    @staticmethod
+    def from_native_module(
+        module: nn.Module, process_group: Union[ProcessGroup, List[ProcessGroup]], *args, **kwargs
+    ) -> ParallelModule:
+        LazyInitContext.materialize(module)
+        # get the attributes
+        in_features = module.in_features
+
+        # ensure only one process group is passed
+        if isinstance(process_group, (list, tuple)):
+            assert len(process_group) == 1, f"Expected only one process group, got {len(process_group)}."
+            process_group = process_group[0]
+
+        tp_size = dist.get_world_size(process_group)
+        tp_rank = dist.get_rank(process_group)
+
+        if in_features < tp_size:
+            return module
+
+        if in_features % tp_size != 0:
+            raise ValueError(
+                f"The size of in_features:{in_features} is not integer multiples of tensor parallel size: {tp_size}!"
+            )
+        linear_1d = ColW8A8BFP32OFP32Linear(module.in_features // tp_size, module.out_features)
+        linear_1d.tp_size = tp_size
+        linear_1d.tp_rank = tp_rank
+        linear_1d.process_group = process_group
+        linear_1d.a = module.a.clone().detach()
+
+        split_column_copy(module, linear_1d, tp_rank=tp_rank, **kwargs)
+        if linear_1d.bias is not None:
+            linear_1d.bias = linear_1d.bias / tp_size
+
+        return linear_1d
+
+    @torch.no_grad()
+    def forward(self, x):
+        output = super().forward(x)
+        if self.tp_size > 1:
+            dist.all_reduce(output, op=dist.ReduceOp.SUM, group=self.process_group)
+        return output
@@ -37,6 +37,7 @@ class ShardConfig:
     inference_gptq: bool = False
     enable_sequence_parallelism: bool = False
    enable_sequence_overlap: bool = False
+    quant: str = None
     # pipeline_parallel_size: int
     # data_parallel_size: int
     # tensor_parallel_mode: Literal['1d', '2d', '2.5d', '3d']
@@ -77,4 +78,3 @@ class ShardConfig:
         Set default params for inference.
         """
         # assert self.pipeline_stage_manager is None, "pipeline parallelism is not supported in inference for now"
-        pass
@@ -0,0 +1,79 @@
+import argparse
+import os
+
+import torch
+import torch.distributed as dist
+from auto_gptq import AutoGPTQForCausalLM
+
+import colossalai
+from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+
+os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"
+
+
+def run_llama_test(args):
+    quantized_model_dir = args.quantized_path
+    max_batch_size = args.max_batch_size
+    max_input_len = args.max_input_len
+    max_output_len = args.max_output_len
+    micro_batch_size = args.micro_batch_size
+    # load quantized model to the first GPU
+    model = AutoGPTQForCausalLM.from_quantized(
+        quantized_model_dir, inject_fused_attention=False, device=torch.cuda.current_device()
+    )
+
+    engine = CaiInferEngine(
+        tp_size=2,
+        pp_size=2,
+        model=model,
+        model_policy=LlamaModelInferPolicy(),
+        max_batch_size=max_batch_size,
+        max_input_len=max_input_len,
+        max_output_len=max_output_len,
+        micro_batch_size=micro_batch_size,
+        quant="gptq",
+    )
+
+    def data_gen():
+        input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
+        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+        return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+    inputs = data_gen()
+    for k, v in inputs.items():
+        if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
+            new_shape = [1] * v.dim()
+            new_shape[0] = 16
+            inputs[k] = v.to("cuda").repeat(*new_shape)
+
+    output = engine.inference(inputs)
+    if dist.get_rank() == 0:
+        assert len(output[0]) == max_output_len, f"{len(output)}, {max_output_len}"
+
+
+def check_llama(rank, world_size, port, args):
+    disable_existing_loggers()
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_llama_test(args)
+
+
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_gptq_llama(args):
+    spawn(check_llama, args.tp_size * args.pp_size, args=args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
+    parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
+    parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
+    parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
+    parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
+    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
+    parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
+    args = parser.parse_args()
+
+    test_gptq_llama(args)
@@ -0,0 +1,76 @@
+import argparse
+
+import torch
+import torch.distributed as dist
+
+import colossalai
+from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
+from colossalai.inference.quant.smoothquant.models.llama import SmoothLlamaForCausalLM
+from colossalai.logging import disable_existing_loggers
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn
+
+
+@torch.no_grad()
+def run_llama_test(args):
+    quantized_model_dir = args.quantized_path
+    max_batch_size = args.max_batch_size
+    max_input_len = args.max_input_len
+    max_output_len = args.max_output_len
+    micro_batch_size = args.micro_batch_size
+
+    def data_gen():
+        input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
+        attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+        return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+    inputs = data_gen()
+    for k, v in inputs.items():
+        if torch.is_tensor(v) or "Tensor" in v.__class__.__name__:
+            new_shape = [1] * v.dim()
+            new_shape[0] = 16
+            inputs[k] = v.to("cuda").repeat(*new_shape)
+
+    model = SmoothLlamaForCausalLM.from_quantized(quantized_model_dir, model_basename="llama-7b")
+    model = model.cuda()
+
+    engine = CaiInferEngine(
+        tp_size=2,
+        pp_size=2,
+        model=model,
+        model_policy=LlamaModelInferPolicy(),
+        max_batch_size=max_batch_size,
+        max_input_len=max_input_len,
+        max_output_len=max_output_len,
+        micro_batch_size=micro_batch_size,
+        quant="smoothquant",
+    )
+
+    output = engine.inference(inputs)
+    if dist.get_rank() == 0:
+        assert len(output[0]) == 32, f"{len(output)}, {32}"
+
+
+def check_llama(rank, world_size, port, args):
+    disable_existing_loggers()
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_llama_test(args)
+
+
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def test_smoothquant_llama():
+    spawn(check_llama, args.tp_size * args.pp_size, args=args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
+    parser.add_argument("--tp_size", type=int, default=2, help="Tensor parallel size")
+    parser.add_argument("--pp_size", type=int, default=2, help="Pipeline parallel size")
+    parser.add_argument("--max_batch_size", type=int, default=4, help="Maximum batch size")
+    parser.add_argument("--micro_batch_size", type=int, default=4, help="Micro batch size")
+    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
+    parser.add_argument("--max_output_len", type=int, default=32, help="Maximum output length")
+
+    args = parser.parse_args()
+    test_smoothquant_llama()
@@ -9,6 +9,10 @@ from colossalai.inference import BloomModelInferPolicy, CaiInferEngine
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 
 CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
+try:
+    HAS_LIGHTLLM_KERNEL = True
+except:
+    HAS_LIGHTLLM_KERNEL = False
 
 
 def data_gen():
@@ -88,7 +92,10 @@ def check_tp_inference(rank, world_size, port):
     run_tp_inference_test()
 
 
-@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
+@pytest.mark.skipif(
+    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
+    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
+)
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
@@ -9,6 +9,10 @@ from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
 from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 
 CUDA_SUPPORT = version.parse(torch.version.cuda) > version.parse("11.5")
+try:
+    HAS_LIGHTLLM_KERNEL = True
+except:
+    HAS_LIGHTLLM_KERNEL = False
 
 
 def data_gen():
@@ -90,7 +94,10 @@ def check_tp_inference(rank, world_size, port):
     run_tp_inference_test()
 
 
-@pytest.mark.skipif(not CUDA_SUPPORT, reason="kv-cache manager engine requires cuda version to be higher than 11.5")
+@pytest.mark.skipif(
+    not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
+    reason="kv-cache manager engine requires cuda version to be higher than 11.5",
+)
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()