ColossalAI/colossalai/kernel/triton/gptq_triton.py

# Adapted from AutoGPTQ auto_gptq: https://github.com/PanQiWei/AutoGPTQ

import torch
import triton
import triton.language as tl
from auto_gptq.nn_modules.triton_utils import custom_autotune


@triton.jit
def tanh(x):
    """Hyperbolic tangent, computed through the identity tanh(x) = 2 * sigmoid(2x) - 1."""
    doubled_sigmoid = 2 * tl.sigmoid(2 * x)
    return doubled_sigmoid - 1


@triton.jit
def cosh(x):
    """Hyperbolic cosine: the mean of exp(x) and its reciprocal."""
    grow = tl.exp(x)
    decay = 1.0 / grow
    return (grow + decay) * 0.5


# a Triton implementation of the most used activations
# See for instance http://arxiv.org/abs/1606.08415 for an overview


# ReLU
@triton.jit
def relu(x):
    """
    Rectified linear unit (ReLU_): keeps non-negative entries, zeroes the rest.

    .. _ReLU: https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html
    """
    keep = x >= 0
    return tl.where(keep, x, 0.0)


@triton.jit
def squared_relu(x):
    """
    Squared ReLU from the Primer_ paper: x * x for strictly positive x, 0 otherwise.

    .. _Primer: https://arxiv.org/abs/2109.08668
    """
    is_positive = x > 0.0
    squared = x * x
    return tl.where(is_positive, squared, 0.0)


@triton.jit
def star_relu(x):
    """
    Star ReLU from the "MetaFormer Baselines for Vision"_ paper.

    Evaluates 0.8944 * relu(x)^2 - 0.4472 with the fixed scale and bias used below.

    .. _ "MetaFormer Baselines for Vision": https://arxiv.org/pdf/2210.13452.pdf
    """
    squared = x * x
    rectified_sq = tl.where(x > 0.0, squared, 0.0)
    scaled = 0.8944 * rectified_sq
    return scaled - 0.4472


# Leaky ReLU
@triton.jit
def leaky_relu(x):
    """
    LeakyReLU_ with a fixed negative slope of 0.01.

    .. _LeakyReLU: https://pytorch.org/docs/stable/generated/torch.nn.LeakyReLU.html
    """
    leaked = 0.01 * x
    is_nonneg = x >= 0.0
    return tl.where(is_nonneg, x, leaked)


@triton.jit
def gelu(x):
    """
    GeLU_ activation - Gaussian error linear unit (tanh approximation)

    Computes 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).

    .. _GeLU: https://arxiv.org/pdf/1606.08415.pdf
    """
    # sqrt(2 / pi), written as a literal: the module never defined the `_kAlpha`
    # global the original expression referenced, so the kernel could not compile
    # whenever this activation was selected.
    k_alpha = 0.7978845608028654
    inner = k_alpha * (x + 0.044715 * x * x * x)
    return 0.5 * x * (1 + tanh(inner))


@triton.jit
def smelu(x):
    """
    SmeLU_ activation - smooth ReLU with the half-width fixed at beta=2.0

    Inside [-beta, beta] the output is the quadratic (x + beta)^2 / (4 * beta);
    above beta it is x, below -beta it is 0.

    .. _SmeLU: https://arxiv.org/pdf/2202.06499.pdf
    """
    beta = 2.0

    # Value used outside the quadratic band: identity on the right, zero on the left.
    outside_band = tl.where(x >= beta, x, 0.0)
    shifted = x + beta
    quadratic = shifted * shifted / (4.0 * beta)
    in_band = tl.abs(x) <= beta
    return tl.where(in_band, quadratic, outside_band)


@triton.jit
def silu(x):
    """SiLU activation: the input multiplied by its own sigmoid."""
    gate = tl.sigmoid(x)
    return x * gate


# Candidate tile shapes / pipeline depths for the autotuner; the chosen config is
# keyed on the problem sizes M, N and K.
@custom_autotune.autotune(
    configs=[
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=8
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=4
        ),
    ],
    key=["M", "N", "K"],
    nearest_power_of_two=True,
    prune_configs_by={
        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
        "perf_model": None,
        "top_k": None,
    },
)
@triton.jit
def cai_gptq_matmul_248_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    scales_ptr,
    zeros_ptr,
    bias_ptr,
    residual_ptr,
    M,
    N,
    K,
    bits,
    maxq,
    gptq_group_size,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_scales,
    stride_zeros,
    QKV_FUSED: tl.constexpr,
    ADD_BIAS: tl.constexpr,
    ADD_RESIDUAL: tl.constexpr,
    ACT_TYPE: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """
    Fused dequantize + matmul: C = act(A x dequant(B) + bias) + residual.

    Quantization groups are consecutive runs of `gptq_group_size` rows along K
    (the group index of row k is k // gptq_group_size).

    A is of shape (M, K); the host wrapper in this file allocates C as float16
    B is of shape (K // (32 // bits), N), `32 // bits` values packed per 32-bit word
    C is of shape (M, N)
    scales is indexed as (G, N) with row stride `stride_scales`
    zeros is indexed as (G, N // (32 // bits)) packed words with row stride
        `stride_zeros` (it is unpacked with shifts and `maxq` below, so it is an
        integer tensor rather than float16)

    Each program computes one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of C. The 1-D
    program id additionally encodes `qkv_offset`: program ids beyond the first
    num_pid_m * num_pid_n tiles select the next weight/scales/zeros/output slab
    (the wrapper launches 3x as many programs when qkv_fused is set).

    ACT_TYPE selects the epilogue activation: 1 = relu, 2 = gelu, 3 = silu,
    anything else = none. ADD_BIAS / ADD_RESIDUAL enable the respective adds.
    QKV_FUSED is accepted but not referenced in the kernel body.
    """
    # Number of quantized values stored in one 32-bit word of B / zeros.
    infearure_per_bits = 32 // bits

    pid = tl.program_id(axis=0)
    NK = K

    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_k = tl.cdiv(NK, BLOCK_SIZE_K)
    # Which slab (0 for a plain matmul; 0..2 for the fused launch) this program works on.
    qkv_offset = pid // (num_pid_m * num_pid_n)
    pid = pid % (num_pid_m * num_pid_n)
    # Map the remaining tile id to (pid_m, pid_n), walking GROUP_SIZE_M row-tiles
    # at a time (same ordering as the grouped scheme in the Triton matmul tutorial).
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # offs_bk = offs_k + qkv_offset * NK
    # A carries no qkv_offset term: every slab multiplies the same input.
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)

    # Only rows are masked; the K extent is never masked.
    # NOTE(review): presumably K is a multiple of BLOCK_SIZE_K - confirm with callers.
    a_mask = offs_am[:, None] < M
    # b_ptrs is set up such that it repeats elements along the K axis
    # `infearure_per_bits` times (each packed word serves that many K rows).
    # The qkv_offset term skips whole weight slabs of N * K / infearure_per_bits words;
    # NOTE(review): assumes the fused weights are stored back-to-back - confirm.
    b_ptrs = (
        b_ptr
        + qkv_offset * N * NK // infearure_per_bits
        + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
    # g_ptrs = g_ptr + offs_k
    # Column base pointers into scales / packed zeros, shifted to the slab's region.
    scales_ptrs = scales_ptr + qkv_offset * NK * N // gptq_group_size + offs_bn[None, :]
    zeros_ptrs = (
        zeros_ptr
        + qkv_offset * NK * N // gptq_group_size // infearure_per_bits
        + (offs_bn[None, :] // infearure_per_bits)
    )

    # shifter is used to extract the `bits`-wide field of each K row from its 32-bit word of B
    shifter = (offs_k % infearure_per_bits) * bits
    # Same idea for zeros, which are packed along N instead of K.
    zeros_shifter = (offs_bn % infearure_per_bits) * bits
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    # Group index of each K row inside the first K tile.
    g_idx_base = tl.arange(0, BLOCK_SIZE_K)
    g_idx_base = g_idx_base // gptq_group_size
    g_idx = g_idx_base
    # tl.device_print("gidx, ", g_idx)

    # NOTE(review): the four statements below are repeated verbatim at the top of
    # the first loop iteration, so these results are overwritten before being used.
    scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
    zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
    zeros = (zeros >> zeros_shifter[None, :]) & maxq
    zeros = zeros + 1

    for k in range(0, num_pid_k):
        # g_idx = tl.load(g_ptrs)
        # if (k + 1) * BLOCK_SIZE_K > currend_group_end:
        # Per-row scales and zero points for the groups covered by this K tile.
        scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
        zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
        zeros = (zeros >> zeros_shifter[None, :]) & maxq
        # NOTE(review): presumably undoes a -1 applied to zero points at packing time - confirm.
        zeros = zeros + 1
        # Load the A tile (row-masked) and the packed B tile (unmasked).
        # NOTE(review): the scales/zeros/B loads carry no mask, so N is presumably
        # a multiple of BLOCK_SIZE_N - confirm.
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
        # Now we need to unpack b (which is N-bit values) into 32-bit values
        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
        b = (b - zeros).to(tl.float16) * scales  # Scale and shift
        accumulator += tl.dot(a, b)

        # NOTE(review): A advances by BLOCK_SIZE_K elements without multiplying by
        # stride_ak, so this is only correct when stride_ak == 1.
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
        # Group indices for the next K tile.
        g_idx = g_idx_base + ((k + 1) * BLOCK_SIZE_K) // gptq_group_size
        # if (k + 2) * BLOCK_SIZE_K > currend_group_end:

    # Output tile; each slab writes its own M * N region of C.
    c_ptrs = c_ptr + qkv_offset * M * N + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)

    if ADD_BIAS:
        # The mask is taken before the shift so it still refers to in-slab columns.
        bias_mask = offs_bn < N
        # NOTE(review): this mutates offs_bn, so residual_ptrs below is also shifted
        # by qkv_offset * N, but only when ADD_BIAS is set.
        offs_bn += qkv_offset * N
        # Bias is addressed with the output's column stride (stride_cn).
        bias_ptrs = bias_ptr + stride_cn * offs_bn
        bias = tl.load(bias_ptrs, mask=bias_mask, other=0.0)  # (BLOCK_SIZE_N,)
        accumulator += bias[None, :]

    if ACT_TYPE == 1:
        accumulator = relu(accumulator)
    elif ACT_TYPE == 2:
        accumulator = gelu(accumulator)
    elif ACT_TYPE == 3:
        accumulator = silu(accumulator)

    if ADD_RESIDUAL:
        # The residual is addressed with the output strides and has no explicit
        # qkv_offset * M * N term (see the note on offs_bn above).
        residual_ptrs = residual_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
        res = tl.load(residual_ptrs, mask=c_mask, other=0.0)
        accumulator += res

    # The float32 accumulator is written through c_ptr; tl.store casts to the pointer's element type.
    tl.store(c_ptrs, accumulator, mask=c_mask)


# Candidate tile shapes / pipeline depths for the autotuner; the chosen config is
# keyed on the problem sizes M, N and K.
@custom_autotune.autotune(
    configs=[
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=8
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8
        ),
        triton.Config(
            {"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=4
        ),
    ],
    key=["M", "N", "K"],
    nearest_power_of_two=True,
    prune_configs_by={
        "early_config_prune": custom_autotune.matmul248_kernel_config_pruner,
        "perf_model": None,
        "top_k": None,
    },
)
@triton.jit
def cai_gptq_idx_matmul_248_kernel(
    a_ptr,
    b_ptr,
    c_ptr,
    scales_ptr,
    zeros_ptr,
    idx_ptr,
    bias_ptr,
    residual_ptr,
    M,
    N,
    K,
    bits,
    maxq,
    gptq_group_size,
    stride_am,
    stride_ak,
    stride_bk,
    stride_bn,
    stride_cm,
    stride_cn,
    stride_scales,
    stride_zeros,
    QKV_FUSED: tl.constexpr,
    ADD_BIAS: tl.constexpr,
    ADD_RESIDUAL: tl.constexpr,
    ACT_TYPE: tl.constexpr,
    BLOCK_SIZE_M: tl.constexpr,
    BLOCK_SIZE_N: tl.constexpr,
    BLOCK_SIZE_K: tl.constexpr,
    GROUP_SIZE_M: tl.constexpr,
):
    """
    Fused dequantize + matmul: C = act(A x dequant(B) + bias) + residual.

    Unlike the kernel without an index tensor, the quantization group of each
    K row is read from `idx_ptr` (one entry per K row) instead of being derived
    as k // gptq_group_size.

    A is of shape (M, K); the host wrapper in this file allocates C as float16
    B is of shape (K // (32 // bits), N), `32 // bits` values packed per 32-bit word
    C is of shape (M, N)
    scales is indexed as (G, N) with row stride `stride_scales`
    zeros is indexed as (G, N // (32 // bits)) packed words with row stride
        `stride_zeros` (it is unpacked with shifts and `maxq` below, so it is an
        integer tensor rather than float16)

    Each program computes one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of C. The 1-D
    program id additionally encodes `qkv_offset`: program ids beyond the first
    num_pid_m * num_pid_n tiles select the next slab (the wrapper launches 3x as
    many programs when qkv_fused is set).

    ACT_TYPE selects the epilogue activation: 1 = relu, 2 = gelu, 3 = silu,
    anything else = none. ADD_BIAS / ADD_RESIDUAL enable the respective adds.
    QKV_FUSED is accepted but only appears in commented-out code.
    """
    # Number of quantized values stored in one 32-bit word of B / zeros.
    infearure_per_bits = 32 // bits

    pid = tl.program_id(axis=0)
    NK = K

    # if QKV_FUSED:
    #     NK = K//3
    # else:
    #     NK = K
    # NK = K

    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    num_pid_k = tl.cdiv(NK, BLOCK_SIZE_K)
    # Which slab (0 for a plain matmul; 0..2 for the fused launch) this program works on.
    qkv_offset = pid // (num_pid_m * num_pid_n)
    pid = pid % (num_pid_m * num_pid_n)
    # Map the remaining tile id to (pid_m, pid_n), walking GROUP_SIZE_M row-tiles
    # at a time (same ordering as the grouped scheme in the Triton matmul tutorial).
    num_pid_in_group = GROUP_SIZE_M * num_pid_n
    group_id = pid // num_pid_in_group
    first_pid_m = group_id * GROUP_SIZE_M
    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    pid_m = first_pid_m + (pid % group_size_m)
    pid_n = (pid % num_pid_in_group) // group_size_m

    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    offs_k = tl.arange(0, BLOCK_SIZE_K)
    # offs_bk = offs_k + qkv_offset * NK
    # A carries no qkv_offset term: every slab multiplies the same input.
    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)

    # Only rows are masked; the K extent is never masked.
    # NOTE(review): presumably K is a multiple of BLOCK_SIZE_K - confirm with callers.
    a_mask = offs_am[:, None] < M
    # b_ptrs is set up such that it repeats elements along the K axis
    # `infearure_per_bits` times (each packed word serves that many K rows).
    # The qkv_offset term skips whole weight slabs of N * K / infearure_per_bits words;
    # NOTE(review): assumes the fused weights are stored back-to-back - confirm.
    b_ptrs = (
        b_ptr
        + qkv_offset * N * NK // infearure_per_bits
        + ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)
    )  # (BLOCK_SIZE_K, BLOCK_SIZE_N)
    # g_ptrs = g_ptr + offs_k
    # Column base pointers into scales / packed zeros, shifted to the slab's region.
    scales_ptrs = scales_ptr + qkv_offset * NK * N // gptq_group_size + offs_bn[None, :]
    zeros_ptrs = (
        zeros_ptr
        + qkv_offset * NK * N // gptq_group_size // infearure_per_bits
        + (offs_bn[None, :] // infearure_per_bits)
    )

    # shifter is used to extract the `bits`-wide field of each K row from its 32-bit word of B
    shifter = (offs_k % infearure_per_bits) * bits
    # Same idea for zeros, which are packed along N instead of K.
    zeros_shifter = (offs_bn % infearure_per_bits) * bits
    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    # Pointers to the group index of each K row in the current K tile
    # (no qkv_offset term: the same index vector is used for every slab).
    g_ptrs = idx_ptr + offs_k
    # NOTE(review): this load is repeated at the top of the first loop iteration.
    g_idx = tl.load(g_ptrs)
    # tl.device_print("gidx, ", g_idx)
    # NOTE(review): this reassignment discards the qkv_offset term computed for
    # zeros_ptrs just above, while scales_ptrs keeps its qkv_offset term - confirm
    # whether the fused path is meant to share zero points across slabs.
    zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)

    # NOTE(review): overwritten by the identical load in the first loop iteration.
    scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)

    for k in range(0, num_pid_k):
        # Group index per K row, then the matching scales and packed zero points.
        g_idx = tl.load(g_ptrs)
        scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)
        zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros)  # (BLOCK_SIZE_K, BLOCK_SIZE_N,)

        zeros = (zeros >> zeros_shifter[None, :]) & maxq
        # NOTE(review): presumably undoes a -1 applied to zero points at packing time - confirm.
        zeros = zeros + 1

        # Load the A tile (row-masked) and the packed B tile (unmasked).
        # NOTE(review): the g_idx/scales/zeros/B loads carry no mask, so N and K are
        # presumably multiples of the block sizes - confirm.
        a = tl.load(a_ptrs, mask=a_mask, other=0.0)  # (BLOCK_SIZE_M, BLOCK_SIZE_K)
        b = tl.load(b_ptrs)  # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated
        # Now we need to unpack b (which is N-bit values) into 32-bit values
        b = (b >> shifter[:, None]) & maxq  # Extract the N-bit values
        b = (b - zeros).to(tl.float16) * scales  # Scale and shift
        accumulator += tl.dot(a, b)

        # NOTE(review): A advances by BLOCK_SIZE_K elements without multiplying by
        # stride_ak, so this is only correct when stride_ak == 1.
        a_ptrs += BLOCK_SIZE_K
        b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk
        g_ptrs += BLOCK_SIZE_K

    # Output tile; each slab writes its own M * N region of C.
    c_ptrs = c_ptr + qkv_offset * M * N + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
    c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)

    if ADD_BIAS:
        # The mask is taken before the shift so it still refers to in-slab columns.
        bias_mask = offs_bn < N
        # NOTE(review): this mutates offs_bn, so residual_ptrs below is also shifted
        # by qkv_offset * N, but only when ADD_BIAS is set.
        offs_bn += qkv_offset * N
        # Bias is addressed with the output's column stride (stride_cn).
        bias_ptrs = bias_ptr + stride_cn * offs_bn
        bias = tl.load(bias_ptrs, mask=bias_mask, other=0.0)  # (BLOCK_SIZE_N,)
        accumulator += bias[None, :]

    if ACT_TYPE == 1:
        accumulator = relu(accumulator)
    elif ACT_TYPE == 2:
        accumulator = gelu(accumulator)
    elif ACT_TYPE == 3:
        accumulator = silu(accumulator)

    if ADD_RESIDUAL:
        # The residual is addressed with the output strides and has no explicit
        # qkv_offset * M * N term (see the note on offs_bn above).
        residual_ptrs = residual_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]
        res = tl.load(residual_ptrs, mask=c_mask, other=0.0)
        accumulator += res

    # The float32 accumulator is written through c_ptr; tl.store casts to the pointer's element type.
    tl.store(c_ptrs, accumulator, mask=c_mask)


def gptq_fused_linear_triton(
    input,
    qweight,
    scales,
    qzeros,
    bias,
    residual,
    bits,
    maxq,
    gptq_group_size,
    qkv_fused,
    add_bias,
    add_residual,
    g_idx=None,
    act_type=0,
):
    """Launch the fused GPTQ dequantize + linear Triton kernel and return its output.

    Args:
        input: CUDA activation tensor; shape[0] is passed as M and shape[1] as K.
        qweight: CUDA packed weight tensor; shape[1] is passed as N.
        scales: CUDA tensor of quantization scales.
        qzeros: CUDA tensor of packed quantization zero points.
        bias: bias tensor handed to the kernel (read only when ``add_bias``).
        residual: residual tensor handed to the kernel (read only when ``add_residual``).
        bits: bit width of one quantized weight.
        maxq: mask used by the kernel to extract one quantized value.
        gptq_group_size: number of input features per quantization group.
        qkv_fused: when truthy, three times as many programs are launched and the
            result is returned as a ``(3, M, N)`` view.
        add_bias: enables the bias add inside the kernel.
        add_residual: enables the residual add inside the kernel.
        g_idx: optional group-index tensor; when given, the index-based kernel is used.
        act_type: activation selector forwarded to the kernel as ``ACT_TYPE``.

    Returns:
        A float16 tensor of shape ``(M, N)``, or ``(3, M, N)`` when ``qkv_fused``.
    """
    device_checks = (
        (input, "input is not in cuda"),
        (qweight, "qweight is not in cuda"),
        (scales, "scales is not in cuda"),
        (qzeros, "qzeros is not in cuda"),
    )
    for tensor, message in device_checks:
        assert tensor.is_cuda, message

    num_rows = input.shape[0]
    out_features = qweight.shape[1]
    # The fused path stacks three problems along the program-id axis.
    num_problems = 3 if qkv_fused else 1

    def grid(meta):
        tiles = triton.cdiv(num_rows, meta["BLOCK_SIZE_M"]) * triton.cdiv(out_features, meta["BLOCK_SIZE_N"])
        return (tiles * num_problems,)

    with torch.cuda.device(input.device):
        output = torch.empty((num_rows * num_problems, out_features), device=input.device, dtype=torch.float16)

        # Both kernels share every argument except the optional group-index
        # tensor, which sits between the zero points and the bias.
        tensor_args = (input, qweight, output, scales, qzeros)
        shared_args = (
            bias,
            residual,
            num_rows,
            out_features,
            input.shape[1],
            bits,
            maxq,
            gptq_group_size,
            input.stride(0),
            input.stride(1),
            qweight.stride(0),
            qweight.stride(1),
            output.stride(0),
            output.stride(1),
            scales.stride(0),
            qzeros.stride(0),
        )
        constexpr_flags = {
            "QKV_FUSED": qkv_fused,
            "ADD_BIAS": add_bias,
            "ADD_RESIDUAL": add_residual,
            "ACT_TYPE": act_type,
        }

        if g_idx is None:
            cai_gptq_matmul_248_kernel[grid](*tensor_args, *shared_args, **constexpr_flags)
        else:
            cai_gptq_idx_matmul_248_kernel[grid](*tensor_args, g_idx, *shared_args, **constexpr_flags)

        if not qkv_fused:
            return output
        return output.view(3, num_rows, out_features)
[feature] add gptq for inference (#4754) * [gptq] add gptq kernel (#4416) * add gptq * refactor code * fix tests * replace auto-gptq * rname inferance/quant * refactor test * add auto-gptq as an option * reset requirements * change assert and check auto-gptq * add import warnings * change test flash attn version * remove example * change requirements of flash_attn * modify tests * [skip ci] change requirements-test * [gptq] faster gptq cuda kernel (#4494) * [skip ci] add cuda kernels * add license * [skip ci] fix max_input_len * format files & change test size * [skip ci] * [gptq] add gptq tensor parallel (#4538) * add gptq tensor parallel * add gptq tp * delete print * add test gptq check * add test auto gptq check * [gptq] combine gptq and kv cache manager (#4706) * combine gptq and kv cache manager * add init bits * delete useless code * add model path * delete usless print and update test * delete usless import * move option gptq to shard config * change replace linear to shardformer * update bloom policy * delete useless code * fix import bug and delete uselss code * change colossalai/gptq to colossalai/quant/gptq * update import linear for tests * delete useless code and mv gptq_kernel to kernel directory * fix triton kernel * add triton import 2023-09-22 03:02:50 +00:00			`# Adapted from AutoGPTQ auto_gptq: https://github.com/PanQiWei/AutoGPTQ`

			`import torch`
			`import triton`
			`import triton.language as tl`
			`from auto_gptq.nn_modules.triton_utils import custom_autotune`


			`@triton.jit`
			`def tanh(x):`
			`# Tanh is just a scaled sigmoid`
			`return 2 * tl.sigmoid(2 * x) - 1`


			`@triton.jit`
			`def cosh(x):`
			`exp_x = tl.exp(x)`
			`return (exp_x + 1.0 / exp_x) * 0.5`


			`# a Triton implementation of the most used activations`
			`# See for instance http://arxiv.org/abs/1606.08415 for an overview`


			`# ReLU`
			`@triton.jit`
			`def relu(x):`
			`"""`
			`ReLU_ activation function`

			`.. _ReLU: https://pytorch.org/docs/stable/generated/torch.nn.ReLU.html`
			`"""`
			`return tl.where(x >= 0, x, 0.0)`


			`@triton.jit`
			`def squared_relu(x):`
			`"""`
			`Squared ReLU activation, as proposed in the Primer_ paper.`

			`.. _Primer: https://arxiv.org/abs/2109.08668`
			`"""`
			`x_sq = x * x`
			`return tl.where(x > 0.0, x_sq, 0.0)`


			`@triton.jit`
			`def star_relu(x):`
			`"""`
			`Star ReLU activation, as proposed in the "MetaFormer Baselines for Vision"_ paper.`

			`.. _ "MetaFormer Baselines for Vision": https://arxiv.org/pdf/2210.13452.pdf`
			`"""`
			`x_sq = x * x`
			`return 0.8944 * tl.where(x > 0.0, x_sq, 0.0) - 0.4472`


			`# Leaky ReLU`
			`@triton.jit`
			`def leaky_relu(x):`
			`"""`
			`LeakyReLU_ activation`

			`.. _LeakyReLU: https://pytorch.org/docs/stable/generated/torch.nn.LeakyReLU.html`
			`"""`
			`return tl.where(x >= 0.0, x, 0.01 * x)`


			`@triton.jit`
			`def gelu(x):`
			`"""`
			`GeLU_ activation - Gaussian error linear unit`

			`.. _GeLU: https://arxiv.org/pdf/1606.08415.pdf`
			`"""`
			`return 0.5 * x * (1 + tanh(_kAlpha * (x + 0.044715 * x * x * x)))`


			`@triton.jit`
			`def smelu(x):`
			`"""`
			`SmeLU_ activation - Smooth ReLU with beta=2.0`

			`.. _SmeLU: https://arxiv.org/pdf/2202.06499.pdf`
			`"""`
			`beta = 2.0`

			`relu = tl.where(x >= beta, x, 0.0)`
			`return tl.where(tl.abs(x) <= beta, (x + beta) * (x + beta) / (4.0 * beta), relu)`


			`@triton.jit`
			`def silu(x):`
			`return x * tl.sigmoid(x)`


			`@custom_autotune.autotune(`
			`configs=[`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=8`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=4`
			`),`
			`],`
			`key=["M", "N", "K"],`
			`nearest_power_of_two=True,`
			`prune_configs_by={`
			`"early_config_prune": custom_autotune.matmul248_kernel_config_pruner,`
			`"perf_model": None,`
			`"top_k": None,`
			`},`
			`)`
			`@triton.jit`
			`def cai_gptq_matmul_248_kernel(`
			`a_ptr,`
			`b_ptr,`
			`c_ptr,`
			`scales_ptr,`
			`zeros_ptr,`
			`bias_ptr,`
			`residual_ptr,`
			`M,`
			`N,`
			`K,`
			`bits,`
			`maxq,`
			`gptq_group_size,`
			`stride_am,`
			`stride_ak,`
			`stride_bk,`
			`stride_bn,`
			`stride_cm,`
			`stride_cn,`
			`stride_scales,`
			`stride_zeros,`
			`QKV_FUSED: tl.constexpr,`
			`ADD_BIAS: tl.constexpr,`
			`ADD_RESIDUAL: tl.constexpr,`
			`ACT_TYPE: tl.constexpr,`
			`BLOCK_SIZE_M: tl.constexpr,`
			`BLOCK_SIZE_N: tl.constexpr,`
			`BLOCK_SIZE_K: tl.constexpr,`
			`GROUP_SIZE_M: tl.constexpr,`
			`):`
			`"""`
			`Compute the matrix multiplication C = A x B.`
			`A is of shape (M, K) float16`
			`B is of shape (K//8, N) int32`
			`C is of shape (M, N) float16`
			`scales is of shape (G, N) float16`
			`zeros is of shape (G, N) float16`
			`"""`
			`infearure_per_bits = 32 // bits`

			`pid = tl.program_id(axis=0)`
			`NK = K`

			`num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)`
			`num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)`
			`num_pid_k = tl.cdiv(NK, BLOCK_SIZE_K)`
			`qkv_offset = pid // (num_pid_m * num_pid_n)`
			`pid = pid % (num_pid_m * num_pid_n)`
			`num_pid_in_group = GROUP_SIZE_M * num_pid_n`
			`group_id = pid // num_pid_in_group`
			`first_pid_m = group_id * GROUP_SIZE_M`
			`group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)`
			`pid_m = first_pid_m + (pid % group_size_m)`
			`pid_n = (pid % num_pid_in_group) // group_size_m`

			`offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)`
			`offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)`
			`offs_k = tl.arange(0, BLOCK_SIZE_K)`
			`# offs_bk = offs_k + qkv_offset * NK`
			`a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K)`

			`a_mask = offs_am[:, None] < M`
			`# b_ptrs is set up such that it repeats elements along the K axis 8 times`
			`b_ptrs = (`
			`b_ptr`
			`+ qkv_offset * N * NK // infearure_per_bits`
			`+ ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)`
			`) # (BLOCK_SIZE_K, BLOCK_SIZE_N)`
			`# g_ptrs = g_ptr + offs_k`
			`# shifter is used to extract the N bits of each element in the 32-bit word from B`
			`scales_ptrs = scales_ptr + qkv_offset * NK * N // gptq_group_size + offs_bn[None, :]`
			`zeros_ptrs = (`
			`zeros_ptr`
			`+ qkv_offset * NK * N // gptq_group_size // infearure_per_bits`
			`+ (offs_bn[None, :] // infearure_per_bits)`
			`)`

			`shifter = (offs_k % infearure_per_bits) * bits`
			`zeros_shifter = (offs_bn % infearure_per_bits) * bits`
			`accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)`
			`g_idx_base = tl.arange(0, BLOCK_SIZE_K)`
			`g_idx_base = g_idx_base // gptq_group_size`
			`g_idx = g_idx_base`
			`# tl.device_print("gidx, ", g_idx)`

			`scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)`
			`zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)`
			`zeros = (zeros >> zeros_shifter[None, :]) & maxq`
			`zeros = zeros + 1`

			`for k in range(0, num_pid_k):`
			`# g_idx = tl.load(g_ptrs)`
			`# if (k + 1) * BLOCK_SIZE_K > currend_group_end:`
			`scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)`
			`zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)`
			`zeros = (zeros >> zeros_shifter[None, :]) & maxq`
			`zeros = zeros + 1`
			`# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop`
			`a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)`
			`b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated`
			`# Now we need to unpack b (which is N-bit values) into 32-bit values`
			`b = (b >> shifter[:, None]) & maxq # Extract the N-bit values`
			`b = (b - zeros).to(tl.float16) * scales # Scale and shift`
			`accumulator += tl.dot(a, b)`

			`a_ptrs += BLOCK_SIZE_K`
			`b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk`
			`g_idx = g_idx_base + ((k + 1) * BLOCK_SIZE_K) // gptq_group_size`
			`# if (k + 2) * BLOCK_SIZE_K > currend_group_end:`

			`c_ptrs = c_ptr + qkv_offset * M * N + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]`
			`c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)`

			`if ADD_BIAS:`
			`bias_mask = offs_bn < N`
			`offs_bn += qkv_offset * N`
			`bias_ptrs = bias_ptr + stride_cn * offs_bn`
			`bias = tl.load(bias_ptrs, mask=bias_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)`
			`accumulator += bias[None, :]`

			`if ACT_TYPE == 1:`
			`accumulator = relu(accumulator)`
			`elif ACT_TYPE == 2:`
			`accumulator = gelu(accumulator)`
			`elif ACT_TYPE == 3:`
			`accumulator = silu(accumulator)`

			`if ADD_RESIDUAL:`
			`residual_ptrs = residual_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]`
			`res = tl.load(residual_ptrs, mask=c_mask, other=0.0)`
			`accumulator += res`

			`tl.store(c_ptrs, accumulator, mask=c_mask)`


			`@custom_autotune.autotune(`
			`configs=[`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 128, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=4, num_warps=4`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=8`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 64, "BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64, "GROUP_SIZE_M": 8}, num_stages=3, num_warps=8`
			`),`
			`triton.Config(`
			`{"BLOCK_SIZE_M": 32, "BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128, "GROUP_SIZE_M": 8}, num_stages=2, num_warps=4`
			`),`
			`],`
			`key=["M", "N", "K"],`
			`nearest_power_of_two=True,`
			`prune_configs_by={`
			`"early_config_prune": custom_autotune.matmul248_kernel_config_pruner,`
			`"perf_model": None,`
			`"top_k": None,`
			`},`
			`)`
			`@triton.jit`
			`def cai_gptq_idx_matmul_248_kernel(`
			`a_ptr,`
			`b_ptr,`
			`c_ptr,`
			`scales_ptr,`
			`zeros_ptr,`
			`idx_ptr,`
			`bias_ptr,`
			`residual_ptr,`
			`M,`
			`N,`
			`K,`
			`bits,`
			`maxq,`
			`gptq_group_size,`
			`stride_am,`
			`stride_ak,`
			`stride_bk,`
			`stride_bn,`
			`stride_cm,`
			`stride_cn,`
			`stride_scales,`
			`stride_zeros,`
			`QKV_FUSED: tl.constexpr,`
			`ADD_BIAS: tl.constexpr,`
			`ADD_RESIDUAL: tl.constexpr,`
			`ACT_TYPE: tl.constexpr,`
			`BLOCK_SIZE_M: tl.constexpr,`
			`BLOCK_SIZE_N: tl.constexpr,`
			`BLOCK_SIZE_K: tl.constexpr,`
			`GROUP_SIZE_M: tl.constexpr,`
			`):`
			`"""`
			`Compute the matrix multiplication C = A x B.`
			`A is of shape (M, K) float16`
			`B is of shape (K//8, N) int32`
			`C is of shape (M, N) float16`
			`scales is of shape (G, N) float16`
			`zeros is of shape (G, N) float16`
			`"""`
			`infearure_per_bits = 32 // bits`

			`pid = tl.program_id(axis=0)`
			`NK = K`

			`# if QKV_FUSED:`
			`# NK = K//3`
			`# else:`
			`# NK = K`
			`# NK = K`

			`num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)`
			`num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)`
			`num_pid_k = tl.cdiv(NK, BLOCK_SIZE_K)`
			`qkv_offset = pid // (num_pid_m * num_pid_n)`
			`pid = pid % (num_pid_m * num_pid_n)`
			`num_pid_in_group = GROUP_SIZE_M * num_pid_n`
			`group_id = pid // num_pid_in_group`
			`first_pid_m = group_id * GROUP_SIZE_M`
			`group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)`
			`pid_m = first_pid_m + (pid % group_size_m)`
			`pid_n = (pid % num_pid_in_group) // group_size_m`

			`offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)`
			`offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)`
			`offs_k = tl.arange(0, BLOCK_SIZE_K)`
			`# offs_bk = offs_k + qkv_offset * NK`
			`a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) # (BLOCK_SIZE_M, BLOCK_SIZE_K)`

			`a_mask = offs_am[:, None] < M`
			`# b_ptrs is set up such that it repeats elements along the K axis 8 times`
			`b_ptrs = (`
			`b_ptr`
			`+ qkv_offset * N * NK // infearure_per_bits`
			`+ ((offs_k[:, None] // infearure_per_bits) * stride_bk + offs_bn[None, :] * stride_bn)`
			`) # (BLOCK_SIZE_K, BLOCK_SIZE_N)`
			`# g_ptrs = g_ptr + offs_k`
			`# shifter is used to extract the N bits of each element in the 32-bit word from B`
			`scales_ptrs = scales_ptr + qkv_offset * NK * N // gptq_group_size + offs_bn[None, :]`
			`zeros_ptrs = (`
			`zeros_ptr`
			`+ qkv_offset * NK * N // gptq_group_size // infearure_per_bits`
			`+ (offs_bn[None, :] // infearure_per_bits)`
			`)`

			`shifter = (offs_k % infearure_per_bits) * bits`
			`zeros_shifter = (offs_bn % infearure_per_bits) * bits`
			`accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)`
			`g_ptrs = idx_ptr + offs_k`
			`g_idx = tl.load(g_ptrs)`
			`# tl.device_print("gidx, ", g_idx)`
			`zeros_ptrs = zeros_ptr + (offs_bn[None, :] // infearure_per_bits)`

			`scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)`

			`for k in range(0, num_pid_k):`
			`g_idx = tl.load(g_ptrs)`
			`scales = tl.load(scales_ptrs + g_idx[:, None] * stride_scales) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)`
			`zeros = tl.load(zeros_ptrs + g_idx[:, None] * stride_zeros) # (BLOCK_SIZE_K, BLOCK_SIZE_N,)`

			`zeros = (zeros >> zeros_shifter[None, :]) & maxq`
			`zeros = zeros + 1`

			`# Fetch scales and zeros; these are per-outfeature and thus reused in the inner loop`
			`a = tl.load(a_ptrs, mask=a_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)`
			`b = tl.load(b_ptrs) # (BLOCK_SIZE_K, BLOCK_SIZE_N), but repeated`
			`# Now we need to unpack b (which is N-bit values) into 32-bit values`
			`b = (b >> shifter[:, None]) & maxq # Extract the N-bit values`
			`b = (b - zeros).to(tl.float16) * scales # Scale and shift`
			`accumulator += tl.dot(a, b)`

			`a_ptrs += BLOCK_SIZE_K`
			`b_ptrs += (BLOCK_SIZE_K // infearure_per_bits) * stride_bk`
			`g_ptrs += BLOCK_SIZE_K`

			`c_ptrs = c_ptr + qkv_offset * M * N + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]`
			`c_mask = (offs_am[:, None] < M) & (offs_bn[None, :] < N)`

			`if ADD_BIAS:`
			`bias_mask = offs_bn < N`
			`offs_bn += qkv_offset * N`
			`bias_ptrs = bias_ptr + stride_cn * offs_bn`
			`bias = tl.load(bias_ptrs, mask=bias_mask, other=0.0) # (BLOCK_SIZE_M, BLOCK_SIZE_K)`
			`accumulator += bias[None, :]`

			`if ACT_TYPE == 1:`
			`accumulator = relu(accumulator)`
			`elif ACT_TYPE == 2:`
			`accumulator = gelu(accumulator)`
			`elif ACT_TYPE == 3:`
			`accumulator = silu(accumulator)`

			`if ADD_RESIDUAL:`
			`residual_ptrs = residual_ptr + stride_cm * offs_am[:, None] + stride_cn * offs_bn[None, :]`
			`res = tl.load(residual_ptrs, mask=c_mask, other=0.0)`
			`accumulator += res`

			`tl.store(c_ptrs, accumulator, mask=c_mask)`


			def gptq_fused_linear_triton(
			    input,
			    qweight,
			    scales,
			    qzeros,
			    bias,
			    residual,
			    bits,
			    maxq,
			    gptq_group_size,
			    qkv_fused,
			    add_bias,
			    add_residual,
			    g_idx=None,
			    act_type=0,
			):
			    """Launch a GPTQ quantized-matmul Triton kernel with its fused epilogue flags.

			    The kernel receives ``input.shape[0]`` as M, ``qweight.shape[1]`` as N and
			    ``input.shape[1]`` as K, together with the row/column strides of ``input``,
			    ``qweight`` and the freshly allocated output, and the leading strides of
			    ``scales`` and ``qzeros``.

			    Args:
			        input: Activation tensor; only ``shape[0]`` and ``shape[1]`` are read here.
			            NOTE(review): the g_idx kernel's docstring describes it as (M, K) float16.
			        qweight: Packed weight tensor; ``shape[1]`` is the output width.
			            NOTE(review): the g_idx kernel's docstring describes it as (K//8, N) int32.
			        scales: Quantization scales, forwarded to the kernel.
			        qzeros: Packed quantization zero points, forwarded to the kernel.
			        bias: Forwarded to the kernel unchanged; not device-checked here.
			        residual: Forwarded to the kernel unchanged; not device-checked here.
			        bits: Forwarded to the kernel (bit width of the packed weights).
			        maxq: Forwarded to the kernel (mask applied after unpacking).
			        gptq_group_size: Forwarded to the kernel.
			        qkv_fused: When truthy, the launch grid and the output rows are tripled
			            and the result is returned as a 3-way view.
			        add_bias: Passed as the ``ADD_BIAS`` kernel flag.
			        add_residual: Passed as the ``ADD_RESIDUAL`` kernel flag.
			        g_idx: When ``None`` the ``cai_gptq_matmul_248_kernel`` is launched;
			            otherwise ``cai_gptq_idx_matmul_248_kernel`` is launched with ``g_idx``
			            inserted right after ``qzeros``.
			        act_type: Passed as the ``ACT_TYPE`` kernel flag.
			            NOTE(review): in the g_idx kernel 1/2/3 select relu/gelu/silu and
			            any other value applies no activation; presumably the other kernel
			            matches -- confirm.

			    Returns:
			        A float16 tensor on ``input.device`` of shape
			        ``(input.shape[0], qweight.shape[1])``, or, when ``qkv_fused`` is truthy,
			        a ``(3, input.shape[0], qweight.shape[1])`` view of a
			        ``(3 * input.shape[0], qweight.shape[1])`` buffer.

			    Raises:
			        AssertionError: If ``input``, ``qweight``, ``scales`` or ``qzeros``
			            (checked in that order) is not a CUDA tensor.
			    """
			    # Device guards run first and in a fixed order, before any shape is read.
			    device_checks = (
			        (input, "input is not in cuda"),
			        (qweight, "qweight is not in cuda"),
			        (scales, "scales is not in cuda"),
			        (qzeros, "qzeros is not in cuda"),
			    )
			    for tensor, message in device_checks:
			        assert tensor.is_cuda, message

			    rows = input.shape[0]
			    out_features = qweight.shape[1]
			    in_features = input.shape[1]
			    # A fused QKV call computes three projections in a single launch.
			    num_proj = 3 if qkv_fused else 1

			    def grid(META):
			        # 1-D grid: one program per (row tile, column tile) pair and projection.
			        tiles_m = triton.cdiv(rows, META["BLOCK_SIZE_M"])
			        tiles_n = triton.cdiv(out_features, META["BLOCK_SIZE_N"])
			        return (tiles_m * tiles_n * num_proj,)

			    with torch.cuda.device(input.device):
			        output = torch.empty(
			            (rows * num_proj, out_features),
			            device=input.device,
			            dtype=torch.float16,
			        )

			        # Positional arguments shared by both kernels; the g_idx kernel takes
			        # one extra pointer between these two groups.
			        leading = (input, qweight, output, scales, qzeros)
			        trailing = (
			            bias,
			            residual,
			            rows,
			            out_features,
			            in_features,
			            bits,
			            maxq,
			            gptq_group_size,
			            input.stride(0),
			            input.stride(1),
			            qweight.stride(0),
			            qweight.stride(1),
			            output.stride(0),
			            output.stride(1),
			            scales.stride(0),
			            qzeros.stride(0),
			        )
			        flags = dict(
			            QKV_FUSED=qkv_fused,
			            ADD_BIAS=add_bias,
			            ADD_RESIDUAL=add_residual,
			            ACT_TYPE=act_type,
			        )

			        if g_idx is None:
			            cai_gptq_matmul_248_kernel[grid](*leading, *trailing, **flags)
			        else:
			            cai_gptq_idx_matmul_248_kernel[grid](*leading, g_idx, *trailing, **flags)

			        if not qkv_fused:
			            return output
			        # Expose the stacked projections as a leading dimension of size 3.
			        return output.view(3, rows, out_features)