ColossalAI/colossalai/kernel/cuda_native/csrc/gptq/q4_matrix.cu

// Adapted from turboderp exllama: https://github.com/turboderp/exllama

#include "q4_matrix.cuh"
#include <vector>
#include "util.cuh"
#include "matrix.cuh"

using namespace std;

// Threads per block along x for make_sequential_kernel; each thread handles one 64-bit word (two adjacent uint32 columns)
const int UNSHUF_BLOCKSIZE_X = 64;

// Block shape used when launching reconstruct_kernel
const int RECONS_THREADS_X = 64;      // Block size and thread count along columns in out, each thread converts 1 column
const int RECONS_THREADS_Y = 1;       // Block size and thread count along packed rows of w, each thread converts 8 rows of out

// Matrices registered through g_q4_keep_matrix(); g_q4_free_matrices() deletes every entry and clears the list
vector<Q4Matrix*> g_q4_matrices;

// Appends m to the global matrix list so that a later g_q4_free_matrices() call deletes it.
void g_q4_keep_matrix(Q4Matrix* m)
{
    g_q4_matrices.emplace_back(m);
}

// Deletes every matrix registered with g_q4_keep_matrix() and then empties the list.
void g_q4_free_matrices()
{
    for (size_t i = 0; i < g_q4_matrices.size(); i++)
    {
        delete g_q4_matrices[i];
    }

    g_q4_matrices.clear();
}

// Wraps the buffers of a 4-bit quantized matrix.
//
//   _height, _width  Matrix dimensions; the packed weight buffer is treated as
//                    (_height / 8) x _width uint32 words by make_sequential().
//   _groups          Number of quantization groups; groupsize = _height / _groups
//                    (assumes _groups > 0 -- TODO confirm callers guarantee this).
//   _qweight         Packed weights; passed to kernels, so it must be device memory.
//                    Not owned by this object. Rewritten in place when _g_idx is given.
//   _qzeros, _scales Per-group zero points and scales (device memory, not owned).
//   _g_idx           Optional host array of _height group indices. When non-null the
//                    weight rows are reordered via make_sequential().
//   _device          CUDA device made current by the constructor (it is not restored).
Q4Matrix::Q4Matrix
(
    const int _height,
    const int _width,
    const int _groups,

    uint32_t* _qweight,
    uint32_t* _qzeros,
    half* _scales,
    uint32_t* _g_idx,

    const int _device
) :
    height(_height),
    width(_width),
    groups(_groups),
    device(_device)
{
    cudaSetDevice(device);

    cuda_qweight = _qweight;
    cuda_qzeros = _qzeros;
    cuda_scales = _scales;

    // Only make_sequential() allocates the row map. Start from a defined "no map"
    // state so the destructor can tell whether there is anything to release.
    cuda_x_map = NULL;

    groupsize = height / groups;

    if (_g_idx) make_sequential(_g_idx);
}

// Releases the row map allocated by make_sequential(), if any. The weight, zero and
// scale buffers were supplied by the caller and are left untouched.
// NOTE(review): assumes Q4Matrix objects are never copied (a copy would free the
// map twice) -- confirm against the class declaration.
Q4Matrix::~Q4Matrix()
{
    if (!cuda_x_map) return;

    // Free on the owning device, then restore whatever device the caller had current.
    int previous_device = -1;
    const bool have_previous = (cudaGetDevice(&previous_device) == cudaSuccess);

    cudaSetDevice(device);
    cudaFree(cuda_x_map);
    cuda_x_map = NULL;

    if (have_previous) cudaSetDevice(previous_device);
}

// Make sequential

// Gathers 4-bit rows of w into w_new in the order given by x_map.
//
// Each thread produces one 64-bit output word, i.e. one packed row (8 nibble rows)
// for two adjacent uint32 columns. For output nibble row (8 * blockIdx.y + i) the
// source nibble row is x_map[8 * blockIdx.y + i].
//
// Launch layout as used by Q4Matrix::make_sequential: blockDim.x = UNSHUF_BLOCKSIZE_X,
// grid x covers w_width / 2 column pairs, grid y has one block per packed row.
// w and w_new are accessed as uint64_t, so this presumably requires 8-byte aligned
// buffers and an even w_width -- TODO confirm. w_height is not used.
__global__ void make_sequential_kernel
(
    const uint32_t* __restrict__ w,
    uint32_t* __restrict__ w_new,
    const uint32_t* __restrict__ x_map,
    const int w_height,
    const int w_width
)
{
    // Column pairs per packed row; threads past the last pair have nothing to do
    const int pair_stride = w_width >> 1;
    const int pair_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
    if (pair_column >= pair_stride) return;

    const uint64_t* src_words = (uint64_t*) w;
    uint64_t* dst_words = (uint64_t*) w_new;

    const int dst_row = blockIdx.y;
    const uint32_t* row_map = x_map + (dst_row << 3);

    uint64_t packed = 0;

    #pragma unroll
    for (int slot = 0; slot < 8; slot++)
    {
        const int source_row = row_map[slot];

        // source_row / 8 selects the packed word, source_row % 8 the nibble inside it
        const uint64_t word = src_words[(source_row >> 3) * pair_stride + pair_column];

        // The mask keeps one nibble from each 32-bit half (one per column of the pair)
        const uint64_t nibbles = (word >> ((source_row & 0x07) << 2)) & 0x0000000f0000000f;

        packed |= nibbles << (slot << 2);
    }

    dst_words[dst_row * pair_stride + pair_column] = packed;
}

// Reorders the rows of cuda_qweight in place so that rows sharing a group index
// become contiguous, ordered by group and, within a group, by original row index.
//
//   cpu_g_idx  Host array of `height` group indices, one per row. Every value is
//              assumed to be < groups (not validated here -- TODO confirm callers).
//
// Side effects: allocates cuda_x_map (height uint32 entries, new row -> original row)
// and leaves it populated on the device; overwrites cuda_qweight with the reordered
// rows. Blocks until all device work has finished. CUDA return codes are not checked.
void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
{
    // Widen before multiplying so the byte counts cannot overflow int arithmetic
    const size_t qweight_bytes = (size_t) (height / 8) * (size_t) width * sizeof(uint32_t);
    const size_t x_map_bytes = (size_t) height * sizeof(uint32_t);

    uint32_t* cuda_new_qweight = NULL;
    cudaMalloc(&cuda_new_qweight, qweight_bytes);
    cudaMalloc(&cuda_x_map, x_map_bytes);  // TODO: Should probably be allocated in PyTorch

    // Host scratch buffers; vectors release themselves on every exit path
    vector<uint32_t> cpu_g_idx_map((size_t) groups, 0u);
    vector<uint32_t> cpu_x_map((size_t) height);
    vector<uint32_t> cpu_x_map_inv((size_t) height);

    // Group histogram

    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;

    // Group map: exclusive prefix sum, i.e. the first destination row of each group.
    // Counts stay 32-bit; a 16-bit temporary would truncate groups with > 32767 rows.

    uint32_t acc = 0;
    for (int i = 0; i < groups; i++)
    {
        const uint32_t count = cpu_g_idx_map[i];
        cpu_g_idx_map[i] = acc;
        acc += count;
    }

    // X map (inverse): original row -> destination row

    for (int row = 0; row < height; row++)
    {
        const uint32_t target_group = cpu_g_idx[row];
        cpu_x_map_inv[row] = cpu_g_idx_map[target_group]++;
    }

    // X map: destination row -> original row

    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;

    // Move to CUDA

    cudaMemcpyAsync(cuda_x_map, cpu_x_map.data(), x_map_bytes, cudaMemcpyHostToDevice);

    // Rearrange rows in w; each thread handles two uint32 columns, hence the factor 2

    dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
    dim3 blocks
    (
        (width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2),
        height / 8,
        1
    );

    make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);

    // Replace qweights

    cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, qweight_bytes, cudaMemcpyDeviceToDevice);

    // Cleanup: wait for the copies and the kernel before releasing the staging buffer
    // (the host vectors must also outlive the host-to-device copy)

    cudaDeviceSynchronize();
    cudaFree(cuda_new_qweight);
}

// Dequantizes packed 4-bit weights into half precision.
//
// Each thread reads one uint32 of w (8 nibbles) for its column and writes 8 values
// to out, one per consecutive output row, each computed as
// (nibble - (zero + 1)) * scale using the zero point and scale of the row's group.
//
//   height     Row count handed to the view constructors; Q4Matrix::reconstruct
//              passes the packed row count (height / 8).
//   width      Number of columns; threads with column >= width exit early.
//   groupsize  Rows per quantization group; group index = row / groupsize.
//
// There is no bounds check along rows, so the grid's y extent must match the
// number of packed rows exactly.
__global__ void reconstruct_kernel
(
    const uint32_t* __restrict__ w,
    half* __restrict__ out,  // (y)
    const half* __restrict__ w_scales,
    const uint32_t* __restrict__ w_zeros,
    const int height,
    const int width,
    const int groupsize
)
{
    // Thread coordinates: one output column, one packed row (= 8 output rows)

    const int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
    if (column >= width) return;

    const int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;

    // Views (project helper types over the raw buffers)

    MatrixView_q4_column w_(w, height, width);
    MatrixView_half_rw out_(out, height, width);
    MatrixView_half w_scales_(w_scales, height / groupsize, width);
    MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);

    // Per-group quantization parameters; the stored zero point is offset by one

    const int group = row / groupsize;
    const half scale = w_scales_.item(group, column);
    const uint32_t zero = w_zeros_.item(group, column) + 1;

    const uint32_t packed = w_.item_uint32_t(row, column);
    half* dst = out_.item_ptr(row, column);

    // Unpack the 8 nibbles, lowest bits first, stepping one output row per nibble

    #pragma unroll
    for (int nibble = 0; nibble < 8; nibble++)
    {
        const int q = (int)((packed >> (nibble * 4)) & 0x0f);
        *dst = __hmul(__int2half_rn(q - zero), scale);
        dst += out_.width;
    }
}

// Launches reconstruct_kernel to dequantize this matrix into out.
// The grid covers every column (rounded up to whole blocks) and every packed row of
// 8 weights. The launch is asynchronous and no CUDA error is checked here.
void Q4Matrix::reconstruct(half* out)
{
    const int packed_rows = height / 8;

    dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);

    dim3 blocks;
    blocks.x = (width + threads.x - 1) / threads.x;
    blocks.y = (packed_rows + threads.y - 1) / threads.y;
    blocks.z = 1;

    reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, packed_rows, width, groupsize);
}
[feature] add gptq for inference (#4754) * [gptq] add gptq kernel (#4416) * add gptq * refactor code * fix tests * replace auto-gptq * rname inferance/quant * refactor test * add auto-gptq as an option * reset requirements * change assert and check auto-gptq * add import warnings * change test flash attn version * remove example * change requirements of flash_attn * modify tests * [skip ci] change requirements-test * [gptq] faster gptq cuda kernel (#4494) * [skip ci] add cuda kernels * add license * [skip ci] fix max_input_len * format files & change test size * [skip ci] * [gptq] add gptq tensor parallel (#4538) * add gptq tensor parallel * add gptq tp * delete print * add test gptq check * add test auto gptq check * [gptq] combine gptq and kv cache manager (#4706) * combine gptq and kv cache manager * add init bits * delete useless code * add model path * delete usless print and update test * delete usless import * move option gptq to shard config * change replace linear to shardformer * update bloom policy * delete useless code * fix import bug and delete uselss code * change colossalai/gptq to colossalai/quant/gptq * update import linear for tests * delete useless code and mv gptq_kernel to kernel directory * fix triton kernel * add triton import 2023-09-22 03:02:50 +00:00			`// Adapted from turboderp exllama: https://github.com/turboderp/exllama`

			`#include "q4_matrix.cuh"`
			`#include <vector>`
			`#include "util.cuh"`
			`#include "matrix.cuh"`

			`using namespace std;`

			`const int UNSHUF_BLOCKSIZE_X = 64;`

			`const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column`
			`const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows`

			`vector<Q4Matrix*> g_q4_matrices;`

			`void g_q4_keep_matrix(Q4Matrix* m)`
			`{`
			`g_q4_matrices.push_back(m);`
			`}`

			`void g_q4_free_matrices()`
			`{`
			`for (const auto& m : g_q4_matrices) delete m;`
			`g_q4_matrices.clear();`
			`}`

			`Q4Matrix::Q4Matrix`
			`(`
			`const int _height,`
			`const int _width,`
			`const int _groups,`

			`uint32_t* _qweight,`
			`uint32_t* _qzeros,`
			`half* _scales,`
			`uint32_t* _g_idx,`

			`const int _device`
			`) :`
			`height(_height),`
			`width(_width),`
			`groups(_groups),`
			`device(_device)`
			`{`
			`cudaSetDevice(device);`

			`cuda_qweight = _qweight;`
			`cuda_qzeros = _qzeros;`
			`cuda_scales = _scales;`

			`groupsize = height / groups;`

			`if (_g_idx) make_sequential(_g_idx);`
			`}`

			`Q4Matrix::~Q4Matrix()`
			`{`
			`}`

			`// Make sequential`

			`__global__ void make_sequential_kernel`
			`(`
			`const uint32_t* __restrict__ w,`
			`uint32_t* __restrict__ w_new,`
			`const uint32_t* __restrict__ x_map,`
			`const int w_height,`
			`const int w_width`
			`)`
			`{`
			`const uint64_t* w2 = (uint64_t*) w;`
			`uint64_t* w_new2 = (uint64_t*) w_new;`
			`int w2_stride = w_width >> 1;`

			`int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;`
			`if (w2_column >= w2_stride) return;`

			`int w_new2_row = blockIdx.y;`

			`int x_map_idx = w_new2_row << 3;`

			`uint64_t dst = 0;`

			`#pragma unroll`
			`for (int i = 0; i < 8; i++)`
			`{`
			`int source_row = x_map[x_map_idx++];`

			`int w2_row = source_row >> 3;`
			`int w2_subrow = source_row & 0x07;`
			`int w2_row_shift = w2_subrow << 2;`
			`int wnew2_row_shift = i << 2;`

			`uint64_t src = w2[w2_row * w2_stride + w2_column];`
			`src >>= w2_row_shift;`
			`src &= 0x0000000f0000000f;`
			`src <<= wnew2_row_shift;`
			`dst |= src;`
			`}`

			`w_new2[w_new2_row * w2_stride + w2_column] = dst;`
			`}`

			`void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)`
			`{`
			`uint32_t* cuda_new_qweight = NULL;`
			`cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));`
			`cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: Should probably be allocated in PyTorch`

			`uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));`
			`uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));`
			`uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));`

			`// Group histogram`

			`for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;`

			`// Group map`

			`for (int i = 0, acc = 0; i < groups; i++)`
			`{`
			`short tmp = cpu_g_idx_map[i];`
			`cpu_g_idx_map[i] = acc;`
			`acc += tmp;`
			`}`

			`// X map (inverse)`

			`for (int row = 0; row < height; row++)`
			`{`
			`uint32_t target_group = cpu_g_idx[row];`
			`uint32_t target_row = cpu_g_idx_map[target_group];`
			`cpu_g_idx_map[target_group]++;`
			`cpu_x_map_inv[row] = target_row;`
			`}`

			`// X map`

			`for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;`

			`// Move to CUDA`

			`cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);`

			`// Rearrange rows in w`

			`dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);`
			`dim3 blocks`
			`(`
			`(width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2),`
			`height / 8,`
			`1`
			`);`

			`make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);`

			`// Replace qweights`

			`cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);`

			`// Cleanup`

			`cudaDeviceSynchronize();`
			`cudaFree(cuda_new_qweight);`
			`free(cpu_g_idx_map);`
			`free(cpu_x_map);`
			`free(cpu_x_map_inv);`
			`}`

			`__global__ void reconstruct_kernel`
			`(`
			`const uint32_t* __restrict__ w,`
			`half* __restrict__ out, // (y)`
			`const half* __restrict__ w_scales,`
			`const uint32_t* __restrict__ w_zeros,`
			`const int height,`
			`const int width,`
			`const int groupsize`
			`)`
			`{`
			`// Start of block`

			`int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;`
			`int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;`
			`if (column >= width) return;`

			`// Views`

			`MatrixView_q4_column w_(w, height, width);`
			`MatrixView_half_rw out_(out, height, width);`
			`MatrixView_half w_scales_(w_scales, height / groupsize, width);`
			`MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);`

			`// Groupsize version`

			`int group = row / groupsize;`

			`half w_scale = w_scales_.item(group, column);`
			`uint32_t w_zero = w_zeros_.item(group, column) + 1;`

			`uint32_t w_read = w_.item_uint32_t(row, column);`
			`half* out_ptr = out_.item_ptr(row, column);`

			`#pragma unroll`
			`for (int s = 0; s < 32; s += 4)`
			`{`
			`half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);`
			`*out_ptr = w_item; out_ptr += out_.width;`
			`}`
			`}`

			`void Q4Matrix::reconstruct(half* out)`
			`{`
			`dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);`

			`dim3 blocks`
			`(`
			`(width + threads.x - 1) / threads.x,`
			`(height / 8 + threads.y - 1) / threads.y,`
			`1`
			`);`

			`reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);`
			`}`