ColossalAI/extensions/pybind/inference/inference.cpp

#include <torch/extension.h>

// Copies the GPU memory of the KV cache during the decode stage (wording taken
// from the docstring this function is registered with in the pybind module).
//
// Declaration only: no definition is visible in this section, so the symbol
// has to be supplied by another translation unit at link time. All tensors are
// taken by non-const reference and nothing is returned. The bracketed shapes
// on each parameter are the author's annotations; this declaration does not
// enforce them.
// NOTE(review): the annotations mix `num_heads`/`head_size` with
// `head_num`/`head_dim`; presumably these name the same dimensions - confirm.
void decode_kv_cache_memcpy(
    torch::Tensor& key,    // [num_tokens, num_heads, head_size]
    torch::Tensor& value,  // [num_tokens, num_heads, head_size]
    torch::Tensor&
        key_cache,  // [num_blocks, head_num, head_dim/x, block_size, x]
    torch::Tensor&
        value_cache,  // [num_blocks, num_heads, block_size, head_size]
    torch::Tensor& sequence_lengths,  // [batch_size]
    torch::Tensor& block_tables);     // [batch_size, max_seq_len]

// Copies the GPU memory of the KV cache during the context stage (wording
// taken from the docstring this function is registered with in the pybind
// module).
//
// Declaration only: no definition is visible in this section. All tensors are
// taken by non-const reference and nothing is returned. The bracketed shapes
// are the author's annotations and are not enforced here.
// NOTE(review): this prototype spells the tensor type `at::Tensor` while most
// other prototypes in this file spell it `torch::Tensor`; presumably the two
// names refer to the same type - confirm before unifying the spelling.
// NOTE(review): `cu_seqlens` is annotated [batch_size + 1]; presumably
// cumulative sequence lengths - confirm against the implementation.
void context_kv_cache_memcpy(
    at::Tensor& key,        // [num_tokens, head_num, head_dim]
    at::Tensor& value,      // [num_tokens, head_num, head_dim]
    at::Tensor& key_cache,  // [num_blocks, head_num, head_dim/x, block_size, x]
    at::Tensor& value_cache,  // [num_blocks, head_num, block_size, head_dim]
    at::Tensor& sequence_lengths,  // [batch_size]
    at::Tensor& cu_seqlens,        // [batch_size + 1]
    at::Tensor& block_tables,      // [batch_size, max_seq_len]
    int max_seq_len_in_batch);

// Performs the rotary-embedding calculations (wording taken from the
// docstring this function is registered with in the pybind module).
//
// Declaration only: no definition is visible in this section. All tensors are
// taken by non-const reference and nothing is returned, so any result must be
// delivered through the arguments; which tensors are written is not visible
// from here. The bracketed shapes are the author's annotations.
// NOTE(review): `high_precision` presumably selects higher-precision
// arithmetic inside the kernel - confirm against the implementation.
void rotary_embedding(
    torch::Tensor& query,  // [total_tokens, head_num, head_dim]
    torch::Tensor& key,    // [total_tokens, kv_head_num, head_dim]
    torch::Tensor& cos,    // [total_tokens, head_dim]
    torch::Tensor& sin,    // [total_tokens, head_dim]
    bool high_precision);

// Performs the rotary-embedding calculations together with the KV-cache
// memcopy (wording taken from the docstring this function is registered with
// in the pybind module).
//
// Declaration only: no definition is visible in this section. The parameter
// list combines the query/key/cos/sin/high_precision arguments of
// `rotary_embedding` with the value/cache/sequence_lengths/block_tables
// arguments of `decode_kv_cache_memcpy`. All tensors are taken by non-const
// reference and nothing is returned. The bracketed shapes are the author's
// annotations and are not enforced here.
// NOTE(review): `high_precision` presumably selects higher-precision
// arithmetic inside the kernel - confirm against the implementation.
void rotary_embedding_and_cache_copy(
    torch::Tensor& query,  // [num_tokens, head_num, head_dim]
    torch::Tensor& key,    // [num_tokens, kv_head_num, head_dim]
    torch::Tensor& value,  // [num_tokens, num_heads, head_dim]
    torch::Tensor& cos,    // [num_tokens, head_dim]
    torch::Tensor& sin,    // [num_tokens, head_dim]
    torch::Tensor&
        key_cache,  // [num_blocks, head_num, head_dim/x, block_size, x]
    torch::Tensor&
        value_cache,  // [num_blocks, num_heads, block_size, head_dim]
    torch::Tensor& sequence_lengths,  // [batch_size]
    torch::Tensor& block_tables,      // [batch_size, max_seq_len]
    bool high_precision);

// SiLU followed by a multiply (wording taken from the docstring this function
// is registered with in the pybind module).
//
// Declaration only: no definition is visible in this section. The input is
// taken by const reference and the result is returned as a tensor by value.
// NOTE(review): the expected layout of `ins` (how the two multiplicands are
// packed) is not visible from here - confirm against the implementation.
torch::Tensor silu_and_mul(const torch::Tensor& ins);

// Applies Root Mean Square (RMS) normalization to the input tensor (wording
// taken from the docstring this function is registered with in the pybind
// module).
//
// Declaration only: no definition is visible in this section. Nothing is
// returned; presumably the result is written into `out` - confirm against the
// implementation. The bracketed shapes are the author's annotations.
// NOTE(review): `epsilon` is presumably the usual numerical-stability term
// added before the square root - confirm.
void rms_layernorm(torch::Tensor& out,     // [..., hidden_size]
                   torch::Tensor& input,   // [..., hidden_size]
                   torch::Tensor& weight,  // [hidden_size]
                   float epsilon);

// In-place fused add and RMS normalization (wording taken from the docstring
// this function is registered with in the pybind module).
//
// Declaration only: no definition is visible in this section. There is no
// separate output parameter and nothing is returned, which matches the
// "in-place" description; exactly which of `input` / `residual` is overwritten
// is not visible from here - confirm against the implementation. The bracketed
// shapes are the author's annotations.
void fused_add_rms_layernorm(torch::Tensor& input,     // [..., hidden_size]
                             torch::Tensor& residual,  // [..., hidden_size]
                             torch::Tensor& weight,    // [hidden_size]
                             float epsilon);

// Gets cos and sin from the cache (wording taken from the docstring this
// function is registered with in the pybind module).
//
// Declaration only: no definition is visible in this section. Nothing is
// returned; presumably `cos` / `sin` are filled from `cos_cache` /
// `sin_cache` - confirm against the implementation. The bracketed shapes are
// the author's annotations and are not enforced here.
// NOTE(review): this prototype spells the tensor type `at::Tensor` while most
// other prototypes in this file spell it `torch::Tensor`; presumably the two
// names refer to the same type - confirm before unifying the spelling.
// NOTE(review): `is_prompts` presumably distinguishes the prompt (context)
// stage from the decode stage - confirm.
void get_cos_and_sin(at::Tensor& cos_cache,  // [max_rotary_position, head_dim]
                     at::Tensor& sin_cache,  // [max_rotary_position, head_dim]
                     at::Tensor& cos,        // [num_tokens, head_dim]
                     at::Tensor& sin,        // [num_tokens, head_dim]
                     at::Tensor& sequence_lengths,  // [batch_size]
                     int max_seq_len_in_batch, bool is_prompts);

// Computes the attention between an input query and the cached keys/values
// using PagedAttention (wording taken from the docstring this function is
// registered with in the pybind module).
//
// Declaration only: no definition is visible in this section. Nothing is
// returned; presumably the result is written into `out`, with `tmp_out` and
// `tmp_out_lse` acting as caller-provided scratch buffers - confirm against
// the implementation. `alibi_slopes` is optional and may be empty. The
// bracketed shapes are the author's annotations and are not enforced here.
// NOTE(review): `scale` is presumably the softmax scaling factor - confirm.
void flash_decoding_attention(
    torch::Tensor& out,    // [num_tokens, num_heads, head_size]
    torch::Tensor& query,  // [num_tokens, num_heads, head_size]
    torch::Tensor&
        key_cache,  // [num_blocks, num_kv_heads, head_size/x, block_size, x]
    torch::Tensor&
        value_cache,  // [num_blocks, num_kv_heads, block_size, head_size]
    torch::Tensor& context_lens,  // [num_tokens]
    torch::Tensor& block_tables,  // [num_tokens, max_num_blocks_per_seq]
    int block_size, int max_context_len,
    torch::Tensor&
        tmp_out,  // [num_tokens, num_heads, max_num_partitions, head_size]
    torch::Tensor& tmp_out_lse,  // [num_tokens, num_heads, max_num_partitions]
    const c10::optional<torch::Tensor>& alibi_slopes, float scale);

// Converts the input to an fp8 output, or an fp8 input to the output (wording
// taken from the docstring this function is registered with in the pybind
// module).
//
// Declaration only: no definition is visible in this section. Nothing is
// returned; the result is delivered through `output`.
// NOTE(review): the conversion direction is presumably chosen from the dtypes
// of `input` and `output` - confirm against the implementation.
void convert_fp8(torch::Tensor& input, torch::Tensor& output);

// Python bindings for the inference kernels. The module name is supplied by
// the TORCH_EXTENSION_NAME macro; every entry below exposes one C++ function
// under an identical Python name together with its docstring. Registration
// order is kept as-is.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  // ---- KV-cache copy kernels ----
  m.def("decode_kv_cache_memcpy",
        &decode_kv_cache_memcpy,
        "Copy the GPU memory of kvcache during the decode stage.");
  m.def("context_kv_cache_memcpy",
        &context_kv_cache_memcpy,
        "Copy the GPU memory of kvcache during the context stage.");

  // ---- Rotary embedding (fused with the cache copy, then standalone) ----
  m.def("rotary_embedding_and_cache_copy",
        &rotary_embedding_and_cache_copy,
        "Performing Rotary Embedding-related calculations and KVCache Memcopy.");
  m.def("rotary_embedding",
        &rotary_embedding,
        "Performing Rotary Embedding-related calculations.");

  // ---- Activation and normalization ----
  m.def("silu_and_mul",
        &silu_and_mul,
        "Silu with a following multiply");
  m.def("rms_layernorm",
        &rms_layernorm,
        "Apply Root Mean Square (RMS) Normalization to the input tensor.");
  m.def("fused_add_rms_layernorm",
        &fused_add_rms_layernorm,
        "In-place fused Add and RMS Normalization.");

  // ---- Rotary cos/sin lookup ----
  m.def("get_cos_and_sin",
        &get_cos_and_sin,
        "Get cos and sin from the cache.");

  // ---- Attention ----
  // The docstring is written as two adjacent literals only to keep the line
  // short; the compiler concatenates them into a single string.
  m.def("flash_decoding_attention",
        &flash_decoding_attention,
        "Compute the attention between an input query and "
        "the cached keys/values using PagedAttention.");

  // ---- FP8 conversion ----
  m.def("convert_fp8",
        &convert_fp8,
        "Convert input to fp8 output or convert fp8 input to output.");
}
[Inference]Add CUDA KVCache Kernel (#5406) * add cuda KVCache kernel * annotation benchmark_kvcache_copy * add use cuda * fix import path * move benchmark scripts to example/ * rm benchmark codes in test_kv_cache_memcpy.py * rm redundancy codes * rm redundancy codes * pr was modified according to the review 9 months ago			`#include <torch/extension.h>`

			`void decode_kv_cache_memcpy(`
[Inference/Kernel] refactor kvcache manager and rotary_embedding and kvcache_memcpy oper… (#5663) * refactor kvcache manager and rotary_embedding and kvcache_memcpy operator * refactor decode_kv_cache_memcpy * enable alibi in pagedattention 7 months ago			`torch::Tensor& key, // [num_tokens, num_heads, head_size]`
			`torch::Tensor& value, // [num_tokens, num_heads, head_size]`
			`torch::Tensor&`
			`key_cache, // [num_blocks, head_num, head_dim/x, block_size, x]`
[Inference]Add CUDA KVCache Kernel (#5406) * add cuda KVCache kernel * annotation benchmark_kvcache_copy * add use cuda * fix import path * move benchmark scripts to example/ * rm benchmark codes in test_kv_cache_memcpy.py * rm redundancy codes * rm redundancy codes * pr was modified according to the review 9 months ago			`torch::Tensor&`
			`value_cache, // [num_blocks, num_heads, block_size, head_size]`
			`torch::Tensor& sequence_lengths, // [batch_size]`
			`torch::Tensor& block_tables); // [batch_size, max_seq_len]`

[Inference]Support FP16/BF16 Flash Attention 2 And Add high_precision Flag To Rotary Embedding (#5461) * Support FP16/BF16 Flash Attention 2 * fix bugs in test_kv_cache_memcpy.py * add context_kv_cache_memcpy_kernel.cu * rm typename MT * add tail process * add high_precision * add high_precision to config.py * rm unused code * change the comment for the high_precision parameter * update test_rotary_embdding_unpad.py * fix vector_copy_utils.h * add comment for self.high_precision when using float32 8 months ago			`void context_kv_cache_memcpy(`
[Inference/Kernel] refactor kvcache manager and rotary_embedding and kvcache_memcpy oper… (#5663) * refactor kvcache manager and rotary_embedding and kvcache_memcpy operator * refactor decode_kv_cache_memcpy * enable alibi in pagedattention 7 months ago			`at::Tensor& key, // [num_tokens, head_num, head_dim]`
			`at::Tensor& value, // [num_tokens, head_num, head_dim]`
			`at::Tensor& key_cache, // [num_blocks, head_num, head_dim/x, block_size, x]`
[Inference]Support FP16/BF16 Flash Attention 2 And Add high_precision Flag To Rotary Embedding (#5461) * Support FP16/BF16 Flash Attention 2 * fix bugs in test_kv_cache_memcpy.py * add context_kv_cache_memcpy_kernel.cu * rm typename MT * add tail process * add high_precision * add high_precision to config.py * rm unused code * change the comment for the high_precision parameter * update test_rotary_embdding_unpad.py * fix vector_copy_utils.h * add comment for self.high_precision when using float32 8 months ago			`at::Tensor& value_cache, // [num_blocks, head_num, block_size, head_dim]`
			`at::Tensor& sequence_lengths, // [batch_size]`
			`at::Tensor& cu_seqlens, // [batch_size + 1]`
			`at::Tensor& block_tables, // [batch_size, max_seq_len]`
			`int max_seq_len_in_batch);`

[Inference/kernel]Add Fused Rotary Embedding and KVCache Memcopy CUDA Kernel (#5418) * add rotary embedding kernel * add rotary_embedding_kernel * add fused rotary_emb and kvcache memcopy * add fused_rotary_emb_and_cache_kernel.cu * add fused_rotary_emb_and_memcopy * fix bugs in fused_rotary_emb_and_cache_kernel.cu * fix ci bugs * use vec memcopy and opt the gloabl memory access * fix code style * fix test_rotary_embdding_unpad.py * codes revised based on the review comments * fix bugs about include path * rm inline 9 months ago			`void rotary_embedding(`
			`torch::Tensor& query, // [total_tokens, head_num, head_dim]`
			`torch::Tensor& key, // [total_tokens, kv_head_num, head_dim]`
			`torch::Tensor& cos, // [total_tokens, head_dim]`
[Inference]Support FP16/BF16 Flash Attention 2 And Add high_precision Flag To Rotary Embedding (#5461) * Support FP16/BF16 Flash Attention 2 * fix bugs in test_kv_cache_memcpy.py * add context_kv_cache_memcpy_kernel.cu * rm typename MT * add tail process * add high_precision * add high_precision to config.py * rm unused code * change the comment for the high_precision parameter * update test_rotary_embdding_unpad.py * fix vector_copy_utils.h * add comment for self.high_precision when using float32 8 months ago			`torch::Tensor& sin, // [total_tokens, head_dim]`
			`bool high_precision);`
[Inference/kernel]Add Fused Rotary Embedding and KVCache Memcopy CUDA Kernel (#5418) * add rotary embedding kernel * add rotary_embedding_kernel * add fused rotary_emb and kvcache memcopy * add fused_rotary_emb_and_cache_kernel.cu * add fused_rotary_emb_and_memcopy * fix bugs in fused_rotary_emb_and_cache_kernel.cu * fix ci bugs * use vec memcopy and opt the gloabl memory access * fix code style * fix test_rotary_embdding_unpad.py * codes revised based on the review comments * fix bugs about include path * rm inline 9 months ago
			`void rotary_embedding_and_cache_copy(`
[Inference/Kernel] refactor kvcache manager and rotary_embedding and kvcache_memcpy oper… (#5663) * refactor kvcache manager and rotary_embedding and kvcache_memcpy operator * refactor decode_kv_cache_memcpy * enable alibi in pagedattention 7 months ago			`torch::Tensor& query, // [num_tokens, head_num, head_dim]`
			`torch::Tensor& key, // [num_tokens, kv_head_num, head_dim]`
			`torch::Tensor& value, // [num_tokens, num_heads, head_dim]`
			`torch::Tensor& cos, // [num_tokens, head_dim]`
			`torch::Tensor& sin, // [num_tokens, head_dim]`
			`torch::Tensor&`
			`key_cache, // [num_blocks, head_num, head_dim/x, block_size, x]`
[Inference/kernel]Add Fused Rotary Embedding and KVCache Memcopy CUDA Kernel (#5418) * add rotary embedding kernel * add rotary_embedding_kernel * add fused rotary_emb and kvcache memcopy * add fused_rotary_emb_and_cache_kernel.cu * add fused_rotary_emb_and_memcopy * fix bugs in fused_rotary_emb_and_cache_kernel.cu * fix ci bugs * use vec memcopy and opt the gloabl memory access * fix code style * fix test_rotary_embdding_unpad.py * codes revised based on the review comments * fix bugs about include path * rm inline 9 months ago			`torch::Tensor&`
			`value_cache, // [num_blocks, num_heads, block_size, head_dim]`
			`torch::Tensor& sequence_lengths, // [batch_size]`
[Inference]Support FP16/BF16 Flash Attention 2 And Add high_precision Flag To Rotary Embedding (#5461) * Support FP16/BF16 Flash Attention 2 * fix bugs in test_kv_cache_memcpy.py * add context_kv_cache_memcpy_kernel.cu * rm typename MT * add tail process * add high_precision * add high_precision to config.py * rm unused code * change the comment for the high_precision parameter * update test_rotary_embdding_unpad.py * fix vector_copy_utils.h * add comment for self.high_precision when using float32 8 months ago			`torch::Tensor& block_tables, // [batch_size, max_seq_len]`
			`bool high_precision);`

add silu_and_mul for infer 9 months ago			`torch::Tensor silu_and_mul(const torch::Tensor& ins);`

feat rmsnorm cuda kernel and add unittest, benchmark script (#5417) 9 months ago			`void rms_layernorm(torch::Tensor& out, // [..., hidden_size]`
			`torch::Tensor& input, // [..., hidden_size]`
			`torch::Tensor& weight, // [hidden_size]`
			`float epsilon);`

			`void fused_add_rms_layernorm(torch::Tensor& input, // [..., hidden_size]`
			`torch::Tensor& residual, // [..., hidden_size]`
			`torch::Tensor& weight, // [hidden_size]`
			`float epsilon);`

[Inference/Kernel]Add get_cos_and_sin Kernel (#5528) * Add get_cos_and_sin kernel * fix code comments * fix code typos * merge common codes of get_cos_and_sin kernel. * Fixed a typo * Changed 'asset allclose' to 'assert equal'. 8 months ago			`void get_cos_and_sin(at::Tensor& cos_cache, // [max_rotary_position, head_dim]`
			`at::Tensor& sin_cache, // [max_rotary_position, head_dim]`
			`at::Tensor& cos, // [num_tokens, head_dim]`
			`at::Tensor& sin, // [num_tokens, head_dim]`
			`at::Tensor& sequence_lengths, // [batch_size]`
			`int max_seq_len_in_batch, bool is_prompts);`

[Inference/Kernel] Add Paged Decoding kernel, sequence split within the same thread block (#5531) * feat flash decoding for paged attention * refactor flashdecodingattention * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 7 months ago			`void flash_decoding_attention(`
			`torch::Tensor& out, // [num_tokens, num_heads, head_size]`
			`torch::Tensor& query, // [num_tokens, num_heads, head_size]`
			`torch::Tensor&`
[Inference/Kernel] Optimize paged attention: Refactor key cache layout (#5643) * optimize flashdecodingattention: refactor code with different key cache layout(from [num_blocks, num_kv_heads, block_size, head_size] to [num_blocks, num_kv_heads, head_size/x, block_size, x]) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 7 months ago			`key_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]`
[Inference/Kernel] Add Paged Decoding kernel, sequence split within the same thread block (#5531) * feat flash decoding for paged attention * refactor flashdecodingattention * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 7 months ago			`torch::Tensor&`
			`value_cache, // [num_blocks, num_kv_heads, block_size, head_size]`
			`torch::Tensor& context_lens, // [num_tokens]`
			`torch::Tensor& block_tables, // [num_tokens, max_num_blocks_per_seq]`
			`int block_size, int max_context_len,`
			`torch::Tensor&`
			`tmp_out, // [num_tokens, num_heads, max_num_partitions, head_size]`
			`torch::Tensor& tmp_out_lse, // [num_tokens, num_heads, max_num_partitions]`
[Inference/Kernel] refactor kvcache manager and rotary_embedding and kvcache_memcpy oper… (#5663) * refactor kvcache manager and rotary_embedding and kvcache_memcpy operator * refactor decode_kv_cache_memcpy * enable alibi in pagedattention 7 months ago			`const c10::optional<torch::Tensor>& alibi_slopes, float scale);`
[Inference/Kernel] Add Paged Decoding kernel, sequence split within the same thread block (#5531) * feat flash decoding for paged attention * refactor flashdecodingattention * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 7 months ago
[Inference/Feat] Add convert_fp8 op for fp8 test in the future (#5706) * add convert_fp8 op for fp8 test in the future * rerun ci 7 months ago			`void convert_fp8(torch::Tensor& input, torch::Tensor& output);`

[Inference]Add CUDA KVCache Kernel (#5406) * add cuda KVCache kernel * annotation benchmark_kvcache_copy * add use cuda * fix import path * move benchmark scripts to example/ * rm benchmark codes in test_kv_cache_memcpy.py * rm redundancy codes * rm redundancy codes * pr was modified according to the review 9 months ago			`PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {`
			`m.def("decode_kv_cache_memcpy", &decode_kv_cache_memcpy,`
			`"Copy the GPU memory of kvcache during the decode stage.");`
feat rmsnorm cuda kernel and add unittest, benchmark script (#5417) 9 months ago
[Inference]Support FP16/BF16 Flash Attention 2 And Add high_precision Flag To Rotary Embedding (#5461) * Support FP16/BF16 Flash Attention 2 * fix bugs in test_kv_cache_memcpy.py * add context_kv_cache_memcpy_kernel.cu * rm typename MT * add tail process * add high_precision * add high_precision to config.py * rm unused code * change the comment for the high_precision parameter * update test_rotary_embdding_unpad.py * fix vector_copy_utils.h * add comment for self.high_precision when using float32 8 months ago			`m.def("context_kv_cache_memcpy", &context_kv_cache_memcpy,`
			`"Copy the GPU memory of kvcache during the context stage.");`

[Inference/kernel]Add Fused Rotary Embedding and KVCache Memcopy CUDA Kernel (#5418) * add rotary embedding kernel * add rotary_embedding_kernel * add fused rotary_emb and kvcache memcopy * add fused_rotary_emb_and_cache_kernel.cu * add fused_rotary_emb_and_memcopy * fix bugs in fused_rotary_emb_and_cache_kernel.cu * fix ci bugs * use vec memcopy and opt the gloabl memory access * fix code style * fix test_rotary_embdding_unpad.py * codes revised based on the review comments * fix bugs about include path * rm inline 9 months ago			`m.def(`
			`"rotary_embedding_and_cache_copy", &rotary_embedding_and_cache_copy,`
[Inference/Kernel]Add get_cos_and_sin Kernel (#5528) * Add get_cos_and_sin kernel * fix code comments * fix code typos * merge common codes of get_cos_and_sin kernel. * Fixed a typo * Changed 'asset allclose' to 'assert equal'. 8 months ago			`"Performing Rotary Embedding-related calculations and KVCache Memcopy.");`
[Inference/kernel]Add Fused Rotary Embedding and KVCache Memcopy CUDA Kernel (#5418) * add rotary embedding kernel * add rotary_embedding_kernel * add fused rotary_emb and kvcache memcopy * add fused_rotary_emb_and_cache_kernel.cu * add fused_rotary_emb_and_memcopy * fix bugs in fused_rotary_emb_and_cache_kernel.cu * fix ci bugs * use vec memcopy and opt the gloabl memory access * fix code style * fix test_rotary_embdding_unpad.py * codes revised based on the review comments * fix bugs about include path * rm inline 9 months ago
			`m.def("rotary_embedding", &rotary_embedding,`
[Inference/Kernel]Add get_cos_and_sin Kernel (#5528) * Add get_cos_and_sin kernel * fix code comments * fix code typos * merge common codes of get_cos_and_sin kernel. * Fixed a typo * Changed 'asset allclose' to 'assert equal'. 8 months ago			`"Performing Rotary Embedding-related calculations.");`
[Inference/kernel]Add Fused Rotary Embedding and KVCache Memcopy CUDA Kernel (#5418) * add rotary embedding kernel * add rotary_embedding_kernel * add fused rotary_emb and kvcache memcopy * add fused_rotary_emb_and_cache_kernel.cu * add fused_rotary_emb_and_memcopy * fix bugs in fused_rotary_emb_and_cache_kernel.cu * fix ci bugs * use vec memcopy and opt the gloabl memory access * fix code style * fix test_rotary_embdding_unpad.py * codes revised based on the review comments * fix bugs about include path * rm inline 9 months ago
add silu_and_mul for infer 9 months ago			`m.def("silu_and_mul", &silu_and_mul, "Silu with a following multiply");`
feat rmsnorm cuda kernel and add unittest, benchmark script (#5417) 9 months ago
			`m.def("rms_layernorm", &rms_layernorm,`
			`"Apply Root Mean Square (RMS) Normalization to the input tensor.");`

			`m.def("fused_add_rms_layernorm", &fused_add_rms_layernorm,`
			`"In-place fused Add and RMS Normalization.");`
[Inference/Kernel]Add get_cos_and_sin Kernel (#5528) * Add get_cos_and_sin kernel * fix code comments * fix code typos * merge common codes of get_cos_and_sin kernel. * Fixed a typo * Changed 'asset allclose' to 'assert equal'. 8 months ago
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 8 months ago			`m.def("get_cos_and_sin", &get_cos_and_sin, "Get cos and sin from the cache.");`
[Inference/Kernel] Add Paged Decoding kernel, sequence split within the same thread block (#5531) * feat flash decoding for paged attention * refactor flashdecodingattention * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 7 months ago
			`m.def("flash_decoding_attention", &flash_decoding_attention,`
			`"Compute the attention between an input query and the cached "`
			`"keys/values using PagedAttention.");`
[Inference/Feat] Add convert_fp8 op for fp8 test in the future (#5706) * add convert_fp8 op for fp8 test in the future * rerun ci 7 months ago
			`m.def("convert_fp8", &convert_fp8,`
			`"Convert input to fp8 output or convert fp8 input to output.");`
[Inference]Add CUDA KVCache Kernel (#5406) * add cuda KVCache kernel * annotation benchmark_kvcache_copy * add use cuda * fix import path * move benchmark scripts to example/ * rm benchmark codes in test_kv_cache_memcpy.py * rm redundancy codes * rm redundancy codes * pr was modified according to the review 9 months ago			`}`