mirror of https://github.com/hpcaitech/ColossalAI
70 lines
2.0 KiB
Python
70 lines
2.0 KiB
Python
|
import torch
|
||
|
|
||
|
# Triton is an optional dependency. Record whether it could be imported so the
# kernel definitions further down can be guarded on HAS_TRITON.
try:
    import triton
    import triton.language as tl
except ImportError:
    HAS_TRITON = False
    print("please install triton from https://github.com/openai/triton")
else:
    HAS_TRITON = True
|
||
|
|
||
|
if HAS_TRITON:

    @triton.jit
    def _fwd_copy_kv_cache_dest(
        kv_cache_ptr,
        dest_index_ptr,
        out,
        stride_k_bs,
        stride_k_h,
        stride_k_d,
        stride_o_bs,
        stride_o_h,
        stride_o_d,
        head_num,
        BLOCK_DMODEL: tl.constexpr,
        BLOCK_HEAD: tl.constexpr,
    ):
        """Copy one source row into the destination row named by ``dest_index_ptr``.

        Program instance ``i`` (axis 0) loads ``dest_index_ptr[i]``, reads a
        ``(BLOCK_HEAD, BLOCK_DMODEL)`` tile from row ``i`` of ``kv_cache_ptr``
        and stores it into row ``dest_index_ptr[i]`` of ``out``.

        Args:
            kv_cache_ptr: base pointer of the source data.
            dest_index_ptr: base pointer of the destination row indices.
            out: base pointer of the destination buffer.
            stride_k_bs, stride_k_h, stride_k_d: element strides of the source
                along the row, head and head-dim axes.
            stride_o_bs, stride_o_h, stride_o_d: element strides of the
                destination along the same three axes.
            head_num: number of valid heads; rows of the tile at or beyond this
                index are masked out.
            BLOCK_DMODEL: compile-time tile width along the head-dim axis.
            BLOCK_HEAD: compile-time tile height along the head axis.
        """
        cur_index = tl.program_id(0)
        offs_h = tl.arange(0, BLOCK_HEAD)
        offs_d = tl.arange(0, BLOCK_DMODEL)

        # Widen the destination row index before it is multiplied by the row
        # stride: with 32-bit indices the product wraps around once the
        # destination buffer holds more than 2**31 elements, which would send
        # the store to the wrong address.
        dest_index = tl.load(dest_index_ptr + cur_index).to(tl.int64)

        # Source tile: row `cur_index`, all heads x all head-dim columns.
        cache_offsets = stride_k_h * offs_h[:, None] + stride_k_d * offs_d[None, :]
        k_ptrs = kv_cache_ptr + cur_index * stride_k_bs + cache_offsets

        # Destination tile: row `dest_index`, same head/head-dim layout.
        o_offsets = stride_o_h * offs_h[:, None] + stride_o_d * offs_d[None, :]
        o_ptrs = out + dest_index * stride_o_bs + o_offsets

        # BLOCK_HEAD may exceed head_num (the launcher rounds it up to a power
        # of two), so the padding head rows are neither read nor written.
        head_mask = offs_h[:, None] < head_num
        k = tl.load(k_ptrs, mask=head_mask, other=0.0)
        tl.store(o_ptrs, k, mask=head_mask)

    @torch.no_grad()
    def copy_kv_cache_to_dest(k_ptr, dest_index_ptr, out):
        """Scatter the rows of ``k_ptr`` into ``out`` at the given row indices.

        Launches one kernel program per entry of ``dest_index_ptr``; program
        ``i`` copies ``k_ptr[i]`` into ``out[dest_index_ptr[i]]``. ``out`` is
        modified in place and nothing is returned.

        Args:
            k_ptr: source tensor, indexed here as 3-D; ``shape[1]`` is used as
                the head count and ``shape[2]`` as the head dimension.
            dest_index_ptr: destination row indices; ``shape[0]`` determines
                how many rows are copied.
            out: destination tensor, indexed as 3-D; dims 1 and 2 must match
                those of ``k_ptr``.

        Raises:
            AssertionError: if ``k_ptr`` and ``out`` disagree on dim 1 or dim 2.
        """
        seq_len = dest_index_ptr.shape[0]
        head_num = k_ptr.shape[1]
        head_dim = k_ptr.shape[2]
        assert head_num == out.shape[1], "head_num should be the same for k_ptr and out"
        assert head_dim == out.shape[2], "head_dim should be the same for k_ptr and out"

        # NOTE(review): head_dim is passed straight through as the tl.arange
        # extent, which Triton documents as needing to be a power of two --
        # presumably callers only use power-of-two head dims; confirm.
        # NOTE(review): nothing here checks that k_ptr has at least seq_len
        # rows; the kernel reads row i for every i in range(seq_len).
        num_warps = 2

        _fwd_copy_kv_cache_dest[(seq_len,)](
            k_ptr,
            dest_index_ptr,
            out,
            k_ptr.stride(0),
            k_ptr.stride(1),
            k_ptr.stride(2),
            out.stride(0),
            out.stride(1),
            out.stride(2),
            head_num,
            BLOCK_DMODEL=head_dim,
            # tl.arange needs a power-of-two extent; the kernel masks the
            # padding rows using head_num.
            BLOCK_HEAD=triton.next_power_of_2(head_num),
            num_warps=num_warps,
            num_stages=2,
        )
|
||
|
|
||
|
|