|
|
from typing import Any
|
|
|
|
|
|
# Public API of this module: a star-import exposes only `CacheBlock`.
__all__ = ["CacheBlock"]
|
|
|
|
|
|
|
|
|
class CacheBlock:
    """A simplified version of logical cache block used for Paged Attention.

    A block records how many of its token slots are occupied (``allocated_size``)
    out of its capacity (``block_size``) and how many references to it are
    currently held (``ref_count``).

    Args:
        block_id: Unique id of the cache block.
        block_size: Capacity of the block, i.e. the number of tokens it can hold.
        elem_size: Element size in bytes.
        k_ptrs: Optional handle used to track the physical key cache from the block itself.
        v_ptrs: Optional handle used to track the physical value cache from the block itself.
    """

    def __init__(self, block_id: int, block_size: int, elem_size: int, k_ptrs: Any = None, v_ptrs: Any = None):
        # Unique id of a cache block
        self.block_id = block_id

        # size/capacity of the block in terms of the number of tokens it can hold
        self.block_size = block_size

        # element size in bytes
        self.elem_size = elem_size

        # For common cases, the relationships between logical and physical caches are tracked
        # in the KV Cache Manager. Additionally, k, v pointers can optionally be used for
        # tracking the physical cache by the CacheBlock itself.
        self.k_ptrs = k_ptrs
        self.v_ptrs = v_ptrs

        # the number of references currently held on this block
        self.ref_count = 0
        # the number of slots that have been allocated (i.e. the number of tokens occupying the block)
        self.allocated_size = 0
        # the token ids whose KV Cache would be written to corresponding physical caches
        # TODO add logics to update token_ids
        self.token_ids = [None] * self.block_size

    @property
    def available_space(self) -> int:
        """Return the number of token slots that are still free in this block."""
        # `allocated_size` is ensured to be less than or equal to `block_size`
        return self.block_size - self.allocated_size

    def add_ref(self) -> None:
        """Increase the reference count by one."""
        self.ref_count += 1

    def remove_ref(self) -> None:
        """Decrease the reference count by one.

        Raises:
            AssertionError: If the block has no reference left (when assertions are enabled).
        """
        assert self.ref_count > 0, f"Block#{self.block_id} has no reference to remove."
        self.ref_count -= 1

    def has_ref(self) -> bool:
        """Return True if at least one reference is held on this block."""
        return self.ref_count > 0

    def allocate(self, size: int) -> None:
        """Mark ``size`` additional token slots of this block as occupied.

        Args:
            size: Number of slots to occupy; must be in ``[0, available_space]``.

        Raises:
            AssertionError: If ``size`` is negative or larger than ``available_space``
                (when assertions are enabled).
        """
        # A negative size would pass the capacity check below and silently shrink
        # `allocated_size`, letting `available_space` grow beyond `block_size`.
        assert size >= 0, f"Block#{self.block_id} cannot allocate a negative size ({size})."
        assert size <= self.available_space, f"Block#{self.block_id} has no available space to allocate."
        self.allocated_size += size

    def is_empty(self) -> bool:
        """Return True if no slot of this block has been allocated."""
        return self.allocated_size < 1

    def clear(self) -> None:
        """Reset the reference count and the allocated size to zero.

        `token_ids`, `k_ptrs` and `v_ptrs` are left untouched.
        """
        self.ref_count = 0
        self.allocated_size = 0

    def __repr__(self) -> str:
        return f"CacheBlock#{self.block_id}(ref#{self.ref_count}, allocated#{self.allocated_size})"
|