# ColossalAI/colossalai/inference/kv_cache/block_cache.py

from typing import Any

# Names exported by `from <this module> import *`; only `CacheBlock` is public.
__all__ = ["CacheBlock"]


class CacheBlock:
    """A simplified version of logical cache block used for Paged Attention.

    A block tracks how many of its ``block_size`` token slots are in use
    (``allocated_size``) and how many holders currently reference it
    (``ref_count``). It performs pure bookkeeping: no memory is allocated or
    freed by this class.

    Args:
        block_id: Unique id of the cache block.
        block_size: Capacity of the block, in number of tokens it can hold.
        elem_size: Element size in bytes.
        k_ptrs: Optional handle for the physical key cache. Stored as given.
        v_ptrs: Optional handle for the physical value cache. Stored as given.
    """

    def __init__(
        self, block_id: int, block_size: int, elem_size: int, k_ptrs: Any = None, v_ptrs: Any = None
    ) -> None:
        # Unique id of a cache block
        self.block_id = block_id

        # size/capacity of the block in terms of the number of tokens it can hold
        self.block_size = block_size

        # element size in bytes
        self.elem_size = elem_size

        # For common cases, the relationship between logical and physical caches is tracked in the
        # KV Cache Manager. Additionally, k/v pointers can optionally be used so that the CacheBlock
        # itself tracks the physical cache.
        self.k_ptrs = k_ptrs
        self.v_ptrs = v_ptrs

        # number of holders currently referencing this block
        self.ref_count = 0
        # the number of slots that have been allocated (i.e. the number of tokens occupying the block)
        self.allocated_size = 0
        # the token ids whose KV Cache would be written to corresponding physical caches
        # TODO add logics to update token_ids
        self.token_ids = [None] * self.block_size

    @property
    def available_space(self) -> int:
        """Number of token slots that are still free in this block."""
        # `allocated_size` is ensured to be within [0, `block_size`] by `allocate`
        return self.block_size - self.allocated_size

    def add_ref(self) -> None:
        """Register one more reference to this block."""
        self.ref_count += 1

    def remove_ref(self) -> None:
        """Drop one reference to this block.

        Raises:
            AssertionError: If the block currently has no reference.
        """
        assert self.ref_count > 0, f"Block#{self.block_id} has no reference to remove."
        self.ref_count -= 1

    def has_ref(self) -> bool:
        """Return True if at least one reference to this block is held."""
        return self.ref_count > 0

    def allocate(self, size: int) -> None:
        """Mark ``size`` additional token slots of this block as occupied.

        Args:
            size: Number of slots to occupy. Must be non-negative and must not
                exceed ``available_space``. A size of 0 is a no-op.

        Raises:
            AssertionError: If ``size`` is negative or larger than the
                remaining free space.
        """
        # A negative size would silently shrink `allocated_size` and let
        # `available_space` grow beyond `block_size`, so reject it explicitly.
        assert size >= 0, f"Block#{self.block_id} cannot allocate a negative size ({size})."
        assert size <= self.available_space, f"Block#{self.block_id} has no available space to allocate."
        self.allocated_size += size

    def is_empty(self) -> bool:
        """Return True if no slot of this block is occupied."""
        return self.allocated_size < 1

    def clear(self) -> None:
        """Reset the reference count and the occupied-slot counter to zero.

        Identity fields (``block_id``, ``block_size``, ``elem_size``) and the
        k/v pointers are left untouched.
        """
        # NOTE(review): `token_ids` is not reset here; nothing in this class writes to it yet
        # (see the TODO in `__init__`). Revisit once token tracking is implemented.
        self.ref_count = 0
        self.allocated_size = 0

    def __repr__(self) -> str:
        return f"CacheBlock#{self.block_id}(ref#{self.ref_count}, allocated#{self.allocated_size})"