diff --git a/colossalai/inference/config.py b/colossalai/inference/config.py
deleted file mode 100644
index d274beb14..000000000
--- a/colossalai/inference/config.py
+++ /dev/null
@@ -1,7 +0,0 @@
-"""
-Our config consists of three parts:
-    1. model_config: The configuration for the model, including `model name`, 'model path' and self-defined layer.
-    2. parallel_config: The configuration for parallelize model, including `tp_size`,'pp_size', `world size`, `local rank`, `master port`, `master ip`.
-    3. cache_config: Configuration for initialize and manage kv cache, including `block size`, `block num`
-For the convenience of users, we provide a unified config api for that wrapped all the configs. One can easily construct a colossal_config by setting the needed configs.
-"""
diff --git a/colossalai/inference/core/config.py b/colossalai/inference/core/config.py
new file mode 100644
index 000000000..6b44dd7af
--- /dev/null
+++ b/colossalai/inference/core/config.py
@@ -0,0 +1,54 @@
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+@dataclass
+class InferenceConfig:
+    """The inference configuration.
+
+    Args:
+        model: Path or nn.Module of this model.
+        tokenizer: Path of the tokenizer to use.
+        tokenizer_mode: "auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer.
+        trust_remote_code: Whether to trust remote code from huggingface.
+        max_batch_size: Maximum batch size.
+        max_output_len: Maximum output length.
+        max_input_len: Maximum input length.
+        block_size: The number of token slots in a logical block.
+        gpu_utilization_rate: Maximum GPU memory usage ratio.
+        dtype: The data type for weights and activations.
+        tp_size: Tensor parallel size.
+        pp_size: Pipeline parallel size.
+        max_seq_len: Maximum length of the input sequence.
+        quant_mode: Quantization mode.
+        revision: The specific version (a branch name, a commit id, or a tag name) of the model to use.
+    """
+
+    model: Union[str, nn.Module]
+    tokenizer: str = None
+    tokenizer_mode: str = "auto"
+    trust_remote_code: bool = False
+    max_batch_size: int = 8
+    max_output_len: int = 256
+    max_input_len: int = 256
+    block_size: int = 16
+    gpu_utilization_rate: float = 0.7
+    dtype: Union[str, torch.dtype] = torch.float32
+    tp_size: int = 1
+    pp_size: int = 1
+    max_seq_len: Optional[int] = None
+    quant_mode: Optional[str] = None
+    revision: Optional[str] = None
+
+    def __post_init__(self):
+        self._verify_args()
+
+    def _verify_args(self):
+        if self.gpu_utilization_rate > 1.0:
+            raise ValueError(
+                f"GPU utilization rate should not be greater than 1.0, but is set to {self.gpu_utilization_rate}."
+            )
+        if self.tokenizer_mode not in ["auto", "slow"]:
+            raise ValueError(f"Tokenizer mode must be either 'auto' or 'slow', but got {self.tokenizer_mode}.")
diff --git a/colossalai/inference/core/engine.py b/colossalai/inference/core/engine.py
index bf26b3ecb..7f78e9761 100644
--- a/colossalai/inference/core/engine.py
+++ b/colossalai/inference/core/engine.py
@@ -1,12 +1,14 @@
 from logging import Logger
 from typing import Optional
 
-from .request_handler import RequestHandler
+from transformers import AutoConfig
 
+from .config import InferenceConfig
 
-class InferEngine:
+
+class InferenceEngine:
     """
-    InferEngine is the core component for Inference.
+    InferenceEngine is the core component for Inference.
     It is responsible for launch the inference process, including:
         - Initialize model and distributed training environment(if needed)
@@ -15,37 +17,27 @@ class InferEngine:
         - Log the generation process
 
     Args:
-        colossal_config: We provide a unified config api for that wrapped all the configs. You can use it to replace the below configs.
-        model_config : The configuration for the model.
-        parallel_config: The configuration for parallelize model.
-        cache_config : Configuration for initialize and manage kv cache.
-        tokenizer (Tokenizer): The tokenizer to be used for inference.
-        use_logger (bool): Determine whether or not to log the generation process.
+        tokenizer: Path of the tokenizer to use.
+        inference_config: A unified config that wraps all the arguments needed for inference.
+        verbose (bool): Determine whether or not to log the generation process.
     """
 
     def __init__(
         self,
-        model_config,
-        cache_config,
-        parallel_config,
-        tokenizer,
-        use_logger: bool = False,
-        colossal_config: Optional["ColossalInferConfig"] = None,
+        tokenizer: str = None,
+        inference_config: Optional["InferenceConfig"] = None,
+        verbose: bool = False,
     ) -> None:
-        assert colossal_config or (
-            model_config and cache_config and parallel_config
-        ), "Please provide colossal_config or model_config, cache_config, parallel_config"
-        if colossal_config:
-            model_config, cache_config, parallel_config = colossal_config
-
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.parallel_config = parallel_config
-        self._verify_config()
+        assert inference_config, "Please provide inference_config."
 
         self._init_model()
-        self.request_handler = RequestHandler(cache_config)
-        if use_logger:
+        # cache_config may need to be modified later.
+        # self.request_handler = RequestHandler(cache_config)
+        self.tokenizer = tokenizer
+        self.hf_model_config = AutoConfig.from_pretrained(
+            inference_config.model, trust_remote_code=inference_config.trust_remote_code, revision=inference_config.revision
+        )
+        if verbose:
             self.logger = Logger()
 
     def _init_model(self):
diff --git a/colossalai/inference/core/inference_struct.py b/colossalai/inference/core/inference_struct.py
new file mode 100644
index 000000000..331f0308a
--- /dev/null
+++ b/colossalai/inference/core/inference_struct.py
@@ -0,0 +1,169 @@
+import enum
+from dataclasses import dataclass
+from typing import Dict, List, Set
+
+
+class RequestStatus(enum.Enum):
+    """The status of a sequence."""
+
+    WAITING = enum.auto()
+    RUNNING = enum.auto()
+    ABORTED = enum.auto()
+    OVERLENGTH = enum.auto()
+    COMPLETED = enum.auto()
+    LENGTH_CAPPED = enum.auto()
+
+    @staticmethod
+    def is_finished(status: "RequestStatus") -> bool:
+        return status in [
+            RequestStatus.OVERLENGTH,
+            RequestStatus.COMPLETED,
+            RequestStatus.LENGTH_CAPPED,
+        ]
+
+    @staticmethod
+    def is_running(status: "RequestStatus") -> bool:
+        return status == RequestStatus.RUNNING
+
+    @staticmethod
+    def is_waiting(status: "RequestStatus") -> bool:
+        return status == RequestStatus.WAITING
+
+
+class Sequence:
+    """Store information of an input sequence.
+
+    Args:
+        request_id: The ID of the input sequence.
+        prompt: The prompt of the input sequence.
+        token_id: The token IDs of the input sequence.
+        block_size: The block size of the input sequence.
+        sample_params: The sample_params of the input sequence.
+        block_table_index: The index of the input sequence in block_table.
+    """
+
+    def __init__(
+        self,
+        request_id: int,
+        prompt: str,
+        token_id: List[int],
+        block_size: int,
+        sample_params,  # SampleParams needs to be imported later.
+        block_table_index: int,
+    ):
+        self.request_id = request_id
+        self.prompt = prompt
+        self.input_token_id = token_id
+        self.block_size = block_size
+        self.sample_params = sample_params
+        self.output_token_id = []
+        self.status = RequestStatus.WAITING
+        self.block_table_index = block_table_index
+
+    def get_sentence_len(self) -> int:
+        """
+        Get the total length (prompt plus output) of the current sentence.
+        """
+        return len(self.input_token_id) + len(self.output_token_id)
+
+    def get_input_len(self) -> int:
+        """
+        Get the input length of the current sentence.
+        """
+        return len(self.input_token_id)
+
+    def get_output_len(self) -> int:
+        """
+        Get the output length of the current sentence.
+        """
+        return len(self.output_token_id)
+
+    def check_finish(self) -> bool:
+        """
+        Check whether inference is over.
+        """
+        return RequestStatus.is_finished(self.status)
+
+    def __repr__(self) -> str:
+        return (
+            f"Request ID(request_id={self.request_id}, "
+            f"prompt={self.prompt}, "
+            f"status={self.status.name}, "
+            f"sample_params={self.sample_params}, "
+            f"block_table_index={self.block_table_index})"
+        )
+
+
+@dataclass
+class BatchHandler:
+    """
+    Information to be passed and used for a batch of sequences.
+    """
+
+    sequences_set: Set[Sequence]
+    block_table: Dict[int, int]
+
+    @classmethod
+    def init_batch(cls, seqs: List[Sequence]) -> "BatchHandler":
+        """
+        Initialize an inference batch from a list of input sequences.
+
+        Args:
+            seqs (List[Sequence]): List of input sequences.
+        """
+        sequences_set = set()
+        block_table = {}
+        for seq in seqs:
+            if seq in sequences_set:
+                print("The sequence is already in sequences_set.")
+                assert (
+                    seq.request_id in block_table
+                ), "The sequence has been added to sequences_set, but it has not been added to block_table."
+                continue
+            assert (
+                seq.request_id not in block_table
+            ), "The sequence has not been added to sequences_set, but it is already in block_table."
+
+            sequences_set.add(seq)
+            block_table[seq.request_id] = seq.block_table_index
+
+        return cls(sequences_set=sequences_set, block_table=block_table)
+
+    def clear_batch(self) -> None:
+        """
+        Clear the sequence set and the block table.
+        """
+        for seq in self.sequences_set:
+            if not seq.check_finish():
+                seq.status = RequestStatus.ABORTED
+        self.sequences_set.clear()
+        self.block_table.clear()
+
+    def filter_batch(self) -> None:
+        """
+        Remove completed sentences from the batch.
+        """
+        for seq in self.sequences_set.copy():  # iterate over a copy so the set can be mutated safely
+            if seq.check_finish():
+                self.sequences_set.remove(seq)
+                del self.block_table[seq.request_id]
+
+    def add_seqs(self, seqs: List[Sequence]) -> None:
+        """
+        Add new sequences to the batch.
+
+        Args:
+            seqs (List[Sequence]): The list of new sequences.
+        """
+        for seq in seqs:
+            if seq in self.sequences_set:
+                print("The sequence is already in sequences_set.")
+                assert (
+                    seq.request_id in self.block_table
+                ), "The sequence has been added to sequences_set, but it has not been added to block_table."
+                continue
+            assert (
+                seq.request_id not in self.block_table
+            ), "The sequence has not been added to sequences_set, but it is already in block_table."
+            self.sequences_set.add(seq)
+            self.block_table[seq.request_id] = seq.block_table_index
diff --git a/tests/test_infer/test_config_and_struct.py b/tests/test_infer/test_config_and_struct.py
new file mode 100644
index 000000000..580396e51
--- /dev/null
+++ b/tests/test_infer/test_config_and_struct.py
@@ -0,0 +1,37 @@
+from colossalai.inference.core.config import InferenceConfig
+from colossalai.inference.core.inference_struct import BatchHandler, Sequence
+
+
+def test_config_and_struct():
+    InferenceConfig("/llama")
+    sequence = Sequence(
+        request_id=1,
+        prompt="abc",
+        token_id=[1, 2, 3],
+        block_size=16,
+        sample_params=None,
+        block_table_index=1,
+    )
+
+    sequence2 = Sequence(
+        request_id=2,
+        prompt="bcd",
+        token_id=[4, 5, 6],
+        block_size=16,
+        sample_params=None,
+        block_table_index=2,
+    )
+
+    assert sequence.get_sentence_len() == 3
+    assert sequence.get_input_len() == 3
+    assert sequence.get_output_len() == 0
+    assert not sequence.check_finish()
+
+    batch = BatchHandler.init_batch([sequence])
+    batch.filter_batch()
+    batch.add_seqs([sequence2])
+    batch.clear_batch()
+
+
+if __name__ == "__main__":
+    test_config_and_struct()
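
For reference, a minimal usage sketch of how the new pieces compose (not part of the changeset; it mirrors the test above, the "/llama" model path is a placeholder, and sample_params is left as None since SampleParams is not defined yet):

    from colossalai.inference.core.config import InferenceConfig
    from colossalai.inference.core.inference_struct import BatchHandler, RequestStatus, Sequence

    config = InferenceConfig(model="/llama")  # placeholder path; the dataclass does not load the model

    # Track one request through a batch: enqueue it, pretend one token was generated,
    # mark it finished, then prune it from the batch.
    seq = Sequence(
        request_id=1,
        prompt="hello",
        token_id=[1, 2, 3],
        block_size=config.block_size,
        sample_params=None,
        block_table_index=0,
    )
    batch = BatchHandler.init_batch([seq])

    seq.output_token_id.append(42)          # simulate one decoding step
    assert seq.get_sentence_len() == 4

    seq.status = RequestStatus.COMPLETED    # mark the request as finished
    batch.filter_batch()                    # completed sequences are dropped from the batch
    assert seq.request_id not in batch.block_table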