mirror of https://github.com/hpcaitech/ColossalAI
* RPC support source
* KV cache logical/physical disaggregation
* Sampler refactor
* ColossalAI launch built in
* Unit tests
* RPyC support

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Branch: pull/5716/head
Runyu Lu committed 6 months ago via GitHub
15 changed files with 1032 additions and 63 deletions
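The RPC path added here builds on RPyC's client/server model: each tensor-parallel rank runs a `ThreadedServer` exposing a worker service, and the engine process connects to every server as a client and drives it through its `exposed_*` methods. As a minimal sketch of that pattern (the `EchoService` class, the port number, and the timing are illustrative and not part of this PR):

import time
from threading import Thread

import rpyc
from rpyc.utils.server import ThreadedServer


class EchoService(rpyc.Service):
    # Only methods prefixed with `exposed_` are callable from the client side.
    def exposed_echo(self, msg: str) -> str:
        return f"echo: {msg}"


if __name__ == "__main__":
    # Server side: serve EchoService on an arbitrary local port, here in a background thread.
    server = ThreadedServer(EchoService, port=18861, protocol_config={"allow_public_attrs": True})
    Thread(target=server.start, daemon=True).start()
    time.sleep(0.5)  # give the server a moment to start listening

    # Client side: connect and invoke the exposed method through conn.root.
    conn = rpyc.connect("localhost", 18861)
    print(conn.root.echo("hello"))                      # synchronous call
    async_res = rpyc.async_(conn.root.echo)("world")    # non-blocking call, the style used by the engine below
    async_res.wait()
    print(async_res.value)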
@@ -0,0 +1,291 @@
import asyncio
from itertools import count
from time import sleep
from typing import List, Tuple, Union

import rpyc
import torch
import torch.nn as nn
from rpyc.utils.server import ThreadedServer
from torch import multiprocessing as mp
from transformers import AutoConfig, PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers.configuration_utils import PretrainedConfig

from colossalai.inference.batch_bucket import BatchBucket
from colossalai.inference.config import InferenceConfig, InputMetaData
from colossalai.inference.executor.rpc_worker import rpcWorkerService
from colossalai.inference.utils import find_available_ports
from colossalai.logging import get_dist_logger
from colossalai.shardformer.policies.base_policy import Policy

from .engine import InferenceEngine
from .request_handler import RPCRequestHandler

__all__ = ["RPCInferenceEngine"]


def run_server(host, port, event: mp.Event = None):
    server = ThreadedServer(
        rpcWorkerService, port=port, protocol_config={"allow_public_attrs": True, "allow_all_attrs": True}
    )
    if event:
        event.set()
    server.start()

class RPCInferenceEngine(InferenceEngine):
    """
    InferenceEngine that manages the inference process over RPC workers.

    NOTE: This `RPCInferenceEngine` is designed for multi-card/online serving.
    The original `InferenceEngine` is designed for single-card, offline service, though it supports multi-card offline inference.

    Args:
        model_or_path (nn.Module or str): Path to the model. Passing an `nn.Module` is currently not supported.
        tokenizer (Union[PreTrainedTokenizer, PreTrainedTokenizerFast]): The tokenizer to use.
        inference_config (InferenceConfig): Stores the configuration information related to inference.
        verbose (bool): Whether to log the generation process.
        model_policy (Policy): The policy used to shard the model with shardformer. Determined by the model type if not provided.
    """

    def __init__(
        self,
        model_or_path: Union[nn.Module, str],
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
        inference_config: InferenceConfig,
        verbose: bool = False,
        model_policy: Policy = None,
    ) -> None:
        """
        If you pass a real model loaded by transformers, the init will take quite a long time.
        Passing the model as an `nn.Module` is currently not supported.
        """

        torch.multiprocessing.set_start_method("spawn", force=True)

        self.inference_config = inference_config
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.verbose = verbose
        self.logger = get_dist_logger(__name__)

        try:
            if isinstance(model_or_path, str):
                self.model_config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True)
            elif isinstance(model_or_path, nn.Module):
                self.logger.error(
                    f"An exception occurred during loading model config: {__class__.__name__} does not support nn.Module as the model argument yet\n"
                )
                # self.model_config = model_or_path.config
            else:
                self.logger.error(
                    f"An exception occurred during loading model config: please pass a valid model argument to {__class__.__name__}\n"
                )
        except Exception as e:
            self.logger.error(
                f"An exception occurred during loading model config: {e}. The path should point to a transformers-style checkpoint\n"
            )
        self.generation_config = inference_config.to_generation_config(self.model_config)

        self.tp_size = inference_config.tp_size
        self.events = [mp.Event() for _ in range(self.tp_size)]

        # This operation will init the dist env and models on the workers.
        self.workers: List[rpcWorkerService] = []
        self.init_workers()

        asyncio.run(self.init_model(model_or_path, model_policy))

        # Init the scheduler and the logical block manager.
        self.request_handler = self.init_scheduler(self.inference_config, self.model_config)

        # Init the physical cache on the workers.
        alloc_shape = self.request_handler.cache_manager.get_physical_cache_shape()
        self.init_device_cache(alloc_shape)

        self.use_cuda_graph = self.inference_config.use_cuda_graph
        self.high_precision = inference_config.high_precision
        self.dtype = inference_config.dtype

        # Model and related attrs of speculative decoding will be set by `enable_spec_dec`.
        self.use_spec_dec = False
        self.drafter_model = None
        self.drafter = None
        self.use_glide = False
        self.n_spec_tokens = self.inference_config.max_n_spec_tokens

        self.counter = count()
        self._verify_args()

        self.logger.info("engine init done")

    def _verify_args(self) -> None:
        """Verify the input args."""
        if not isinstance(self.inference_config, InferenceConfig):
            raise TypeError("Invalid type of inference config provided.")
        if not isinstance(self.tokenizer, (PreTrainedTokenizerFast, PreTrainedTokenizer)):
            raise TypeError(
                f"the tokenizer type must be PreTrainedTokenizer or PreTrainedTokenizerFast, but got {type(self.tokenizer)}"
            )

    def init_workers(self):
        rpc_ports = find_available_ports(self.tp_size)
        self.worker_processes = []
        # mp.set_start_method('spawn')
        for event, rpc_port in zip(self.events, rpc_ports):
            p = mp.Process(target=run_server, args=("localhost", rpc_port, event))
            p.start()
            self.worker_processes.append(p)
            self.logger.info(f"Starting RPC Worker on localhost:{rpc_port}...")

        # Wait for all servers to start.
        for event in self.events:
            event.wait()
            event.clear()

        sleep(0.05)

        self.logger.info("RPC servers initialized.")

        for rpc_port in rpc_ports:
            try:
                conn = rpyc.connect(
                    "localhost",
                    rpc_port,
                    config={"allow_pickle": True, "allow_public_attrs": True, "allow_all_attrs": True},
                )
                self.workers.append(conn.root)
            except Exception as e:
                raise Exception(f"Failed to connect to the RPC worker on port {rpc_port}") from e
        self.logger.info("RPC connections established. Begin to load model...")
        asyncio.run(self.init_worker_env())
        self.logger.info("Distributed environment initialized on all workers.")

    async def async_parallel_wrapper(self, f, *args, **kwargs):
        async_res = rpyc.async_(f)(*args, **kwargs)
        await asyncio.to_thread(async_res.wait)
        assert async_res.ready
        return async_res.value

    async def init_worker_env(self):
        assert len(self.workers) == self.tp_size, "init workers first"

        dist_group_port = find_available_ports(1)[0]
        init_tasks = [
            self.async_parallel_wrapper(
                worker.init_dist_env, rank, self.inference_config.tp_size, "127.0.0.1", dist_group_port
            )
            for rank, worker in enumerate(self.workers)
        ]

        await asyncio.gather(*init_tasks)

    async def init_model(self, model_or_path: Union[nn.Module, str], model_policy: Policy = None):
        assert len(self.workers) == self.tp_size, "init workers first"

        inference_config_param = self.inference_config.to_rpc_param()
        model_path = model_or_path
        model_policy_param = model_policy.to_rpc_param() if model_policy else None

        init_tasks = [
            self.async_parallel_wrapper(worker.init_model, inference_config_param, model_path, model_policy_param)
            for rank, worker in enumerate(self.workers)
        ]

        await asyncio.gather(*init_tasks)

    def init_scheduler(self, inference_config: InferenceConfig, model_config: PretrainedConfig) -> RPCRequestHandler:
        return RPCRequestHandler(inference_config, model_config)

    async def _init_device_cache(self, alloc_shape: Tuple[Tuple[int, ...], Tuple[int, ...]]):
        assert len(self.workers) == self.tp_size, "init workers first"

        init_tasks = [self.async_parallel_wrapper(worker.init_cache, alloc_shape) for worker in self.workers]

        await asyncio.gather(*init_tasks)

    def init_device_cache(self, alloc_shape: Tuple[Tuple[int, ...], Tuple[int, ...]]):
        asyncio.run(self._init_device_cache(alloc_shape))

    def prepare_input(self, batch: BatchBucket) -> Tuple[List[int], InputMetaData]:
        input_ids = batch.get_1D_inputs()
        sequence_lengths = batch.get_sequence_lengths()

        if batch.is_prompts:
            n_tokens = sequence_lengths.sum().item()
        else:
            n_tokens = batch.current_batch_size
            if batch.use_spec_dec:
                n_tokens = batch.num_tokens_to_verify + 1
                assert n_tokens == input_ids.size(0)
                n_tokens = n_tokens * batch.current_batch_size

        batch_token_ids = None
        config_dict = self.generation_config.to_dict()
        # Batch token ids are only needed when repetition_penalty or no_repeat_ngram_size is set.
        for option in ["repetition_penalty", "no_repeat_ngram_size"]:
            if option in config_dict and config_dict[option] is not None:
                batch_token_ids = batch.batch_token_ids

        # CUDA graph can only be used for decoding when a graph has been captured for this batch size.
        use_cuda_graph = False
        if self.use_cuda_graph and not batch.is_prompts and batch.current_batch_size in self.graph_runners.keys():
            use_cuda_graph = True

        input_meta_data = InputMetaData(
            block_tables=batch.get_block_table_tensor(),
            sequence_lengths=sequence_lengths,
            fd_inter_tensor=None,
            batch_size=batch.current_batch_size,
            is_prompts=batch.is_prompts,
            use_cuda_kernel=self.inference_config.use_cuda_kernel,
            use_cuda_graph=use_cuda_graph,
            high_precision=self.high_precision,
            kv_seq_len=sequence_lengths.max().item(),
            head_dim=batch.head_dim,
            dtype=batch.dtype,
            use_spec_dec=batch.use_spec_dec,
            num_tokens_to_verify=batch.num_tokens_to_verify,
            batch_token_ids=batch_token_ids,
        )

        return input_ids.tolist(), input_meta_data

    async def step_(self, input_token_ids, input_meta_data: InputMetaData):
        assert len(self.workers) == self.tp_size, "init workers first"

        init_tasks = [
            self.async_parallel_wrapper(worker.execute_model_forward, input_token_ids, input_meta_data.to_rpc_param())
            for worker in self.workers
        ]
        ret = await asyncio.gather(*init_tasks)

        return ret[0]

    def step(self) -> List[str]:
        batch = self.request_handler.schedule()

        input_token_ids, input_meta_data = self.prepare_input(batch)
        # TODO: padding_id is used for generating attn_mask and will be removed if nopad version is supported.
        next_tokens = asyncio.run(self.step_(input_token_ids, input_meta_data))

        # Update the request handler with the newly generated tokens.
        next_tokens = torch.tensor(next_tokens, dtype=torch.int)
        self.request_handler.append_next_tokens(next_tokens)
        finished_sequences = self.request_handler.update()
        return finished_sequences

    def kill_workers(self):
        """
        Kill all RPC worker processes. There is no reliable way to have this invoked
        implicitly, so it is also called from `__del__`.
        """
        assert len(self.workers) != 0
        for proc in self.worker_processes:
            proc.kill()
            proc.join()
        self.logger.info("Workers killed, serving ended.")

    def __del__(self):
        self.kill_workers()
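Read together with the unit test further down, driving the engine from user code looks roughly like the sketch below. The model path, prompts, and config values are illustrative only; `add_request` and `generate` come from the parent `InferenceEngine`.

from transformers import AutoTokenizer, GenerationConfig

from colossalai.inference.config import InferenceConfig
from colossalai.inference.core.rpc_engine import RPCInferenceEngine

# Illustrative values; any transformers-style checkpoint path should work.
model_path = "meta-llama/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# tp_size controls how many RPC worker processes the engine spawns.
inference_config = InferenceConfig(max_output_len=64, dtype="fp16", tp_size=2)
engine = RPCInferenceEngine(model_path, tokenizer, inference_config, verbose=True)

engine.add_request(prompts=["Introduce Beijing:", "Introduce Wuhan:"])
outputs = engine.generate(generation_config=GenerationConfig(max_new_tokens=64, do_sample=False))
print(outputs)

# Worker processes are killed in __del__, but it can also be done explicitly.
engine.kill_workers()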
@@ -0,0 +1,300 @@
import os
from typing import List, Tuple, Union

import rpyc
import torch
import torch.distributed as dist
from torch import nn
from transformers import AutoConfig, AutoModelForCausalLM
from transformers.models.llama.modeling_llama import LlamaForCausalLM

import colossalai
from colossalai.accelerator import get_accelerator
from colossalai.cluster import ProcessGroupMesh
from colossalai.inference.config import InferenceConfig, InputMetaData
from colossalai.inference.flash_decoding_utils import FDIntermTensors
from colossalai.inference.modeling.policy import (
    NoPaddingBaichuanModelInferPolicy,
    NoPaddingLlamaModelInferPolicy,
    model_policy_map,
)
from colossalai.inference.sampler import search_tokens
from colossalai.inference.utils import get_model_size, has_index_file
from colossalai.interface import ModelWrapper
from colossalai.logging import get_dist_logger
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer import ShardConfig, ShardFormer
from colossalai.shardformer.policies.base_policy import Policy

PP_AXIS, TP_AXIS = 0, 1

_SUPPORTED_MODELS = {
    "LlamaForCausalLM": LlamaForCausalLM,
    "BaichuanForCausalLM": AutoModelForCausalLM,
}

_SUPPORTED_MODEL_POLICIES = {
    "NoPaddingLlamaModelInferPolicy": NoPaddingLlamaModelInferPolicy,
    "NoPaddingBaichuanModelInferPolicy": NoPaddingBaichuanModelInferPolicy,
}

logger = get_dist_logger(__name__)


class rpcWorkerService(rpyc.Service):
    """
    Execute the computation tasks and manage the worker's own KV cache.

    Methods prefixed with `exposed_` can be invoked by the client.
    """

    def exposed_init_dist_env(self, rank, world_size, master_address, master_port):
        logger.info(f"init process group for rank {rank}")
        colossalai.launch(rank=rank, world_size=world_size, port=master_port, host=master_address)
        logger.info(f"init process group done for rank {rank}")

    def exposed_init_model(
        self, inference_config_param: dict, model_or_path: Union[nn.Module, str], model_policy_param: str = None
    ):
        assert dist.is_initialized(), "invoke init_dist_env first please!"

        self.inference_config = InferenceConfig.from_rpc_param(inference_config_param)
        model_policy = _SUPPORTED_MODEL_POLICIES[model_policy_param]() if model_policy_param else None

        self.dtype = self.inference_config.dtype
        self.verbose = True

        self._init_model(model_or_path, model_policy)
        self._init_fd_tensor()
        self._init_output_tensor()
        logger.info(f"init model done for rank {dist.get_rank()}")

    def exposed_init_cache(self, alloc_shape: Tuple[Tuple[int, ...], Tuple[int, ...]]):
        """Initialize the physical cache on the device.

        For each layer of the model, we allocate two tensors, one for keys and one for values,
        each with shape [num_blocks, num_kv_heads, block_size, head_size].
        """
        kalloc_shape, valloc_shape = alloc_shape
        num_layers = self.model_config.num_hidden_layers

        self.k_cache: List[torch.Tensor] = []
        self.v_cache: List[torch.Tensor] = []
        for _ in range(num_layers):
            self.k_cache.append(
                torch.zeros(
                    kalloc_shape,
                    dtype=self.inference_config.kv_cache_dtype,
                    device=get_accelerator().get_current_device(),
                )
            )
            self.v_cache.append(
                torch.zeros(
                    valloc_shape,
                    dtype=self.inference_config.kv_cache_dtype,
                    device=get_accelerator().get_current_device(),
                )
            )
        logger.info("physical cache initialized")

    def exposed_execute_model_forward(self, input_token_ids_param: List[int], input_meta_data_param: dict):
        # Prepare the data for the model forward pass.
        input_meta_data = InputMetaData.from_rpc_param(input_meta_data_param)
        input_meta_data.fd_inter_tensor = self.fd_inter_tensor
        if input_meta_data.is_prompts:
            n_tokens = input_meta_data.sequence_lengths.sum().item()
        else:
            n_tokens = input_meta_data.batch_size
        input_token_ids = torch.tensor(input_token_ids_param, dtype=torch.int, device=self.device)

        # Execute the model.
        logits = self.model(
            input_token_ids,
            self.output_tensor[:n_tokens],
            input_meta_data,
            self.k_cache,
            self.v_cache,
        )

        # Sample the next tokens.
        if self.inference_config.pad_input:
            logits = logits[:, -1, :]
        next_tokens = search_tokens(
            self.inference_config.to_generation_config(self.model_config),
            logits,
            input_meta_data.is_prompts,
            input_meta_data.batch_token_ids,
        )

        # Return the generated tokens to the scheduler.
        return next_tokens.tolist()

    def _init_output_tensor(self):
        alloc_shape = (
            self.inference_config.max_batch_size
            * (self.inference_config.max_input_len + self.inference_config.max_output_len),
            self.model_config.hidden_size // self.inference_config.tp_size,
        )
        self.output_tensor = torch.zeros(alloc_shape, dtype=self.dtype, device=self.device)

    def _init_fd_tensor(self):
        fd_inter_tensor = FDIntermTensors()

        if fd_inter_tensor._tensors_initialized:
            fd_inter_tensor._reset()

        # For Spec-Dec, process the speculated tokens plus the token from the last step for each sequence.
        max_n_tokens = self.inference_config.max_batch_size
        max_n_tokens *= self.inference_config.max_n_spec_tokens + 1

        inference_config = self.inference_config
        kv_max_split_num = (
            inference_config.max_input_len + inference_config.max_output_len + inference_config.block_size - 1
        ) // inference_config.block_size
        head_dim = self.model_config.hidden_size // self.model_config.num_attention_heads

        fd_inter_tensor.initialize(
            max_batch_size=max_n_tokens,
            num_attn_heads=self.model_config.num_attention_heads // self.inference_config.tp_size,
            kv_max_split_num=kv_max_split_num,
            head_dim=head_dim,
            dtype=self.dtype,
            device=get_accelerator().get_current_device(),
        )

        self.fd_inter_tensor = fd_inter_tensor

    def _init_model(self, model_or_path: Union[nn.Module, str], model_policy: Policy = None):
        """
        Shard the model and/or load its weights.

        Shard model: when tp_size > 1, the model is sharded with the given model_policy.
        Load weights: if a local model path is passed, weights are loaded via checkpoint_io;
        if it is a remote transformers repo, `from_pretrained` of the transformers library is used.

        Args:
            model_or_path (Union[nn.Module, str]): path to the checkpoint, or a model in transformers format.
            model_policy (Policy): the policy used to replace the model modules.
        """

        if isinstance(model_or_path, str):
            is_local = os.path.isdir(model_or_path)
            try:
                hf_config = AutoConfig.from_pretrained(model_or_path, trust_remote_code=True)
                arch = getattr(hf_config, "architectures")[0]
                if is_local:
                    model = _SUPPORTED_MODELS[arch](hf_config)
                else:
                    # Load the real checkpoint from the hub.
                    model = _SUPPORTED_MODELS[arch].from_pretrained(model_or_path, trust_remote_code=True)
            except Exception as e:
                logger.error(
                    f"An exception occurred during loading model: {e}. The model should be loadable by transformers\n"
                )
        else:
            model = model_or_path

        self.model_config = model.config

        torch.cuda.empty_cache()
        init_gpu_memory = torch.cuda.mem_get_info()[0]

        self.device = get_accelerator().get_current_device()
        torch.cuda.set_device(self.device)
        if self.verbose:
            logger.info(f"the device is {self.device}")

        model = model.to(dtype=self.dtype, non_blocking=False).eval()

        if self.verbose:
            logger.info(
                f"Before sharding, Rank: [{dist.get_rank()}], model size: {get_model_size(model)} GB, model's device is: {model.device}"
            )

        if model_policy is None:
            if self.inference_config.pad_input:
                model_type = "padding_" + self.model_config.model_type
            else:
                model_type = "nopadding_" + self.model_config.model_type
            model_policy = model_policy_map[model_type]()

        pg_mesh = ProcessGroupMesh(self.inference_config.pp_size, self.inference_config.tp_size)
        tp_group = pg_mesh.get_group_along_axis(TP_AXIS)

        self.model = self._shardformer(
            model,
            model_policy,
            None,
            tp_group=tp_group,
        )

        self.model = ModelWrapper(model).to(device=get_accelerator().get_current_device())

        if self.verbose:
            logger.info(
                f"After sharding, Rank: [{dist.get_rank()}], model size: {get_model_size(self.model)} GB, model's device is: {model.device}"
            )

        if isinstance(model_or_path, str) and is_local:
            from colossalai.inference.core.plugin import InferCheckpoint_io

            cpt_io = InferCheckpoint_io()
            if_has_index_file, model_index_file = has_index_file(model_or_path)
            assert if_has_index_file, "the model path is invalid"
            cpt_io.load_model(self.model, model_index_file)

        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
        peak_memory = init_gpu_memory - free_gpu_memory
        if self.verbose:
            logger.info(
                f"Rank [{dist.get_rank()}], Model Weight Max Occupy {peak_memory / (1024 ** 3)} GB, Model size: {get_model_size(self.model)} GB"
            )

    def _shardformer(
        self,
        model: nn.Module,
        model_policy: Policy,
        stage_manager: PipelineStageManager = None,
        tp_group: ProcessGroupMesh = None,
    ) -> nn.Module:
        """
        Initialize ShardConfig and replace the model with shardformer.

        Args:
            model (nn.Module): The model to be sharded.
            model_policy (Policy): The policy used to shard the model, determined by the model type.
            stage_manager (PipelineStageManager, optional): Used to manage pipeline stages. Defaults to None.
            tp_group (ProcessGroupMesh, optional): Used to manage the TP process group mesh. Defaults to None.

        Returns:
            nn.Module: The model optimized by Shardformer.
        """

        shardconfig = ShardConfig(
            tensor_parallel_process_group=tp_group,
            pipeline_stage_manager=stage_manager,
            enable_tensor_parallelism=(self.inference_config.tp_size > 1),
            enable_fused_normalization=False,
            enable_all_optimization=False,
            enable_flash_attention=False,
            enable_jit_fused=False,
            enable_sequence_parallelism=False,
        )
        shardformer = ShardFormer(shard_config=shardconfig)
        shard_model, _ = shardformer.optimize(model, model_policy)
        return shard_model

    def exposed_compute_only_for_test(self):
        dist_rank = dist.get_rank()

        # Dummy data for each worker.
        data = torch.tensor([dist_rank], dtype=torch.float).cuda(dist_rank)
        dist.barrier()

        # Perform a distributed all_reduce.
        dist.all_reduce(data, op=dist.ReduceOp.SUM)

        dist.barrier()
        logger.info(f"Worker rank {dist_rank}: Sum after all_reduce: {data.item()}")

        return data.item()
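Since `exposed_init_cache` above allocates two tensors (key and value) per layer, each of shape [num_blocks, num_kv_heads, block_size, head_size], the per-rank KV cache footprint can be estimated up front. A small back-of-the-envelope helper; the example numbers are made up for illustration, not taken from this PR:

import torch


def kv_cache_bytes(num_layers: int, num_blocks: int, num_kv_heads: int,
                   block_size: int, head_size: int, dtype: torch.dtype = torch.float16) -> int:
    """Total bytes for the paged KV cache: 2 tensors (K and V) per layer."""
    elems_per_tensor = num_blocks * num_kv_heads * block_size * head_size
    bytes_per_elem = torch.tensor([], dtype=dtype).element_size()
    return 2 * num_layers * elems_per_tensor * bytes_per_elem


# Example: a Llama-2-7B-like config sharded over tp_size=2 (32 layers, 32 KV heads / 2 ranks,
# head_size 128) with 512 blocks of 16 tokens each comes to roughly 2 GiB per rank in fp16.
print(kv_cache_bytes(num_layers=32, num_blocks=512, num_kv_heads=16, block_size=16, head_size=128) / 1024**3)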
@@ -1,4 +1,4 @@
 from .block_cache import CacheBlock
-from .kvcache_manager import KVCacheManager
+from .kvcache_manager import KVCacheManager, RPCKVCacheManager

-__all__ = ["CacheBlock", "KVCacheManager"]
+__all__ = ["CacheBlock", "KVCacheManager", "RPCKVCacheManager"]
@@ -0,0 +1,105 @@
import random

import numpy as np
import pytest
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from colossalai.inference.config import _DEFAULT_PROMPT_TEMPLATES, InferenceConfig
from colossalai.inference.core.rpc_engine import RPCInferenceEngine
from colossalai.inference.modeling.policy import NoPaddingLlamaModelInferPolicy
from colossalai.testing import parameterize, rerun_if_address_is_in_use


def setup_seed(seed):
    torch.manual_seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)


def check_inference_engine(tp_size, use_engine=False, prompt_template=None, do_sample=True, policy=None):
    setup_seed(20)
    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
    model = "meta-llama/Llama-2-7b-hf"  # remote model path
    inputs = [
        # Chinese prompts: "Introduce today's Beijing, e.g. the Forbidden City, Tiananmen, the Great Wall or other sights," / "Introduce Wuhan,"
        "介绍一下今天的北京,比如故宫,天安门,长城或者其他的一些景点,",
        "介绍一下武汉,",
    ]

    output_len = 38
    top_p = 0.5
    top_k = 50

    if use_engine:
        inference_config = InferenceConfig(
            max_output_len=output_len,
            prompt_template=prompt_template,
            dtype="fp32",
            use_cuda_kernel=True,
            tp_size=tp_size,
        )
        inference_engine = RPCInferenceEngine(model, tokenizer, inference_config, verbose=True, model_policy=policy)
        assert inference_engine.generation_config.max_new_tokens == output_len
        inference_engine.add_request(prompts=inputs)
        assert inference_engine.request_handler._has_waiting()
        generation_config = GenerationConfig(
            max_new_tokens=output_len, do_sample=do_sample, dtype="fp32", top_p=top_p, top_k=top_k
        )
        outputs = inference_engine.generate(generation_config=generation_config)
    else:
        if prompt_template:
            # Apply the prompt template.
            inputs = [_DEFAULT_PROMPT_TEMPLATES[prompt_template].format(input_text=input_text) for input_text in inputs]
        model = AutoModelForCausalLM.from_pretrained(model).cuda()
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.pad_token_id = tokenizer.eos_token_id
        inputs = tokenizer.batch_encode_plus(inputs, padding=True, return_tensors="pt")["input_ids"]
        inputs = inputs.cuda()
        generation_config = GenerationConfig(
            do_sample=do_sample,
            dtype="fp32",
            top_p=top_p,
            top_k=top_k,
            pad_token_id=tokenizer.pad_token_id,
            max_new_tokens=output_len,
        )
        outputs = model.generate(inputs, generation_config=generation_config)
        outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return outputs


def run_engine(tp_size, **kwargs):
    return check_inference_engine(tp_size=tp_size, **kwargs)


@pytest.mark.largedist
@parameterize("prompt_template", [None, "llama"])
@parameterize("do_sample", [False])
@rerun_if_address_is_in_use()
def test_tp_engine(prompt_template, do_sample):
    if torch.multiprocessing.get_start_method(allow_none=True) is None:
        torch.multiprocessing.set_start_method("spawn")
    kwargs1 = {
        "use_engine": True,
        "prompt_template": prompt_template,
        "do_sample": do_sample,
        "policy": NoPaddingLlamaModelInferPolicy(),
    }

    kwargs2 = {"use_engine": False, "prompt_template": prompt_template, "do_sample": do_sample, "policy": None}

    colossal_tp_1_output = run_engine(1, **kwargs1)
    colossal_tp_2_output = run_engine(2, **kwargs1)
    transformer_tp_1_output = run_engine(1, **kwargs2)

    for s1, s2, s3 in zip(colossal_tp_1_output, colossal_tp_2_output, transformer_tp_1_output):
        assert s1 == s3, f"\nColossalAI TP=1 Output: {s1}\nTransformers Output: {s3}"
        assert s1 == s2, f"\nColossalAI TP=1 Output: {s1}\nColossalAI TP=2 Output: {s2}"


if __name__ == "__main__":
    torch.multiprocessing.set_start_method("spawn")  # workers must be spawned; forking subprocesses is not supported here
    test_tp_engine()