# mirror of https://github.com/hpcaitech/ColossalAI
import logging
import os
from typing import List

import ray
import ray.util.collective as collective
import torch
from transformers import AutoModelForCausalLM

import colossalai
from colossalai.inference.async_manager import start_dynamic_batching
from colossalai.inference.dynamic_batching.get_tokenizer import get_tokenizer
from colossalai.inference.dynamic_batching.io_struct import RequestOutput
from colossalai.inference.dynamic_batching.ray_init_config import EngineArgsClass, RooterArgsClass
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import free_port
ray_serve_logger = logging.getLogger("ray.serve")


def log_cuda_info(scope_name: str):
    ray_serve_logger.info(f" {scope_name}: ray.get_gpu_ids(): {ray.get_gpu_ids()}")
    ray_serve_logger.info(
        f" {scope_name}: CUDA_VISIBLE_DEVICES: {os.getenv('CUDA_VISIBLE_DEVICES', 'NO DEVICES FOUND!')}"
    )
    if torch.cuda.is_available():
        ray_serve_logger.info(
            f" {scope_name}: cuda current_device: {torch.cuda.current_device()}, cuda device count: {torch.cuda.device_count()}"
        )
    else:
        ray_serve_logger.info(f" {scope_name}: cuda is not available!")

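# Ray actor pinned to a single GPU: it loads the HF model, shards it with
# ColossalAI's TPInferEngine, and serves requests through the dynamic-batching manager.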
@ray.remote(num_gpus=1)
class Worker:
    def __init__(
        self,
        model_path: str,
        tensor_parallel_size: int,
        max_batch_size: int,
        max_input_len: int,
        max_output_len: int,
        router_config: RooterArgsClass,
    ):
        log_cuda_info("Worker.init")
        self.tensor_parallel_size = tensor_parallel_size
        self.model_path = model_path
        self.max_batch_size = max_batch_size
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        self.router_config = router_config

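    # Called remotely by the Driver once per worker: joins the ray collective group
    # and the colossalai distributed environment, then loads and shards the model.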
    def setup(self, world_size, rank, port):
        # initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
        collective.init_collective_group(world_size, rank, "nccl", "default")
        # initialize and set distributed environment
        colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
        ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
        log_cuda_info("Worker.setup")

        # Load model and tokenizer
        self.tokenizer = get_tokenizer(tokenizer_name=self.model_path)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path, pad_token_id=self.tokenizer.pad_token_id, torch_dtype=torch.float16
        )

        # Shard the model for tensor parallelism and wrap it in the inference engine
        shard_config = ShardConfig(
            enable_tensor_parallelism=True if world_size > 1 else False, extra_kwargs={"inference_only": True}
        )
        self.infer_engine = TPInferEngine(
            self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
        )
        self.start_dynamic_batching = start_dynamic_batching(self.router_config, self.infer_engine, [])

        return True

    # def generate(self, request_id: str, prompt: str, sampling_params: SamplingParams) -> List[str]:
    #     ray_serve_logger.info(f"text: {prompt}")

    #     final_outputs = self.start_dynamic_batching.generate(prompt, sampling_params, request_id)

    #     return final_outputs

    def add_input(self, request_id: str, prompt: str, sampling_params: SamplingParams):
        # Queue a raw-text request for dynamic batching
        self.start_dynamic_batching.add_input(request_id, prompt, sampling_params)

    def abort(self, request_id: str):
        self.start_dynamic_batching.abort(request_id)

    def step(self) -> List[RequestOutput]:
        # Advance the dynamic batcher by one iteration and return its outputs
        return self.start_dynamic_batching._step()

    def add_req(self, prompt_ids: List[int], sampling_params: SamplingParams, request_id: str, prompt: str):
        # Queue a request whose prompt has already been tokenized
        self.start_dynamic_batching.add_req(prompt_ids, sampling_params, request_id, prompt)

    def is_running(self):
        return self.start_dynamic_batching.is_running()

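# Driver: spawns one Worker actor per tensor-parallel rank and fans every
# request-handling call out to all of them, collecting outputs from a single replica.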
class Driver:
    def __init__(self, router_config: RooterArgsClass, engine_config: EngineArgsClass):
        log_cuda_info("Driver:init")
        model_path = engine_config.model
        tensor_parallel_size = engine_config.tensor_parallel_size

        self.num_workers = tensor_parallel_size
        self.workers = []
        init_rets = []

        # Just grab a free port on localhost
        # NOTE workers in this communication group listen to the same port
        available_port = free_port()

        # Spawn one Worker actor per tensor-parallel rank
        for i in range(self.num_workers):
            worker_name = "worker_idx_{}".format(i)
            w = Worker.options(name=worker_name).remote(
                model_path,
                self.num_workers,
                engine_config.max_batch_size,
                engine_config.max_input_len,
                engine_config.max_output_len,
                router_config,
            )
            self.workers.append(w)
            init_rets.append(w.setup.remote(self.num_workers, i, available_port))
        _options = {
            "group_name": "default_driver",
            "world_size": self.num_workers,
            "ranks": [i for i in range(self.num_workers)],
            "backend": "nccl",
        }
        collective.create_collective_group(self.workers, **_options)
        _ = ray.get(init_rets)

    def add_input(self, request_id: str, prompt: str, sampling_params: SamplingParams):
        ray.get([w.add_input.remote(request_id, prompt, sampling_params) for w in self.workers])

    def abort(self, request_id: str):
        ray.get([w.abort.remote(request_id) for w in self.workers])

    def step(self):
        results = ray.get([w.step.remote() for w in self.workers])
        outputs = results[0]  # get any one of the copies
        return outputs

    def add_req(self, request_id: str, prompt_ids: List[int], sampling_params: SamplingParams, prompt: str):
        ray.get([w.add_req.remote(prompt_ids, sampling_params, request_id, prompt) for w in self.workers])

    def is_running(self):
        results = ray.get([w.is_running.remote() for w in self.workers])
        return any(results)
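
# Illustrative usage sketch (not from the original file). It assumes that a
# RooterArgsClass and an EngineArgsClass instance have already been constructed
# elsewhere (e.g. from a serving config) and that SamplingParams accepts default
# construction -- check the colossalai.inference.dynamic_batching definitions
# before relying on either assumption.
#
#   driver = Driver(router_config=router_config, engine_config=engine_config)
#   driver.add_input(request_id="req-0", prompt="Hello, world!", sampling_params=SamplingParams())
#   while driver.is_running():
#       for output in driver.step():
#           ...  # each item is a RequestOutput for a finished or in-flight request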