ColossalAI/colossalai/inference/server/api_server.py

"""
Doc:
    Feature:
    - FastAPI based http server for Colossal-Inference
    - Completion Service Supported
    Usage: (for local user)
    - First, Lauch an API locally. `python3 -m colossalai.inference.server.api_server  --model path of your llama2 model`
    - Second, you can turn to the page `http://127.0.0.1:8000/docs` to check the api
    - For completion service, you can invoke it by using `curl -X POST  http://127.0.0.1:8000/v1/completion  \
         -H 'Content-Type: application/json' \
         -d '{"prompt":"hello, who are you? ","stream":"False"}'`
"""

import argparse
import json

import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from transformers import AutoModelForCausalLM, AutoTokenizer

from colossalai.inference.config import InferenceConfig
from colossalai.inference.server.chat_service import ChatServing
from colossalai.inference.server.completion_service import CompletionServing
from colossalai.inference.server.utils import id_generator

from colossalai.inference.core.async_engine import AsyncInferenceEngine, InferenceEngine  # noqa

TIMEOUT_KEEP_ALIVE = 5  # seconds.
supported_models_dict = {"Llama_Models": ("llama2-7b",)}
prompt_template_choices = ["llama", "vicuna"]
async_engine = None
chat_serving = None
completion_serving = None

app = FastAPI()


@app.get("/v0/models")
def get_available_models() -> Response:
    return JSONResponse(supported_models_dict)


@app.post("/generate")
async def generate(request: Request) -> Response:
    """Generate completion for the request.

    A request should be a JSON object with the following fields:
    - prompts: the prompts to use for the generation.
    - stream: whether to stream the results or not.
    - other fields:
    """
    request_dict = await request.json()
    prompt = request_dict.pop("prompt")
    stream = request_dict.pop("stream", "false").lower()

    request_id = id_generator()
    generation_config = get_generation_config(request_dict)
    results = engine.generate(request_id, prompt, generation_config=generation_config)

    # Streaming case
    def stream_results():
        for request_output in results:
            ret = {"text": request_output[len(prompt) :]}
            yield (json.dumps(ret) + "\0").encode("utf-8")

    if stream == "true":
        return StreamingResponse(stream_results())

    # Non-streaming case
    final_output = None
    for request_output in results:
        if request.is_disconnected():
            # Abort the request if the client disconnects.
            engine.abort(request_id)
            return Response(status_code=499)
        final_output = request_output[len(prompt) :]

    assert final_output is not None
    ret = {"text": final_output}
    return JSONResponse(ret)


@app.post("/v1/completion")
async def create_completion(request: Request):
    request_dict = await request.json()
    stream = request_dict.pop("stream", "false").lower()
    generation_config = get_generation_config(request_dict)
    result = await completion_serving.create_completion(request, generation_config)

    ret = {"request_id": result.request_id, "text": result.output}
    if stream == "true":
        return StreamingResponse(content=json.dumps(ret) + "\0", media_type="text/event-stream")
    else:
        return JSONResponse(content=ret)


@app.post("/v1/chat")
async def create_chat(request: Request):
    request_dict = await request.json()

    stream = request_dict.get("stream", "false").lower()
    generation_config = get_generation_config(request_dict)
    message = await chat_serving.create_chat(request, generation_config)
    if stream == "true":
        return StreamingResponse(content=message, media_type="text/event-stream")
    else:
        ret = {"role": message.role, "text": message.content}
    return ret


def get_generation_config(request):
    generation_config = async_engine.engine.generation_config
    for arg in request:
        if hasattr(generation_config, arg):
            generation_config[arg] = request[arg]
    return generation_config


def add_engine_config(parser):
    parser.add_argument("--model", type=str, default="llama2-7b", help="name or path of the huggingface model to use")

    parser.add_argument(
        "--max-model-len",
        type=int,
        default=None,
        help="model context length. If unspecified, " "will be automatically derived from the model.",
    )
    # Parallel arguments
    parser.add_argument(
        "--worker-use-ray",
        action="store_true",
        help="use Ray for distributed serving, will be " "automatically set when using more than 1 GPU",
    )

    parser.add_argument("--pipeline-parallel-size", "-pp", type=int, default=1, help="number of pipeline stages")

    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1, help="number of tensor parallel replicas")

    # KV cache arguments
    parser.add_argument("--block-size", type=int, default=16, choices=[8, 16, 32], help="token block size")

    parser.add_argument("--max_batch_size", type=int, default=8, help="maximum number of batch size")

    # generation arguments
    parser.add_argument(
        "--prompt_template",
        choices=prompt_template_choices,
        default=None,
        help=f"Allowed choices are {','.join(prompt_template_choices)}. Default to None.",
    )

    # Quantization settings.
    parser.add_argument(
        "--quantization",
        "-q",
        type=str,
        choices=["awq", "gptq", "squeezellm", None],
        default=None,
        help="Method used to quantize the weights. If "
        "None, we first check the `quantization_config` "
        "attribute in the model config file. If that is "
        "None, we assume the model weights are not "
        "quantized and use `dtype` to determine the data "
        "type of the weights.",
    )
    parser.add_argument(
        "--enforce-eager",
        action="store_true",
        help="Always use eager-mode PyTorch. If False, "
        "will use eager mode and CUDA graph in hybrid "
        "for maximal performance and flexibility.",
    )
    return parser


def parse_args():
    parser = argparse.ArgumentParser(description="Colossal-Inference API server.")

    parser.add_argument("--host", type=str, default="127.0.0.1")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--ssl-keyfile", type=str, default=None)
    parser.add_argument("--ssl-certfile", type=str, default=None)
    parser.add_argument(
        "--root-path", type=str, default=None, help="FastAPI root_path when app is behind a path based routing proxy"
    )
    parser.add_argument(
        "--model-name",
        type=str,
        default=None,
        help="The model name used in the API. If not "
        "specified, the model name will be the same as "
        "the huggingface name.",
    )
    parser.add_argument(
        "--chat-template",
        type=str,
        default=None,
        help="The file path to the chat template, " "or the template in single-line form " "for the specified model",
    )
    parser.add_argument(
        "--response-role",
        type=str,
        default="assistant",
        help="The role name to return if " "`request.add_generation_prompt=true`.",
    )
    parser = add_engine_config(parser)

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    inference_config = InferenceConfig.from_dict(vars(args))
    model = AutoModelForCausalLM.from_pretrained(args.model)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    async_engine = AsyncInferenceEngine(
        start_engine_loop=True, model=model, tokenizer=tokenizer, inference_config=inference_config
    )
    engine = async_engine.engine
    completion_serving = CompletionServing(async_engine, served_model=model.__class__.__name__)
    chat_serving = ChatServing(
        async_engine,
        served_model=model.__class__.__name__,
        tokenizer=tokenizer,
        response_role=args.response_role,
        chat_template=args.chat_template,
    )
    app.root_path = args.root_path
    uvicorn.run(
        app=app,
        host=args.host,
        port=args.port,
        log_level="debug",
        timeout_keep_alive=TIMEOUT_KEEP_ALIVE,
        ssl_keyfile=args.ssl_keyfile,
        ssl_certfile=args.ssl_certfile,
    )
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`"""`
			`Doc:`
			`Feature:`
			`- FastAPI based http server for Colossal-Inference`
			`- Completion Service Supported`
			`Usage: (for local user)`
			- First, Lauch an API locally. `python3 -m colossalai.inference.server.api_server --model path of your llama2 model`
			- Second, you can turn to the page `http://127.0.0.1:8000/docs` to check the api
			- For completion service, you can invoke it by using `curl -X POST http://127.0.0.1:8000/v1/completion \
			`-H 'Content-Type: application/json' \`
			-d '{"prompt":"hello, who are you? ","stream":"False"}'`
			`"""`

			`import argparse`
			`import json`

			`import uvicorn`
			`from fastapi import FastAPI, Request`
			`from fastapi.responses import JSONResponse, Response, StreamingResponse`
			`from transformers import AutoModelForCausalLM, AutoTokenizer`

			`from colossalai.inference.config import InferenceConfig`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`from colossalai.inference.server.chat_service import ChatServing`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`from colossalai.inference.server.completion_service import CompletionServing`
			`from colossalai.inference.server.utils import id_generator`

			`from colossalai.inference.core.async_engine import AsyncInferenceEngine, InferenceEngine # noqa`

			`TIMEOUT_KEEP_ALIVE = 5 # seconds.`
			`supported_models_dict = {"Llama_Models": ("llama2-7b",)}`
			`prompt_template_choices = ["llama", "vicuna"]`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`async_engine = None`
			`chat_serving = None`
			`completion_serving = None`

			`app = FastAPI()`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00

			`@app.get("/v0/models")`
			`def get_available_models() -> Response:`
			`return JSONResponse(supported_models_dict)`


			`@app.post("/generate")`
			`async def generate(request: Request) -> Response:`
			`"""Generate completion for the request.`

			`A request should be a JSON object with the following fields:`
			`- prompts: the prompts to use for the generation.`
			`- stream: whether to stream the results or not.`
			`- other fields:`
			`"""`
			`request_dict = await request.json()`
			`prompt = request_dict.pop("prompt")`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`stream = request_dict.pop("stream", "false").lower()`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00
			`request_id = id_generator()`
			`generation_config = get_generation_config(request_dict)`
			`results = engine.generate(request_id, prompt, generation_config=generation_config)`

			`# Streaming case`
			`def stream_results():`
			`for request_output in results:`
[Inference] Finish Online Serving Test, add streaming output api, continuous batching test and example (#5432) * finish online test and add examples * fix test_contionus_batching * fix some bugs * fix bash * fix * fix inference * finish revision * fix typos * revision 2024-03-18 09:06:05 +00:00			`ret = {"text": request_output[len(prompt) :]}`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`yield (json.dumps(ret) + "\0").encode("utf-8")`

[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`if stream == "true":`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`return StreamingResponse(stream_results())`

			`# Non-streaming case`
			`final_output = None`
			`for request_output in results:`
			`if request.is_disconnected():`
			`# Abort the request if the client disconnects.`
			`engine.abort(request_id)`
			`return Response(status_code=499)`
[Inference] Finish Online Serving Test, add streaming output api, continuous batching test and example (#5432) * finish online test and add examples * fix test_contionus_batching * fix some bugs * fix bash * fix * fix inference * finish revision * fix typos * revision 2024-03-18 09:06:05 +00:00			`final_output = request_output[len(prompt) :]`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00
			`assert final_output is not None`
			`ret = {"text": final_output}`
			`return JSONResponse(ret)`


			`@app.post("/v1/completion")`
			`async def create_completion(request: Request):`
			`request_dict = await request.json()`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`stream = request_dict.pop("stream", "false").lower()`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`generation_config = get_generation_config(request_dict)`
[Inference] Finish Online Serving Test, add streaming output api, continuous batching test and example (#5432) * finish online test and add examples * fix test_contionus_batching * fix some bugs * fix bash * fix * fix inference * finish revision * fix typos * revision 2024-03-18 09:06:05 +00:00			`result = await completion_serving.create_completion(request, generation_config)`

			`ret = {"request_id": result.request_id, "text": result.output}`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`if stream == "true":`
[Inference] Finish Online Serving Test, add streaming output api, continuous batching test and example (#5432) * finish online test and add examples * fix test_contionus_batching * fix some bugs * fix bash * fix * fix inference * finish revision * fix typos * revision 2024-03-18 09:06:05 +00:00			`return StreamingResponse(content=json.dumps(ret) + "\0", media_type="text/event-stream")`
			`else:`
			`return JSONResponse(content=ret)`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00

[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`@app.post("/v1/chat")`
			`async def create_chat(request: Request):`
			`request_dict = await request.json()`

			`stream = request_dict.get("stream", "false").lower()`
			`generation_config = get_generation_config(request_dict)`
			`message = await chat_serving.create_chat(request, generation_config)`
			`if stream == "true":`
			`return StreamingResponse(content=message, media_type="text/event-stream")`
			`else:`
			`ret = {"role": message.role, "text": message.content}`
			`return ret`


[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`def get_generation_config(request):`
			`generation_config = async_engine.engine.generation_config`
			`for arg in request:`
			`if hasattr(generation_config, arg):`
			`generation_config[arg] = request[arg]`
			`return generation_config`


			`def add_engine_config(parser):`
			`parser.add_argument("--model", type=str, default="llama2-7b", help="name or path of the huggingface model to use")`

			`parser.add_argument(`
			`"--max-model-len",`
			`type=int,`
			`default=None,`
			`help="model context length. If unspecified, " "will be automatically derived from the model.",`
			`)`
			`# Parallel arguments`
			`parser.add_argument(`
			`"--worker-use-ray",`
			`action="store_true",`
			`help="use Ray for distributed serving, will be " "automatically set when using more than 1 GPU",`
			`)`

			`parser.add_argument("--pipeline-parallel-size", "-pp", type=int, default=1, help="number of pipeline stages")`

			`parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1, help="number of tensor parallel replicas")`

			`# KV cache arguments`
			`parser.add_argument("--block-size", type=int, default=16, choices=[8, 16, 32], help="token block size")`

			`parser.add_argument("--max_batch_size", type=int, default=8, help="maximum number of batch size")`

			`# generation arguments`
			`parser.add_argument(`
			`"--prompt_template",`
			`choices=prompt_template_choices,`
			`default=None,`
			`help=f"Allowed choices are {','.join(prompt_template_choices)}. Default to None.",`
			`)`

			`# Quantization settings.`
			`parser.add_argument(`
			`"--quantization",`
			`"-q",`
			`type=str,`
			`choices=["awq", "gptq", "squeezellm", None],`
			`default=None,`
			`help="Method used to quantize the weights. If "`
			"None, we first check the `quantization_config` "
			`"attribute in the model config file. If that is "`
			`"None, we assume the model weights are not "`
			"quantized and use `dtype` to determine the data "
			`"type of the weights.",`
			`)`
			`parser.add_argument(`
			`"--enforce-eager",`
			`action="store_true",`
			`help="Always use eager-mode PyTorch. If False, "`
			`"will use eager mode and CUDA graph in hybrid "`
			`"for maximal performance and flexibility.",`
			`)`
			`return parser`


			`def parse_args():`
			`parser = argparse.ArgumentParser(description="Colossal-Inference API server.")`

			`parser.add_argument("--host", type=str, default="127.0.0.1")`
			`parser.add_argument("--port", type=int, default=8000)`
			`parser.add_argument("--ssl-keyfile", type=str, default=None)`
			`parser.add_argument("--ssl-certfile", type=str, default=None)`
			`parser.add_argument(`
			`"--root-path", type=str, default=None, help="FastAPI root_path when app is behind a path based routing proxy"`
			`)`
			`parser.add_argument(`
			`"--model-name",`
			`type=str,`
			`default=None,`
			`help="The model name used in the API. If not "`
			`"specified, the model name will be the same as "`
			`"the huggingface name.",`
			`)`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`parser.add_argument(`
			`"--chat-template",`
			`type=str,`
			`default=None,`
			`help="The file path to the chat template, " "or the template in single-line form " "for the specified model",`
			`)`
			`parser.add_argument(`
			`"--response-role",`
			`type=str,`
			`default="assistant",`
			help="The role name to return if " "`request.add_generation_prompt=true`.",
			`)`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`parser = add_engine_config(parser)`

			`return parser.parse_args()`


			`if __name__ == "__main__":`
			`args = parse_args()`
			`inference_config = InferenceConfig.from_dict(vars(args))`
			`model = AutoModelForCausalLM.from_pretrained(args.model)`
			`tokenizer = AutoTokenizer.from_pretrained(args.model)`
			`async_engine = AsyncInferenceEngine(`
			`start_engine_loop=True, model=model, tokenizer=tokenizer, inference_config=inference_config`
			`)`
			`engine = async_engine.engine`
			`completion_serving = CompletionServing(async_engine, served_model=model.__class__.__name__)`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`chat_serving = ChatServing(`
			`async_engine,`
			`served_model=model.__class__.__name__,`
			`tokenizer=tokenizer,`
			`response_role=args.response_role,`
			`chat_template=args.chat_template,`
			`)`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`app.root_path = args.root_path`
			`uvicorn.run(`
[Online Server] Chat Api for streaming and not streaming response (#5470) * fix bugs * fix bugs * fix api server * fix api server * add chat api and test * del request.n 2024-04-07 06:45:43 +00:00			`app=app,`
[Inference] ADD async and sync Api server using FastAPI (#5396) * add api server * fix * add * add completion service and fix bug * add generation config * revise shardformer * fix bugs * add docstrings and fix some bugs * fix bugs and add choices for prompt template 2024-03-01 06:47:36 +00:00			`host=args.host,`
			`port=args.port,`
			`log_level="debug",`
			`timeout_keep_alive=TIMEOUT_KEEP_ALIVE,`
			`ssl_keyfile=args.ssl_keyfile,`
			`ssl_certfile=args.ssl_certfile,`
			`)`