ColossalAI/applications/Chat/inference/server.py

168 lines
6.4 KiB
Python

import argparse
import os
from threading import Lock
from typing import Dict, Generator, List, Optional
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from llama_gptq import load_quant
from pydantic import BaseModel, Field
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM
from utils import ChatPromptProcessor, Dialogue, LockedIterator, sample_streamingly, update_model_kwargs_fn
CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
MAX_LEN = 512
running_lock = Lock()
class GenerationTaskReq(BaseModel):
max_new_tokens: int = Field(gt=0, le=512, example=64)
history: List[Dialogue] = Field(min_items=1)
top_k: Optional[int] = Field(default=None, gt=0, example=50)
top_p: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.5)
temperature: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.7)
repetition_penalty: Optional[float] = Field(default=None, gt=1.0, example=1.2)
limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
# set CORS
origin_spec_from_env = os.environ.get('CORS_ORIGIN', None)
if origin_spec_from_env is not None:
# allow CORS from the specified origins
origins = os.environ['CORS_ORIGIN'].split(',')
else:
# allow CORS from all origins
origins = ["*"]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
#TODO(ver217): streaming generation does not support repetition_penalty now
model_kwargs = {
'max_generate_tokens': max_new_tokens,
'early_stopping': True,
'top_k': top_k,
'top_p': top_p,
'temperature': temperature,
'prepare_inputs_fn': model.prepare_inputs_for_generation,
'update_model_kwargs_fn': update_model_kwargs_fn,
}
is_first_word = True
generator = LockedIterator(sample_streamingly(model, **inputs, **model_kwargs), running_lock)
for output in generator:
output = output.cpu()
tokens = tokenizer.convert_ids_to_tokens(output, skip_special_tokens=True)
current_sub_tokens = []
for token in tokens:
if token in tokenizer.all_special_tokens:
continue
current_sub_tokens.append(token)
if current_sub_tokens:
out_string = tokenizer.sp_model.decode(current_sub_tokens)
if is_first_word:
out_string = out_string.lstrip()
is_first_word = False
elif current_sub_tokens[0].startswith(''):
# whitespace will be ignored by the frontend
out_string = ' ' + out_string
yield out_string
async def event_generator(request: Request, generator: Generator):
while True:
if await request.is_disconnected():
break
try:
yield {'event': 'generate', 'data': next(generator)}
except StopIteration:
yield {'event': 'end', 'data': ''}
break
@app.post('/generate/stream')
@limiter.limit('1/second')
def generate(data: GenerationTaskReq, request: Request):
prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
event_source = event_generator(
request, generate_streamingly(prompt, data.max_new_tokens, data.top_k, data.top_p, data.temperature))
return EventSourceResponse(event_source)
@app.post('/generate')
@limiter.limit('1/second')
def generate_no_stream(data: GenerationTaskReq, request: Request):
prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
with running_lock:
output = model.generate(**inputs, **data.dict(exclude={'history'}))
output = output.cpu()
prompt_len = inputs['input_ids'].size(1)
response = output[0, prompt_len:]
out_string = tokenizer.decode(response, skip_special_tokens=True)
return prompt_processor.postprocess_output(out_string)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'pretrained',
help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
parser.add_argument('--quant',
choices=['8bit', '4bit'],
default=None,
help='Quantization mode. Default: None (no quantization, fp16).')
parser.add_argument(
'--gptq_checkpoint',
default=None,
help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
parser.add_argument('--gptq_group_size',
type=int,
default=128,
help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
parser.add_argument('--http_host', default='0.0.0.0')
parser.add_argument('--http_port', type=int, default=7070)
args = parser.parse_args()
if args.quant == '4bit':
assert args.gptq_checkpoint is not None, 'Please specify a GPTQ checkpoint.'
tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
prompt_processor = ChatPromptProcessor(tokenizer, CONTEXT, MAX_LEN)
if args.quant == '4bit':
model = load_quant(args.pretrained, args.gptq_checkpoint, 4, args.gptq_group_size)
model.cuda()
else:
model = LlamaForCausalLM.from_pretrained(
args.pretrained,
load_in_8bit=(args.quant == '8bit'),
torch_dtype=torch.float16,
device_map="auto",
)
if args.quant != '8bit':
model.half() # seems to fix bugs for some users.
model.eval()
config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
server = uvicorn.Server(config=config)
server.run()