mirror of https://github.com/hpcaitech/ColossalAI
168 lines
6.4 KiB
Python
168 lines
6.4 KiB
Python
import argparse
|
|
import os
|
|
from threading import Lock
|
|
from typing import Dict, Generator, List, Optional
|
|
|
|
import torch
|
|
import uvicorn
|
|
from fastapi import FastAPI, HTTPException, Request
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from llama_gptq import load_quant
|
|
from pydantic import BaseModel, Field
|
|
from slowapi import Limiter, _rate_limit_exceeded_handler
|
|
from slowapi.errors import RateLimitExceeded
|
|
from slowapi.util import get_remote_address
|
|
from sse_starlette.sse import EventSourceResponse
|
|
from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM
|
|
from utils import ChatPromptProcessor, Dialogue, LockedIterator, sample_streamingly, update_model_kwargs_fn
|
|
|
|
# Instruction text passed to ChatPromptProcessor when it is constructed.
CONTEXT = (
    'Below is an instruction that describes a task. '
    'Write a response that appropriately completes the request. '
    'Do not generate new instructions.'
)
# Length limit passed to ChatPromptProcessor together with CONTEXT.
MAX_LEN = 512
# Guards model access: held around model.generate() in the non-streaming
# endpoint and handed to LockedIterator for the streaming one.
running_lock = Lock()
|
|
|
|
|
|
# Request body shared by the '/generate' and '/generate/stream' endpoints.
# NOTE: documented with comments instead of a class docstring on purpose;
# pydantic would otherwise surface the docstring in the generated schema.
class GenerationTaskReq(BaseModel):
    # Number of tokens to generate: 1 <= max_new_tokens <= 512.
    max_new_tokens: int = Field(
        gt=0,
        le=512,
        example=64,
    )
    # Conversation so far; at least one Dialogue entry is required.
    history: List[Dialogue] = Field(
        min_items=1,
    )
    # Optional sampling knobs; each defaults to None when omitted.
    # top_k must be positive.
    top_k: Optional[int] = Field(
        default=None,
        gt=0,
        example=50,
    )
    # top_p must lie strictly between 0 and 1.
    top_p: Optional[float] = Field(
        default=None,
        gt=0.0,
        lt=1.0,
        example=0.5,
    )
    # temperature must lie strictly between 0 and 1.
    temperature: Optional[float] = Field(
        default=None,
        gt=0.0,
        lt=1.0,
        example=0.7,
    )
    # repetition_penalty must be strictly greater than 1.
    repetition_penalty: Optional[float] = Field(
        default=None,
        gt=1.0,
        example=1.2,
    )
|
|
|
|
|
|
# Application object plus a rate limiter keyed on the client's remote address.
app = FastAPI()
limiter = Limiter(key_func=get_remote_address)

# Expose the limiter on the application state and register the handler that
# turns RateLimitExceeded into a response.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
|
|
|
|
# set CORS
# CORS_ORIGIN is a comma-separated list of allowed origins. When the variable
# is not set at all, every origin is allowed.
origin_spec_from_env = os.environ.get('CORS_ORIGIN')

if origin_spec_from_env is not None:
    # allow CORS from the specified origins. Surrounding whitespace and empty
    # entries (e.g. from "a, b" or a trailing comma) are dropped so that every
    # entry can match a request's Origin header exactly.
    origins = [
        origin.strip()
        for origin in origin_spec_from_env.split(',')
        if origin.strip()
    ]
else:
    # allow CORS from all origins
    origins = ["*"]

# NOTE(review): in the default case a wildcard origin is combined with
# allow_credentials=True; deployments that rely on cookies or auth headers
# should set CORS_ORIGIN to an explicit list instead.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
|
|
def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
    """Yield pieces of generated text for ``prompt`` as they become available.

    Relies on the module-level ``tokenizer`` and ``model``. The sampling
    iterator is wrapped in ``LockedIterator`` together with ``running_lock``.

    Args:
        prompt: Prompt text to tokenize and feed to the model.
        max_new_tokens: Forwarded to the sampler as ``max_generate_tokens``.
        top_k: Forwarded to the sampler unchanged (may be ``None``).
        top_p: Forwarded to the sampler unchanged (may be ``None``).
        temperature: Forwarded to the sampler unchanged (may be ``None``).

    Yields:
        str: Decoded text for each sampler output that contains at least one
        non-special token.
    """
    encoded = tokenizer(prompt, return_tensors="pt")
    inputs = {name: tensor.cuda() for name, tensor in encoded.items()}
    #TODO(ver217): streaming generation does not support repetition_penalty now
    model_kwargs = dict(
        max_generate_tokens=max_new_tokens,
        early_stopping=True,
        top_k=top_k,
        top_p=top_p,
        temperature=temperature,
        prepare_inputs_fn=model.prepare_inputs_for_generation,
        update_model_kwargs_fn=update_model_kwargs_fn,
    )
    token_stream = LockedIterator(sample_streamingly(model, **inputs, **model_kwargs), running_lock)

    awaiting_first_piece = True
    for output in token_stream:
        tokens = tokenizer.convert_ids_to_tokens(output.cpu(), skip_special_tokens=True)
        # Drop any special tokens that are still present after conversion.
        pieces = [tok for tok in tokens if tok not in tokenizer.all_special_tokens]
        if not pieces:
            continue

        text = tokenizer.sp_model.decode(pieces)
        if awaiting_first_piece:
            # The very first piece is emitted without leading whitespace.
            text = text.lstrip()
            awaiting_first_piece = False
        elif pieces[0].startswith('▁'):
            # whitespace will be ignored by the frontend
            # (presumably decode() drops the leading word-boundary marker, so
            # the separating space is re-added explicitly -- TODO confirm)
            text = ' ' + text
        yield text
|
|
|
|
|
|
async def event_generator(request: Request, generator: Generator):
    """Turn a synchronous text generator into a stream of event dicts.

    Before every item the client connection is checked; once the client has
    disconnected the stream stops without emitting anything further.

    Args:
        request: Incoming request, used only for ``is_disconnected()``.
        generator: Iterator producing the payload of each 'generate' event.

    Yields:
        dict: ``{'event': 'generate', 'data': <item>}`` per item, followed by
        a single ``{'event': 'end', 'data': ''}`` when the generator is
        exhausted.
    """
    # NOTE(review): next() is called synchronously inside this coroutine, so
    # the event loop is occupied while each item is being produced.
    exhausted = object()
    while not await request.is_disconnected():
        item = next(generator, exhausted)
        if item is exhausted:
            yield {'event': 'end', 'data': ''}
            return
        yield {'event': 'generate', 'data': item}
|
|
|
|
|
|
@app.post('/generate/stream')
@limiter.limit('1/second')
def generate(data: GenerationTaskReq, request: Request):
    """Stream a completion for the given dialogue history as server-sent events.

    Rate limited to one call per second. ``data.repetition_penalty`` is not
    forwarded to the streaming sampler.

    Args:
        data: Validated generation request.
        request: Incoming request; passed on so the event stream can detect a
            client disconnect.

    Returns:
        EventSourceResponse wrapping the event stream.
    """
    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
    text_stream = generate_streamingly(
        prompt,
        data.max_new_tokens,
        data.top_k,
        data.top_p,
        data.temperature,
    )
    return EventSourceResponse(event_generator(request, text_stream))
|
|
|
|
|
|
@app.post('/generate')
@limiter.limit('1/second')
def generate_no_stream(data: GenerationTaskReq, request: Request):
    """Generate a full completion for the dialogue history in one response.

    Rate limited to one call per second. Every request field except
    ``history`` is forwarded to ``model.generate`` as a keyword argument, and
    the call is made while holding ``running_lock``.

    Args:
        data: Validated generation request.
        request: Incoming request; not read in this function body.

    Returns:
        The decoded completion after ``prompt_processor.postprocess_output``.
    """
    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
    encoded = tokenizer(prompt, return_tensors="pt")
    inputs = {name: tensor.cuda() for name, tensor in encoded.items()}
    generation_kwargs = data.dict(exclude={'history'})

    with running_lock:
        output = model.generate(**inputs, **generation_kwargs)

    # The output starts with the prompt tokens; keep only what follows them.
    prompt_len = inputs['input_ids'].size(1)
    completion_ids = output.cpu()[0, prompt_len:]
    out_string = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return prompt_processor.postprocess_output(out_string)
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse and validate the server's command-line options.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``pretrained``, ``quant``, ``gptq_checkpoint``,
        ``gptq_group_size``, ``http_host`` and ``http_port``.

    Raises:
        SystemExit: On invalid arguments, including 4bit quantization
            requested without a GPTQ checkpoint.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'pretrained',
        help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
    parser.add_argument('--quant',
                        choices=['8bit', '4bit'],
                        default=None,
                        help='Quantization mode. Default: None (no quantization, fp16).')
    parser.add_argument(
        '--gptq_checkpoint',
        default=None,
        help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
    parser.add_argument('--gptq_group_size',
                        type=int,
                        default=128,
                        help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
    parser.add_argument('--http_host', default='0.0.0.0')
    parser.add_argument('--http_port', type=int, default=7070)
    args = parser.parse_args(argv)

    # Use parser.error rather than assert: asserts are stripped under `-O`,
    # which would let a missing checkpoint through to load_quant.
    if args.quant == '4bit' and args.gptq_checkpoint is None:
        parser.error('Please specify a GPTQ checkpoint.')
    return args


if __name__ == '__main__':
    args = parse_args()

    # tokenizer, prompt_processor and model are module-level names that the
    # request handlers above read as globals.
    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
    prompt_processor = ChatPromptProcessor(tokenizer, CONTEXT, MAX_LEN)

    if args.quant == '4bit':
        model = load_quant(args.pretrained, args.gptq_checkpoint, 4, args.gptq_group_size)
        model.cuda()
    else:
        model = LlamaForCausalLM.from_pretrained(
            args.pretrained,
            load_in_8bit=(args.quant == '8bit'),
            torch_dtype=torch.float16,
            device_map="auto",
        )
        if args.quant != '8bit':
            model.half()    # seems to fix bugs for some users.
        model.eval()

    config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
    server = uvicorn.Server(config=config)
    server.run()
|