import argparse
import os
from threading import Lock
from typing import Dict, Generator, List, Optional

import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from llama_gptq import load_quant
from pydantic import BaseModel, Field
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address
from sse_starlette.sse import EventSourceResponse
from transformers import AutoTokenizer, GenerationConfig, LlamaForCausalLM
from utils import ChatPromptProcessor, Dialogue, LockedIterator, sample_streamingly, update_model_kwargs_fn

CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
MAX_LEN = 2048
running_lock = Lock()


class GenerationTaskReq(BaseModel):
    max_new_tokens: int = Field(gt=0, le=512, example=64)
    history: List[Dialogue] = Field(min_items=1)
    top_k: Optional[int] = Field(default=None, gt=0, example=50)
    top_p: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.5)
    temperature: Optional[float] = Field(default=None, gt=0.0, lt=1.0, example=0.7)


limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# set CORS
origin_spec_from_env = os.environ.get('CORS_ORIGIN', None)

if origin_spec_from_env is not None:
    # allow CORS from the specified origins
    origins = origin_spec_from_env.split(',')
else:
    # allow CORS from all origins
    origins = ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def generate_streamingly(prompt, max_new_tokens, top_k, top_p, temperature):
    inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
    model_kwargs = {
        'max_generate_tokens': max_new_tokens,
        'early_stopping': True,
        'top_k': top_k,
        'top_p': top_p,
        'temperature': temperature,
        'prepare_inputs_fn': model.prepare_inputs_for_generation,
        'update_model_kwargs_fn': update_model_kwargs_fn,
    }
    is_first_word = True
    # LockedIterator serializes concurrent requests so only one of them drives the shared model at a time.
    generator = LockedIterator(sample_streamingly(model, **inputs, **model_kwargs), running_lock)
    for output in generator:
        output = output.cpu()
        tokens = tokenizer.convert_ids_to_tokens(output, skip_special_tokens=True)
        current_sub_tokens = []
        for token in tokens:
            if token in tokenizer.all_special_tokens:
                continue
            current_sub_tokens.append(token)
        if current_sub_tokens:
            out_string = tokenizer.sp_model.decode(current_sub_tokens)
            if is_first_word:
                out_string = out_string.lstrip()
                is_first_word = False
            elif current_sub_tokens[0].startswith('▁'):
                # whitespace will be ignored by the frontend
                out_string = ' ' + out_string
            yield out_string


async def event_generator(request: Request, generator: Generator):
    # bridge the blocking token generator into SSE events, stopping early if the client disconnects
    while True:
        if await request.is_disconnected():
            break
        try:
            yield {'event': 'generate', 'data': next(generator)}
        except StopIteration:
            yield {'event': 'end', 'data': ''}
            break


@app.post('/generate/stream')
@limiter.limit('1/second')
def generate(data: GenerationTaskReq, request: Request):
    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
    event_source = event_generator(
        request, generate_streamingly(prompt, data.max_new_tokens, data.top_k, data.top_p, data.temperature))
    return EventSourceResponse(event_source)

@app.post('/generate')
@limiter.limit('1/second')
def generate_no_stream(data: GenerationTaskReq, request: Request):
    prompt = prompt_processor.preprocess_prompt(data.history, data.max_new_tokens)
    inputs = {k: v.cuda() for k, v in tokenizer(prompt, return_tensors="pt").items()}
    with running_lock:
        output = model.generate(**inputs, **data.dict(exclude={'history'}))
    output = output.cpu()
    # decode only the newly generated tokens, i.e. everything after the prompt
    prompt_len = inputs['input_ids'].size(1)
    response = output[0, prompt_len:]
    out_string = tokenizer.decode(response, skip_special_tokens=True)
    return out_string.lstrip()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'pretrained',
        help='Path to pretrained model. Can be a local path or a model name from the HuggingFace model hub.')
    parser.add_argument('--quant',
                        choices=['8bit', '4bit'],
                        default=None,
                        help='Quantization mode. Default: None (no quantization, fp16).')
    parser.add_argument(
        '--gptq_checkpoint',
        default=None,
        help='Path to GPTQ checkpoint. This is only useful when quantization mode is 4bit. Default: None.')
    parser.add_argument('--gptq_group_size',
                        type=int,
                        default=128,
                        help='Group size for GPTQ. This is only useful when quantization mode is 4bit. Default: 128.')
    parser.add_argument('--http_host', default='0.0.0.0')
    parser.add_argument('--http_port', type=int, default=7070)
    args = parser.parse_args()

    if args.quant == '4bit':
        assert args.gptq_checkpoint is not None, 'Please specify a GPTQ checkpoint.'

    tokenizer = AutoTokenizer.from_pretrained(args.pretrained)
    prompt_processor = ChatPromptProcessor(tokenizer, CONTEXT, MAX_LEN)

    if args.quant == '4bit':
        model = load_quant(args.pretrained, args.gptq_checkpoint, 4, args.gptq_group_size)
        model.cuda()
    else:
        model = LlamaForCausalLM.from_pretrained(
            args.pretrained,
            load_in_8bit=(args.quant == '8bit'),
            torch_dtype=torch.float16,
            device_map="auto",
        )
        if args.quant != '8bit':
            model.half()  # seems to fix bugs for some users.
    model.eval()

    config = uvicorn.Config(app, host=args.http_host, port=args.http_port)
    server = uvicorn.Server(config=config)
    server.run()
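
# A minimal usage sketch (not part of the server itself). It assumes the script
# is saved as server.py and that `Dialogue` from utils.py accepts
# "instruction"/"response" fields; adjust both to your setup.
#
#   python server.py /path/to/llama-checkpoint --quant 8bit
#
#   curl -X POST http://localhost:7070/generate \
#       -H 'Content-Type: application/json' \
#       -d '{"max_new_tokens": 64, "history": [{"instruction": "Hello", "response": ""}]}'
#
# The streaming endpoint at /generate/stream accepts the same payload and
# responds with server-sent events: 'generate' events carrying text chunks,
# followed by a final 'end' event. Both endpoints are rate-limited to one
# request per second per client IP.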