mirror of https://github.com/hpcaitech/ColossalAI
[Fix/Inference] Fix vllm benchmark (#5630)

* Fix bugs about OOM when running vllm-0.4.0
* rm unused params
* change generation_config
* change benchmark log file name

pull/5650/head
parent 279300dc5f
commit 90cd5227a3
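The heart of the OOM fix shows up in the first hunk below: in vllm mode the benchmark no longer builds a separate HuggingFace copy of the Llama weights, because the vLLM engine already allocates its own weights and KV-cache blocks on the GPU. A minimal sketch of that gating pattern follows; the helper name build_hf_model_if_needed and the bare mode/model_path/dtype arguments are illustrative stand-ins for the benchmark's args fields, not code from the commit.

import torch
import transformers
from transformers import AutoTokenizer


def build_hf_model_if_needed(mode: str, model_path: str, dtype: str = "fp16"):
    """Build a HuggingFace Llama model only for non-vllm modes; vLLM loads its own weights."""
    if mode == "vllm":
        # Nothing to build here: the vLLM engine (constructed elsewhere in the benchmark)
        # already owns the GPU memory for weights and KV cache.
        return None, None
    model = transformers.LlamaForCausalLM.from_pretrained(model_path).cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = model.eval()
    if dtype == "fp16":
        model = model.half()
    elif dtype == "bf16":
        model = model.to(torch.bfloat16)
    return model, tokenizer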
@@ -105,20 +105,28 @@ def benchmark_inference(args):
     with torch.no_grad():
         config = CONFIG_MAP[args.model]
         config.pad_token_id = config.eos_token_id
-        if args.test_random_weight:
-            model = transformers.LlamaForCausalLM(config)
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        else:
-            assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
-            model = transformers.LlamaForCausalLM.from_pretrained(args.model_path)
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+        if args.mode != "vllm":
+            if args.test_random_weight:
+                model = transformers.LlamaForCausalLM(config).cuda()
+                tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+            else:
+                assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
+                model = transformers.LlamaForCausalLM.from_pretrained(args.model_path).cuda()
+                tokenizer = AutoTokenizer.from_pretrained(args.model_path)
 
-        model = model.eval()
+            model = model.eval()
 
-        if args.dtype == "fp16":
-            model = model.half()
-        elif args.dtype == "bf16":
-            model = model.to(torch.bfloat16)
+            if args.dtype == "fp16":
+                model = model.half()
+            elif args.dtype == "bf16":
+                model = model.to(torch.bfloat16)
+
+            generation_config = GenerationConfig(
+                pad_token_id=tokenizer.pad_token_id,
+                max_length=args.seq_len + args.output_len,
+                # max_new_tokens=args.max_output_len,
+            )
 
         if args.continous_batching:
             mbsz = args.mbsz
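For context on the generation_config change: transformers' GenerationConfig treats max_length as the total token budget (prompt plus generated tokens), which is why the benchmark sets it to args.seq_len + args.output_len rather than to the output length alone. A small standalone example with arbitrary values:

from transformers import GenerationConfig

seq_len, output_len = 1024, 128          # prompt length and desired new tokens
generation_config = GenerationConfig(
    pad_token_id=2,                      # typically tokenizer.pad_token_id / eos_token_id
    max_length=seq_len + output_len,     # total budget: prompt + generated tokens
    # max_new_tokens=output_len,         # alternative: budget for generated tokens only
)
print(generation_config.max_length)      # 1152

The commented-out max_new_tokens line mirrors the alternative kept in the source, which would cap only the generated tokens instead of the whole sequence.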
@@ -156,12 +164,6 @@ def benchmark_inference(args):
     if args.mode == "colossalai" or args.mode == "vllm":
         data = data.tolist()
 
-    generation_config = GenerationConfig(
-        pad_token_id=tokenizer.pad_token_id,
-        max_length=args.seq_len + args.output_len,
-        # max_new_tokens=args.output_len,
-    )
-
     N_WARMUP_STEPS = 2
 
     ctx = (
@@ -225,7 +227,7 @@ def benchmark_inference(args):
         if args.profile:
             ctx.step()
     print(f"config:batch_size {args.batch_size}, input_len{ args.seq_len}, output_len {args.output_len}")
-    print_details_info(model.config, args, whole_end2end, total_token_num)
+    print_details_info(config, args, whole_end2end, total_token_num)
 
 
 def hybrid_inference(rank, world_size, port, args):
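The switch from print_details_info(model.config, ...) to print_details_info(config, ...) follows directly from the first hunk: in vllm mode the HuggingFace model object is never created, so model.config would raise a NameError. A tiny illustration of that failure mode; print_details_info_stub is a hypothetical stand-in for the example's real helper:

config = {"hidden_size": 4096}   # stands in for CONFIG_MAP[args.model]
mode = "vllm"


def print_details_info_stub(cfg, *rest):
    # hypothetical stand-in for the benchmark's print_details_info helper
    print("reporting with config:", cfg)


if mode != "vllm":
    model = object()             # the HF model is only built for non-vllm modes

# print_details_info_stub(model.config)  # NameError: 'model' is not defined in vllm mode
print_details_info_stub(config)           # works for every mode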
@@ -106,9 +106,9 @@ def benchmark_inference(args):
 
     config = CONFIG_MAP[args.model]
     config.pad_token_id = config.eos_token_id
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     if args.model_path is not None:
         model = transformers.LlamaForCausalLM.from_pretrained(args.model_path)
+        tokenizer = AutoTokenizer.from_pretrained(args.model_path)
     else:
         # Random weights
         model = transformers.LlamaForCausalLM(config)
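The hunk above comes from a second benchmark script and applies the same matching-tokenizer rule: when a checkpoint path is given, the tokenizer is loaded from that path instead of always falling back to the hf-internal-testing/llama-tokenizer stub, so vocabulary and special tokens match the weights. A sketch of the selection logic, assuming model_path points at a local Llama checkpoint when set:

from transformers import AutoTokenizer


def load_llama_tokenizer(model_path=None):
    # Prefer the tokenizer shipped with the checkpoint so its vocab and special tokens
    # match the weights; keep the lightweight test tokenizer only for random-weight runs.
    if model_path is not None:
        return AutoTokenizer.from_pretrained(model_path)
    return AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")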
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES_set_n_least_memory_usage 1
 for input_len in 128 512 1024; do
     for output_len in 128 256; do
         for bsz in 16 32 64; do
-            python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${input_len}_${output_len}_${mode}_${GPU}_${bsz}.txt
+            python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${bsz}_${input_len}_${output_len}_${mode}_${GPU}.txt
         done
     done
 done
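The shell hunk drops the unused --pp_size 1 flag (the "rm unused params" bullet) and reorders the log file name so runs group by batch size first: logs/${bsz}_${input_len}_${output_len}_${mode}_${GPU}.txt. A small sketch of parsing that new name back into its fields; the concrete path, including the A100 GPU tag, is only an example value:

from pathlib import Path


def parse_log_name(path: str):
    # new naming scheme: logs/${bsz}_${input_len}_${output_len}_${mode}_${GPU}.txt
    bsz, input_len, output_len, mode, gpu = Path(path).stem.split("_", 4)
    return {"bsz": int(bsz), "input_len": int(input_len),
            "output_len": int(output_len), "mode": mode, "gpu": gpu}


print(parse_log_name("logs/16_128_128_vllm_A100.txt"))
# {'bsz': 16, 'input_len': 128, 'output_len': 128, 'mode': 'vllm', 'gpu': 'A100'}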