diff --git a/examples/inference/benchmark_llama.py b/examples/inference/benchmark_llama.py
index 1708c615d..a5b295a40 100644
--- a/examples/inference/benchmark_llama.py
+++ b/examples/inference/benchmark_llama.py
@@ -105,20 +105,28 @@ def benchmark_inference(args):
     with torch.no_grad():
         config = CONFIG_MAP[args.model]
         config.pad_token_id = config.eos_token_id
 
-        if args.test_random_weight:
-            model = transformers.LlamaForCausalLM(config)
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        else:
-            assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
-            model = transformers.LlamaForCausalLM.from_pretrained(args.model_path)
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        model = model.eval()
+        if args.mode != "vllm":
+            if args.test_random_weight:
+                model = transformers.LlamaForCausalLM(config).cuda()
+                tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+            else:
+                assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
+                model = transformers.LlamaForCausalLM.from_pretrained(args.model_path).cuda()
+                tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+
+            model = model.eval()
 
-        if args.dtype == "fp16":
-            model = model.half()
-        elif args.dtype == "bf16":
-            model = model.to(torch.bfloat16)
+            if args.dtype == "fp16":
+                model = model.half()
+            elif args.dtype == "bf16":
+                model = model.to(torch.bfloat16)
+
+            generation_config = GenerationConfig(
+                pad_token_id=tokenizer.pad_token_id,
+                max_length=args.seq_len + args.output_len,
+                # max_new_tokens=args.max_output_len,
+            )
 
         if args.continous_batching:
             mbsz = args.mbsz
@@ -156,12 +164,6 @@ def benchmark_inference(args):
         if args.mode == "colossalai" or args.mode == "vllm":
             data = data.tolist()
 
-        generation_config = GenerationConfig(
-            pad_token_id=tokenizer.pad_token_id,
-            max_length=args.seq_len + args.output_len,
-            # max_new_tokens=args.output_len,
-        )
-
         N_WARMUP_STEPS = 2
 
         ctx = (
@@ -225,7 +227,7 @@ def benchmark_inference(args):
         if args.profile:
             ctx.step()
     print(f"config:batch_size {args.batch_size}, input_len{ args.seq_len}, output_len {args.output_len}")
-    print_details_info(model.config, args, whole_end2end, total_token_num)
+    print_details_info(config, args, whole_end2end, total_token_num)
 
 
 def hybrid_inference(rank, world_size, port, args):
diff --git a/examples/inference/benchmark_llama3.py b/examples/inference/benchmark_llama3.py
index c9294bf62..2829090f0 100644
--- a/examples/inference/benchmark_llama3.py
+++ b/examples/inference/benchmark_llama3.py
@@ -106,9 +106,9 @@ def benchmark_inference(args):
 
     config = CONFIG_MAP[args.model]
     config.pad_token_id = config.eos_token_id
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     if args.model_path is not None:
         model = transformers.LlamaForCausalLM.from_pretrained(args.model_path)
+        tokenizer = AutoTokenizer.from_pretrained(args.model_path)
     else:
         # Random weights
         model = transformers.LlamaForCausalLM(config)
diff --git a/examples/inference/run_benchmark.sh b/examples/inference/run_benchmark.sh
index 4b015757e..192715976 100755
--- a/examples/inference/run_benchmark.sh
+++ b/examples/inference/run_benchmark.sh
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES_set_n_least_memory_usage 1
 for input_len in 128 512 1024; do
     for output_len in 128 256; do
         for bsz in 16 32 64; do
-            python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${input_len}_${output_len}_${mode}_${GPU}_${bsz}.txt
+            python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${bsz}_${input_len}_${output_len}_${mode}_${GPU}.txt
         done
     done
 done