diff --git a/examples/inference/benchmark_llama.py b/examples/inference/benchmark_llama.py
index 1708c615d..a5b295a40 100644
--- a/examples/inference/benchmark_llama.py
+++ b/examples/inference/benchmark_llama.py
@@ -105,20 +105,28 @@ def benchmark_inference(args):
     with torch.no_grad():
         config = CONFIG_MAP[args.model]
         config.pad_token_id = config.eos_token_id
 
-        if args.test_random_weight:
-            model = transformers.LlamaForCausalLM(config)
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        else:
-            assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
-            model = transformers.LlamaForCausalLM.from_pretrained(args.model_path)
-            tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
-        model = model.eval()
+        if args.mode != "vllm":
+            if args.test_random_weight:
+                model = transformers.LlamaForCausalLM(config).cuda()
+                tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+            else:
+                assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
+                model = transformers.LlamaForCausalLM.from_pretrained(args.model_path).cuda()
+                tokenizer = AutoTokenizer.from_pretrained(args.model_path)
+
+            model = model.eval()
 
-        if args.dtype == "fp16":
-            model = model.half()
-        elif args.dtype == "bf16":
-            model = model.to(torch.bfloat16)
+            if args.dtype == "fp16":
+                model = model.half()
+            elif args.dtype == "bf16":
+                model = model.to(torch.bfloat16)
+
+            generation_config = GenerationConfig(
+                pad_token_id=tokenizer.pad_token_id,
+                max_length=args.seq_len + args.output_len,
+                # max_new_tokens=args.max_output_len,
+            )
 
         if args.continous_batching:
             mbsz = args.mbsz
@@ -156,12 +164,6 @@ def benchmark_inference(args):
         if args.mode == "colossalai" or args.mode == "vllm":
             data = data.tolist()
 
-        generation_config = GenerationConfig(
-            pad_token_id=tokenizer.pad_token_id,
-            max_length=args.seq_len + args.output_len,
-            # max_new_tokens=args.output_len,
-        )
-
         N_WARMUP_STEPS = 2
 
         ctx = (
@@ -225,7 +227,7 @@ def benchmark_inference(args):
         if args.profile:
             ctx.step()
     print(f"config:batch_size {args.batch_size}, input_len{ args.seq_len}, output_len {args.output_len}")
-    print_details_info(model.config, args, whole_end2end, total_token_num)
+    print_details_info(config, args, whole_end2end, total_token_num)
 
 
 def hybrid_inference(rank, world_size, port, args):
diff --git a/examples/inference/benchmark_llama3.py b/examples/inference/benchmark_llama3.py
index c9294bf62..2829090f0 100644
--- a/examples/inference/benchmark_llama3.py
+++ b/examples/inference/benchmark_llama3.py
@@ -106,9 +106,9 @@ def benchmark_inference(args):
 
     config = CONFIG_MAP[args.model]
     config.pad_token_id = config.eos_token_id
-    tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
     if args.model_path is not None:
         model = transformers.LlamaForCausalLM.from_pretrained(args.model_path)
+        tokenizer = AutoTokenizer.from_pretrained(args.model_path)
     else:
         # Random weights
         model = transformers.LlamaForCausalLM(config)
diff --git a/examples/inference/run_benchmark.sh b/examples/inference/run_benchmark.sh
index 4b015757e..192715976 100755
--- a/examples/inference/run_benchmark.sh
+++ b/examples/inference/run_benchmark.sh
@@ -27,7 +27,7 @@ CUDA_VISIBLE_DEVICES_set_n_least_memory_usage 1
 for input_len in 128 512 1024; do
     for output_len in 128 256; do
         for bsz in 16 32 64; do
-            python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 --pp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${input_len}_${output_len}_${mode}_${GPU}_${bsz}.txt
+            python3 ${PY_SCRIPT} -m llama2-7b --tp_size 1 -b ${bsz} -s ${input_len} --output_len ${output_len} --mode ${mode} --test_random_weight | tee logs/${bsz}_${input_len}_${output_len}_${mode}_${GPU}.txt
         done
     done
 done