mirror of https://github.com/hpcaitech/ColossalAI
parent 5446fb70c4
commit 20332a7a34

(modified file: the CaiInferEngine module)
@@ -14,7 +14,14 @@ from .microbatch_manager import MicroBatchManager

 PP_AXIS, TP_AXIS = 0, 1

-_supported_models = ["LlamaForCausalLM", "BloomForCausalLM", "LlamaGPTQForCausalLM", "SmoothLlamaForCausalLM", "ChatGLMForConditionalGeneration"]
+_supported_models = [
+    "LlamaForCausalLM",
+    "BloomForCausalLM",
+    "LlamaGPTQForCausalLM",
+    "SmoothLlamaForCausalLM",
+    "ChatGLMForConditionalGeneration",
+]


 class CaiInferEngine:
     """

@@ -161,7 +168,7 @@ class CaiInferEngine:
             enable_flash_attention=False,
             enable_jit_fused=False,
             enable_sequence_parallelism=False,
-            quant=self.quant,
+            extra_kwargs={"quant": self.quant},
         )
         shardformer = ShardFormer(shard_config=shardconfig)
         shard_model, _ = shardformer.optimize(model, model_policy)
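To make the engine change above concrete, here is a minimal, hedged sketch of how the reworked CaiInferEngine is driven, pieced together from the benchmark and test files later in this diff. The toy LlamaConfig, the tp_size/pp_size values, and nprocs=2 are illustrative assumptions (at least two CUDA devices are assumed), not values taken from this commit.

# Minimal sketch (illustrative, not part of this commit): spawn tp_size * pp_size
# ranks, build a CaiInferEngine from a model plus an inference policy, and run
# generation on a dict of input tensors, mirroring the benchmark/test files below.
import torch
import torch.distributed as dist
import transformers

import colossalai
from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
from colossalai.testing import spawn


def run_inference(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")

    # Tiny randomly initialized Llama so the sketch needs no checkpoint on disk.
    model = transformers.LlamaForCausalLM(
        transformers.LlamaConfig(
            vocab_size=20000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=4
        )
    ).half()

    engine = CaiInferEngine(
        tp_size=1,  # tensor-parallel degree (assumed value)
        pp_size=2,  # pipeline-parallel degree (assumed value)
        model=model,
        model_policy=LlamaModelInferPolicy(),
        micro_batch_size=1,
        max_output_len=16,
    )

    data = {
        "input_ids": torch.randint(1, 1000, (2, 32), device="cuda"),
        "attention_mask": torch.ones((2, 32), device="cuda"),
    }
    outputs = engine.inference(data)
    if dist.get_rank() == 0:
        # The new test file added by this commit inspects the result the same way.
        print("generated sequence length:", len(outputs[0]))


if __name__ == "__main__":
    spawn(run_inference, nprocs=2)  # nprocs = tp_size * pp_size
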
(removed file: Bloom tensor-parallel inference benchmark)
@@ -1,84 +0,0 @@
import argparse
import os
import time

import torch
from _utils import print_perf_stats
from transformers import BloomForCausalLM, BloomTokenizerFast

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


def bench_bloom(args):
    model_path = args.path
    max_batch_size = args.batch_size
    max_input_len = args.input_len
    max_output_len = args.output_len

    tokenizer = BloomTokenizerFast.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token
    model = BloomForCausalLM.from_pretrained(model_path, pad_token_id=tokenizer.eos_token_id)
    model = model.half()

    # init TPInferEngine and shard the original model
    # To benchmark torch original, comment out the line of optimizing model
    shard_config = ShardConfig(
        enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)

    # prepare data for generation
    generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
    input_tokens = {
        "input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)),
        "attention_mask": torch.ones((max_batch_size, max_input_len)),
    }
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
            print(f" input_tokens[{t}].shape: {input_tokens[t].shape}")

    iters = 10
    times = []
    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, **generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        out_len = outputs.shape[1]
        print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
        times.append((end - start) / (out_len - max_input_len))

    print_perf_stats(times, model.config, max_batch_size)


def check_bloom(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    bench_bloom(args)


@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom(args):
    spawn(check_bloom, args.tp_size, args=args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
    parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
    parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
    parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")

    args = parser.parse_args()

    test_bloom(args)

(removed file: ChatGLM2 tensor-parallel inference benchmark)
@@ -1,118 +0,0 @@
import argparse
import os
import time

import torch
from _utils import print_perf_stats
from transformers import AutoTokenizer

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration
from colossalai.testing import rerun_if_address_is_in_use, spawn

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


def run_chatglm2_test(args):
    chatglm2_model_path = args.path
    max_batch_size = args.batch_size
    max_input_len = args.input_len
    max_output_len = args.output_len
    args.test_mode

    print("max_batch_size : " + str(max_batch_size))

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
    model = ChatGLMForConditionalGeneration.from_pretrained(chatglm2_model_path, pad_token_id=tokenizer.eos_token_id)
    model = model.half()
    model.config

    shard_config = ShardConfig(
        enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)

    generate_kwargs = dict(max_new_tokens=1, do_sample=False)
    input_tokens = {
        "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
        "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
    }

    iters = 10
    prefill_times = []

    warmup = 3

    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, **generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        out_len = outputs.shape[1]
        print("generation time {} s".format(str(end - start)))
        print(out_len - max_input_len)
        prefill_times.append((end - start) / (out_len - max_input_len))

    prefill_times = prefill_times[warmup:]
    prefill_time_avg = sum(prefill_times) / len(prefill_times)
    generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)

    times = []
    decoder_times = []
    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, **generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        out_len = outputs.shape[1]
        print("generation time {} s".format(str(end - start)))
        print(out_len - max_input_len)
        times.append((end - start) / (out_len - max_input_len))
        if args.test_mode == "decoder_test":
            decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1))

    times = times[warmup:]
    latency = sum(times) / len(times)
    print("total process latency is : " + str(latency) + " s")
    print("total throughput is : " + str(1 / latency * max_batch_size))

    if args.test_mode == "decoder_test":
        decoder_times = decoder_times[warmup:]
        latency = sum(decoder_times) / len(decoder_times)

        print("decoder process latency is : " + str(latency) + " s")
        print("decoder throughput is : " + str(1 / latency * max_batch_size))

    print_perf_stats(times, model.config, max_batch_size)


def check_chatglm2(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_chatglm2_test(args)


@rerun_if_address_is_in_use()
def test_chatglm2(args):
    spawn(check_chatglm2, args.tp_size, args=args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
    parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
    parser.add_argument("--input_len", type=int, default=256, help="Maximum input length")
    parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
    parser.add_argument(
        "--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
    )

    args = parser.parse_args()

    test_chatglm2(args)

(removed file: Llama tensor-parallel inference benchmark)
@@ -1,119 +0,0 @@
import argparse
import os
import time

import torch
from _utils import print_perf_stats
from transformers import LlamaForCausalLM, LlamaTokenizer

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


def run_llama_test(args):
    llama_model_path = args.path
    max_batch_size = args.batch_size
    max_input_len = args.input_len
    max_output_len = args.output_len
    args.test_mode

    print("max_batch_size : " + str(max_batch_size))

    tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
    tokenizer.pad_token_id = tokenizer.unk_token_id
    model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
    model = model.half()
    model.config

    shard_config = ShardConfig(
        enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)

    generate_kwargs = dict(max_new_tokens=1, do_sample=False)
    input_tokens = {
        "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
        "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
    }

    iters = 10
    prefill_times = []

    warmup = 3

    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, **generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        out_len = outputs.shape[1]
        print("generation time {} s".format(str(end - start)))
        print(out_len - max_input_len)
        prefill_times.append((end - start) / (out_len - max_input_len))

    prefill_times = prefill_times[warmup:]
    prefill_time_avg = sum(prefill_times) / len(prefill_times)
    generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)

    times = []
    decoder_times = []
    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, **generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        out_len = outputs.shape[1]
        print("generation time {} s".format(str(end - start)))
        print(out_len - max_input_len)
        times.append((end - start) / (out_len - max_input_len))
        if args.test_mode == "decoder_test":
            decoder_times.append((end - start - prefill_time_avg) / (out_len - max_input_len - 1))

    times = times[warmup:]
    latency = sum(times) / len(times)
    print("total process latency is : " + str(latency) + " s")
    print("total throughput is : " + str(1 / latency * max_batch_size))

    if args.test_mode == "decoder_test":
        decoder_times = decoder_times[warmup:]
        latency = sum(decoder_times) / len(decoder_times)

        print("decoder process latency is : " + str(latency) + " s")
        print("decoder throughput is : " + str(1 / latency * max_batch_size))

    print_perf_stats(times, model.config, max_batch_size)


def check_llama(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_llama_test(args)


@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
    spawn(check_llama, args.tp_size, args=args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
    parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("-b", "--batch_size", type=int, default=32, help="Maximum batch size")
    parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
    parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")
    parser.add_argument(
        "--test_mode", type=str, help="Test mode", default="e2e_test", choices=["e2e_test", "decoder_test"]
    )

    args = parser.parse_args()

    test_llama(args)

(modified file: benchmark.py, the pipeline/hybrid inference benchmark)
@@ -1,4 +1,5 @@
 import argparse
+import os
 import time

 import torch
@@ -6,14 +7,12 @@ import torch.distributed as dist
 import transformers

 import colossalai
-from colossalai.inference import PPInferEngine
-from colossalai.inference.pipeline.policies import LlamaModelInferPolicy
+from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
+from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

 GIGABYTE = 1024**3
 MEGABYTE = 1024 * 1024

-colossalai.launch_from_torch(config={})
-

 def data_gen(batch_size: int = 4, seq_len: int = 512):
     input_ids = torch.randint(10, 30000, (1, seq_len), dtype=torch.int32)
@@ -28,6 +27,9 @@ def data_gen(batch_size: int = 4, seq_len: int = 512):


 def print_details_info(timestamps, model_config, args, whole_end2end):
+    log_file_name = f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.output_len}_bsz{args.batch_size}_mbsz{args.mb_size}.log"
+    os.makedirs(os.path.dirname(log_file_name), exist_ok=True)
+
     if dist.get_rank() == 0:
         prefill = []
         encoder = []
@@ -39,13 +41,14 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
             )
             end2end.append(timestamp[-1] - timestamp[0])
         print(whole_end2end)

         with open(
-            f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log",
+            log_file_name,
             "w+",
         ) as f:
             mb_avg_end2end = sum(end2end) / len(end2end)
-            mb_avg_latency = mb_avg_end2end / (args.new_length * args.mb_size)
-            whole_avg_latency = whole_end2end / (args.new_length * args.batch_size)
+            mb_avg_latency = mb_avg_end2end / (args.output_len * args.mb_size)
+            whole_avg_latency = whole_end2end / (args.output_len * args.batch_size)
             num_layers = getattr(model_config, "num_layers", model_config.num_hidden_layers)
             num_parameters = num_layers * model_config.hidden_size * model_config.hidden_size * 12 / args.pp_size
             if args.dtype in ["fp16", "bf16"]:
@@ -54,7 +57,7 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
                 num_bytes = 4

             f.write(
-                f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.new_length}, bsz:{args.batch_size}, mbsz:{args.mb_size}\n"
+                f"llama-{args.model}{args.dtype}_pp{args.pp_size}, input_len:{args.seq_len}, output_len:{args.output_len}, bsz:{args.batch_size}, mbsz:{args.mb_size}\n"
             )
             f.write("Average prefill time: {0:8.2f} ms\n".format(sum(prefill) / len(prefill) * 1000))
             f.write("Average encode time: {0:8.2f} ms\n".format(sum(encoder) / len(encoder) * 1000))
@@ -76,7 +79,7 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
         memory_reserved = torch.cuda.memory_reserved()
         max_memory_reserved = torch.cuda.max_memory_reserved()
         with open(
-            f"{args.log_path}/llama-{args.model}{args.dtype}_pp{args.pp_size}_{args.seq_len}_{args.new_length}_bsz{args.batch_size}_mbsz{args.mb_size}.log",
+            log_file_name,
             "a",
         ) as f:
             f.write(
@@ -90,18 +93,7 @@ def print_details_info(timestamps, model_config, args, whole_end2end):
             )


-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="toy", help="the size of model")
-    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size")
-    parser.add_argument("-s", "--seq_len", type=int, default=8, help="sequence length")
-    parser.add_argument("--new_length", type=int, default=4, help="new tokens length")
-    parser.add_argument("--mb_size", type=int, default=1, help="micro_batch_size")
-    parser.add_argument("--pp_size", type=int, default=2, help="pipeline size")
-    parser.add_argument("--log_path", type=str, default="./log", help="where to store the benchmark log")
-    parser.add_argument("--dtype", type=str, default="fp16", help="data type")
-    args = parser.parse_args()
-
+def benchmark_inference(args):
     if args.model == "toy":
         model = transformers.LlamaForCausalLM(transformers.LlamaConfig(num_hidden_layers=8))
     elif args.model == "7b":
@@ -111,24 +103,50 @@
     else:
         raise NotImplementedError

-    engine = PPInferEngine(
+    engine = CaiInferEngine(
         pp_size=args.pp_size,
+        tp_size=args.tp_size,
         dtype=args.dtype,
         micro_batch_size=args.mb_size,
-        new_length=args.new_length,
         model=model,
         model_policy=LlamaModelInferPolicy(),
         verbose=True,
         max_batch_size=args.mb_size,
         max_input_len=args.seq_len,
-        max_output_len=args.seq_len + args.new_length + 256,
+        max_output_len=args.output_len,
     )
     data = data_gen(args.batch_size, args.seq_len)

     torch.cuda.synchronize()
     whole_end2end = time.time()
-    output, timestamps = engine.inference([data])
+    output, timestamps = engine.inference(data)
     torch.cuda.synchronize()
     whole_end2end = time.time() - whole_end2end

     print_details_info(timestamps, model.config, args, whole_end2end)


+def hybrid_inference(rank, world_size, port, args):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    benchmark_inference(args)
+
+
+@rerun_if_address_is_in_use()
+@clear_cache_before_run()
+def benchmark(args):
+    spawn(hybrid_inference, nprocs=args.tp_size * args.pp_size, args=args)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="toy", help="the size of model")
+    parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size")
+    parser.add_argument("-s", "--seq_len", type=int, default=8, help="sequence length")
+    parser.add_argument("--mb_size", type=int, default=1, help="micro_batch_size")
+    parser.add_argument("--pp_size", type=int, default=2, help="pipeline size")
+    parser.add_argument("--tp_size", type=int, default=2, help="pipeline size")
+    parser.add_argument("--output_len", type=int, default=16, help="Output length")
+    parser.add_argument("--log_path", type=str, default="./log", help="where to store the benchmark log")
+    parser.add_argument("--dtype", type=str, default="fp16", help="data type")
+    args = parser.parse_args()
+    benchmark(args)

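Because the argument parsing now sits under the `if __name__ == "__main__":` guard and `benchmark(args)` spawns its own `tp_size * pp_size` ranks, the updated benchmark.py can also be driven programmatically. A hedged sketch follows; it assumes the snippet runs next to benchmark.py so the import resolves, and the Namespace values are placeholders rather than defaults taken from the commit.

# Hedged sketch: call the new spawn-based entry point of benchmark.py directly.
# Field names mirror the arguments added in the hunk above; values are placeholders.
import argparse

from benchmark import benchmark  # assumes benchmark.py is on the import path

args = argparse.Namespace(
    model="toy",        # the toy 8-layer Llama branch of benchmark.py
    batch_size=8,
    seq_len=8,
    mb_size=1,
    pp_size=2,
    tp_size=1,
    output_len=16,
    log_path="./log",
    dtype="fp16",
)
benchmark(args)  # spawns tp_size * pp_size processes and runs benchmark_inference on each
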
(removed file: GPTQ Bloom tensor-parallel inference benchmark)
@@ -1,105 +0,0 @@
import argparse
import os
import time

import torch
from _utils import print_perf_stats
from auto_gptq import AutoGPTQForCausalLM
from transformers import BloomTokenizerFast

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


def bench_bloom(args):
    pretrained_model_dir = args.path
    quantized_model_dir = args.quantized_path
    max_batch_size = args.batch_size
    max_input_len = args.input_len
    max_output_len = args.output_len

    tokenizer = BloomTokenizerFast.from_pretrained(pretrained_model_dir)
    tokenizer.pad_token = tokenizer.eos_token

    # load quantized model to the first GPU
    model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False
    )

    model = model.half()

    model_config = model.config
    shard_config = ShardConfig(
        enable_tensor_parallelism=True if args.tp_size > 1 else False, extra_kwargs={"inference_only": True}
    )
    infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)
    generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)

    input_tokens = {
        "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
        "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
    }

    # init TPInferEngine and shard the original model
    # To benchmark torch original, comment out the line of optimizing model
    shard_config = ShardConfig(
        enable_tensor_parallelism=True if args.tp_size > 1 else False,
        extra_kwargs={"inference_only": True, "quant": "gptq"},
    )
    infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)

    # prepare data for generation
    generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)
    input_tokens = {
        "input_ids": torch.randint(10, 1000, (max_batch_size, max_input_len)),
        "attention_mask": torch.ones((max_batch_size, max_input_len)),
    }
    for t in input_tokens:
        if torch.is_tensor(input_tokens[t]):
            input_tokens[t] = input_tokens[t].to(torch.cuda.current_device())
            # print(f" input_tokens[{t}].shape: {input_tokens[t].shape}")

    iters = 10
    times = []
    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, **generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        out_len = outputs.shape[1]
        print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
        times.append((end - start) / (out_len - max_input_len))

    print_perf_stats(times, model_config, max_batch_size)


def check_bloom(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    bench_bloom(args)


@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_bloom(args):
    spawn(check_bloom, args.tp_size, args=args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
    parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
    parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
    parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")

    args = parser.parse_args()

    test_bloom(args)

(removed file: GPTQ Llama tensor-parallel inference benchmark)
@@ -1,87 +0,0 @@
import argparse
import os
import time

import torch
from _utils import print_perf_stats
from auto_gptq import AutoGPTQForCausalLM
from transformers import LlamaTokenizer

import colossalai
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.logging import disable_existing_loggers
from colossalai.shardformer import ShardConfig
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn

os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "true"


def run_llama_test(args):
    pretrained_model_dir = args.path
    quantized_model_dir = args.quantized_path
    max_batch_size = args.batch_size
    max_input_len = args.input_len
    max_output_len = args.output_len

    tokenizer = LlamaTokenizer.from_pretrained(pretrained_model_dir, use_fast=True)
    tokenizer.pad_token_id = tokenizer.eos_token_id

    # load quantized model to the first GPU
    model = AutoGPTQForCausalLM.from_quantized(
        quantized_model_dir, device=torch.cuda.current_device(), inject_fused_attention=False
    )

    model_config = model.config
    shard_config = ShardConfig(
        enable_tensor_parallelism=True if args.tp_size > 1 else False,
        extra_kwargs={"inference_only": True, "quant": "gptq"},
    )
    infer_engine = TPInferEngine(model, shard_config, max_batch_size, max_input_len, max_output_len)

    generate_kwargs = dict(max_new_tokens=max_output_len, do_sample=False)

    input_tokens = {
        "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
        "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
    }

    iters = 10
    times = []

    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = infer_engine.generate(input_tokens, **generate_kwargs)
        torch.cuda.synchronize()
        end = time.time()
        out_len = outputs.shape[1]
        print(f" iter {i}: out len {str(out_len)}, generation time {str(end - start)} s")
        times.append((end - start) / (out_len - max_input_len))

    print_perf_stats(times, model_config, max_batch_size)


def check_llama(rank, world_size, port, args):
    disable_existing_loggers()
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_llama_test(args)


@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_llama(args):
    spawn(check_llama, args.tp_size, args=args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
    parser.add_argument("-q", "--quantized_path", type=str, help="Model path", required=True)
    parser.add_argument("-tp", "--tp_size", type=int, default=1, help="Tensor parallel size")
    parser.add_argument("-b", "--batch_size", type=int, default=16, help="Maximum batch size")
    parser.add_argument("--input_len", type=int, default=1024, help="Maximum input length")
    parser.add_argument("--output_len", type=int, default=128, help="Maximum output length")

    args = parser.parse_args()

    test_llama(args)

(new file: tensor-parallel + pipeline inference test for CaiInferEngine)
@@ -0,0 +1,96 @@
import argparse
import time

import pytest
import torch
import torch.distributed as dist
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

import colossalai
from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy
from colossalai.testing import clear_cache_before_run, rerun_if_address_is_in_use, spawn


def pipeline_inference_test(args):
    llama_model_path = args.path
    max_input_len = args.max_input_len
    max_output_len = args.max_output_len
    max_batch_size = args.batch_size
    micro_batch_size = args.micro_batch_size
    tp_size = args.tp_size
    pp_size = args.pp_size
    rank = dist.get_rank()

    tokenizer = LlamaTokenizer.from_pretrained(llama_model_path)
    tokenizer.pad_token_id = tokenizer.unk_token_id
    model = LlamaForCausalLM.from_pretrained(llama_model_path, pad_token_id=tokenizer.eos_token_id)
    model = model.half()

    model = transformers.LlamaForCausalLM(
        transformers.LlamaConfig(
            vocab_size=20000, hidden_size=512, intermediate_size=1536, num_attention_heads=4, num_hidden_layers=4
        )
    )

    engine = CaiInferEngine(
        tp_size=tp_size,
        pp_size=pp_size,
        model=model,
        model_policy=LlamaModelInferPolicy(),
        max_output_len=max_output_len,
        micro_batch_size=micro_batch_size,
    )

    input_tokens = {
        "input_ids": torch.randint(1, 1000, (max_batch_size, max_input_len), device="cuda"),
        "attention_mask": torch.ones((max_batch_size, max_input_len), device="cuda"),
    }

    iters = 10
    warmup = 3
    times = []

    for i in range(iters):
        torch.cuda.synchronize()
        start = time.time()
        outputs = engine.inference(input_tokens)
        torch.cuda.synchronize()
        end = time.time()
        if rank == 0:
            out_len = len(outputs[0])
            print("generation time {} s".format(str(end - start)))
            print(out_len)
            times.append((end - start) / out_len)
    if rank == 0:
        times = times[warmup:]
        latency = sum(times) / len(times)
        print("total process latency is : " + str(latency) + " s")
        print("total throughput is : " + str(1 / latency * max_batch_size))


def check_tp_pipeline_inference(rank, world_size, port, args):
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_tp_pipeline_inference_test(args)


@pytest.mark.dist
@rerun_if_address_is_in_use()
@clear_cache_before_run()
def test_inference(args):
    spawn(check_tp_pipeline_inference, nprocs=args.tp_size * args.pp_size, args=args)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-p", "--path", type=str, help="Model path", required=True)
    parser.add_argument("-tp", "--tp_size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("-pp", "--pp_size", type=int, default=2, help="Tensor parallel size")
    parser.add_argument("-b", "--batch_size", type=int, default=8, help="Maximum batch size")
    parser.add_argument("--max_input_len", type=int, default=32, help="Maximum input length")
    parser.add_argument("--max_output_len", type=int, default=16, help="Maximum output length")
    parser.add_argument("--micro_batch_size", type=int, default=2, help="Micro batch size")

    args = parser.parse_args()

    test_inference(args)

(modified file: benchmark launch shell script)
@@ -1,50 +1,55 @@
 script_dir=$(cd "$(dirname "$0")" && pwd)
 cd "${script_dir}"

 # 7b, fp16, 2 gpu, 1024, 128
 for BATCH_SIZE in 2 4 8 16; do
-    CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python ./benchmark.py \
         --model="7b" \
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=1024 \
         --new_length=128 \
         --mb_size=$((${BATCH_SIZE}/2)) \
-        --pp_size=2
+        --pp_size=2 \
+        --tp_size=2
 done

 # 7b, fp16, 2 gpu, 512, 512
 for BATCH_SIZE in 2 4 8 16 32; do
-    CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python ./benchmark.py \
         --model="7b" \
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=512 \
         --new_length=512 \
         --mb_size=$((${BATCH_SIZE}/2)) \
-        --pp_size=2
+        --pp_size=2 \
+        --tp_size=2
 done

 # 7b, fp16, 2 gpu, 1024, 128
 for BATCH_SIZE in 2 4 8; do
-    CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python ./benchmark.py \
         --model="13b" \
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=1024 \
         --new_length=128 \
         --mb_size=$((${BATCH_SIZE}/2)) \
-        --pp_size=2
+        --pp_size=2 \
+        --tp_size=2
 done

 # 13b, fp16, 2 gpu, 512, 512
 for BATCH_SIZE in 2 4 8 16; do
-    CUDA_VISIBLE_DEVICES=0,1 colossalai run --nproc_per_node 2 --master_port 29800 ./benchmark.py \
+    CUDA_VISIBLE_DEVICES=0,1,2,3 python ./benchmark.py \
         --model="13b" \
         --dtype="fp16" \
         --batch_size=${BATCH_SIZE} \
         --seq_len=512 \
         --new_length=512 \
         --mb_size=$((${BATCH_SIZE}/2)) \
-        --pp_size=2
+        --pp_size=2 \
+        --tp_size=2
 done