mirror of https://github.com/hpcaitech/ColossalAI
[hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)
parent e5ce4c8ea6
commit 0c7d8bebd5
@@ -126,7 +126,7 @@ class CaiInferEngine:
         # Init pg mesh
         pg_mesh = ProcessGroupMesh(pp_size, tp_size)
 
-        stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, True)
+        stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, True if pp_size * tp_size > 1 else False)
         self.cache_manager_list = [
             self._init_manager(model, max_batch_size, max_input_len, max_output_len)
             for _ in range(micro_batch_buffer_size or pp_size)
@@ -142,7 +142,9 @@ class CaiInferEngine:
         self.verbose = verbose
         self.schedule = GenerateSchedule(stage_manager, self.mb_manager, verbose)
 
-        self.model = self._shardformer(model, model_policy, stage_manager, pg_mesh.get_group_along_axis(TP_AXIS))
+        self.model = self._shardformer(
+            model, model_policy, stage_manager, pg_mesh.get_group_along_axis(TP_AXIS) if pp_size * tp_size > 1 else None
+        )
         if quant == "gptq":
             self.gptq_manager.post_init_gptq_buffer(self.model)
 
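The change above boils down to one guard: when pp_size * tp_size == 1 the engine runs on a single device, so the pipeline stage manager must not be told it has peer stages and the shardformer should receive no tensor-parallel group. A minimal sketch of that gating follows, reusing the names from the diff; the import paths, the PP_AXIS/TP_AXIS constants, and the PipelineStageManager call signature are assumptions that mirror the hunks above rather than a documented API.

# Hedged sketch of the single-device gating introduced by this commit.
# Assumptions: PP_AXIS = 0, TP_AXIS = 1, and PipelineStageManager accepts the
# same positional arguments as in the diff above.
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager

PP_AXIS, TP_AXIS = 0, 1


def build_parallel_context(pp_size: int, tp_size: int):
    """Return (stage_manager, tp_group), handling the pp_size * tp_size == 1 case."""
    pg_mesh = ProcessGroupMesh(pp_size, tp_size)
    is_distributed = pp_size * tp_size > 1

    # Before the fix the third flag was hard-coded to True; now it simply states
    # whether the run is actually distributed.
    stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, is_distributed)

    # Only look up a tensor-parallel group when one exists; on a single GPU the
    # shardformer receives None instead, as in the second hunk above.
    tp_group = pg_mesh.get_group_along_axis(TP_AXIS) if is_distributed else None
    return stage_manager, tp_group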
@@ -78,17 +78,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test()
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -97,8 +112,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
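The test-side change adds a dedicated single-device sweep: the stacked parameterize decorators above pin tp_size = pp_size = 1, and the new check_single_inference entry point is spawned with a single process (nprocs=1) next to the existing 4-process tp+pp run and 2-process tp-or-pp run. The same restructuring is repeated in the two remaining test files below. The snippet here is a small, self-contained illustration of how colossalai.testing.parameterize is understood to behave when stacked (each decorator sweeps one keyword, stacking sweeps the cross product); report_config and its values are made up for the example.

# Hypothetical example of stacked parameterize decorators; not part of the diff.
from colossalai.testing import parameterize


@parameterize("tp_size", [1, 2])
@parameterize("pp_size", [1, 2])
def report_config(tp_size, pp_size):
    # Each (tp_size, pp_size) pair is invoked once; world size is their product.
    print(f"needs {tp_size * pp_size} process(es): tp={tp_size}, pp={pp_size}")


if __name__ == "__main__":
    report_config()  # expected to print all four combinations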
@@ -86,17 +86,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test()
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -105,8 +120,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
@@ -83,17 +83,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test()
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -102,8 +117,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":