[ci] fix shardformer tests. (#5255)

* fix ci

fix

* revert: revert p2p

* feat: add enable_metadata_cache option

* revert: enable t5 tests

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>
pull/5230/head
flybird11111 2024-01-11 19:07:45 +08:00 committed by GitHub
parent 756c400ad2
commit e830ef917d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 20 additions and 3 deletions

View File

@ -919,6 +919,7 @@ class HybridParallelPlugin(PipelinePluginBase):
custom_policy (Policy, optional): Custom policy for Shardformer. Defaults to None.
pp_style (str, optional): The style for pipeline parallelism. Defaults to '1f1b'.
num_model_chunks (int, optional): The number of model chunks for interleaved pipeline parallelism. Defaults to 1.
enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
"""
def __init__(
@ -956,6 +957,7 @@ class HybridParallelPlugin(PipelinePluginBase):
custom_policy: Policy = None,
pp_style: str = "1f1b",
num_model_chunks: int = 1,
enable_metadata_cache: bool = True,
) -> None:
super().__init__()
assert (
@ -1002,10 +1004,14 @@ class HybridParallelPlugin(PipelinePluginBase):
num_model_chunks=num_model_chunks,
num_microbatch=num_microbatches,
microbatch_size=microbatch_size,
enable_metadata_cache=enable_metadata_cache,
)
elif pp_style == "1f1b":
self.schedule = OneForwardOneBackwardSchedule(
self.stage_manager, num_microbatches=num_microbatches, microbatch_size=microbatch_size
stage_manager=self.stage_manager,
num_microbatches=num_microbatches,
microbatch_size=microbatch_size,
enable_metadata_cache=enable_metadata_cache,
)
else:
raise NotImplementedError()

View File

@ -165,7 +165,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
)
@clear_cache_before_run()
def run_gpt2_test(test_config):
sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt")
sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
@ -200,7 +200,7 @@ def run_gpt2_test(test_config):
)
@clear_cache_before_run()
def run_gpt2_3d_test(test_config):
sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt")
sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)

View File

@ -86,6 +86,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": True,
"use_lazy_init": True,
"precision": "fp16",
@ -95,6 +96,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 2,
"num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp16",
"initial_scale": 1,
@ -110,6 +112,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 4,
"num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",
@ -128,6 +131,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 2,
"num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": True,
"use_lazy_init": True,
"zero_stage": 1,
@ -159,6 +163,7 @@ def run_t5_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",
@ -168,6 +173,7 @@ def run_t5_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp16",

View File

@ -114,6 +114,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": True,
"use_lazy_init": True,
"precision": "fp32",
@ -123,6 +124,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 2,
"num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp32",
"initial_scale": 1,
@ -138,6 +140,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1,
"pp_size": 4,
"num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False,
"precision": "fp32",
},
@ -163,6 +166,7 @@ def run_whisper_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",
@ -172,6 +176,7 @@ def run_whisper_test(test_config):
"tp_size": 2,
"pp_size": 2,
"num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": False,
"use_lazy_init": False,
"precision": "fp32",