|
|
|
@ -1,6 +1,7 @@
|
|
|
|
|
import pytest |
|
|
|
|
import torch |
|
|
|
|
import torch.distributed as dist |
|
|
|
|
from packaging.version import Version |
|
|
|
|
from torch.optim import Adam |
|
|
|
|
from utils import shared_tempdir |
|
|
|
|
|
|
|
|
@ -19,14 +20,8 @@ from colossalai.testing import (
|
|
|
|
|
) |
|
|
|
|
from tests.kit.model_zoo import model_zoo |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@clear_cache_before_run() |
|
|
|
|
@parameterize("shard", [True, False]) |
|
|
|
|
@parameterize("model_name", ["transformers_gpt"]) |
|
|
|
|
@parameterize("size_per_shard", [32]) |
|
|
|
|
@parameterize( |
|
|
|
|
"test_config", |
|
|
|
|
[ |
|
|
|
|
if Version(torch.__version__) < Version("2.0.0"): |
|
|
|
|
TEST_CONFIGS = [ |
|
|
|
|
{ |
|
|
|
|
"tp_size": 4, |
|
|
|
|
"pp_size": 1, |
|
|
|
@ -35,8 +30,19 @@ from tests.kit.model_zoo import model_zoo
|
|
|
|
|
{"tp_size": 2, "pp_size": 2, "num_microbatches": 4, "precision": "fp16", "initial_scale": 1}, |
|
|
|
|
{"tp_size": 2, "pp_size": 1, "zero_stage": 2, "precision": "fp16", "initial_scale": 1}, |
|
|
|
|
{"tp_size": 1, "pp_size": 2, "num_microbatches": 4, "zero_stage": 1, "precision": "fp16", "initial_scale": 1}, |
|
|
|
|
], |
|
|
|
|
) |
|
|
|
|
] |
|
|
|
|
else: |
|
|
|
|
TEST_CONFIGS = [ |
|
|
|
|
# TODO(ver217): other configs lead to hang |
|
|
|
|
{"tp_size": 1, "pp_size": 2, "num_microbatches": 4, "zero_stage": 1, "precision": "fp16", "initial_scale": 1}, |
|
|
|
|
] |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@clear_cache_before_run() |
|
|
|
|
@parameterize("shard", [True, False]) |
|
|
|
|
@parameterize("model_name", ["transformers_gpt"]) |
|
|
|
|
@parameterize("size_per_shard", [32]) |
|
|
|
|
@parameterize("test_config", TEST_CONFIGS) |
|
|
|
|
def exam_state_dict(shard: bool, model_name: str, size_per_shard: int, test_config: dict): |
|
|
|
|
(model_fn, data_gen_fn, output_transform_fn, loss_fn, _) = next( |
|
|
|
|
iter(model_zoo.get_sub_registry(model_name).values()) |
|
|
|
|