@@ -14,7 +14,7 @@ from tests.components_to_test.registry import non_distributed_component_funcs
 from torch.nn.parallel import DistributedDataParallel as DDP
 from colossalai.nn.optimizer import CPUAdam
 from colossalai.zero.sharded_optim._utils import has_inf_or_nan
+from colossalai.testing import parameterize
 from common import CONFIG, check_sharded_params_padding
@@ -36,8 +36,10 @@ def _run_step(model, optimizer, data, label, criterion, enable_autocast=False):
     optimizer.step()


-def _run_dist(rank, world_size, port, cpu_offload, shard_strategy, use_cpuadam):
-    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+@parameterize("cpu_offload", [True, False])
+@parameterize("use_cpuadam", [True, False])
+@parameterize("shard_strategy", [TensorShardStrategy, BucketTensorShardStrategy])
+def _run_test_sharded_optim_v2(cpu_offload, shard_strategy, use_cpuadam):
     test_models = ['repeated_computed_layers', 'resnet18', 'bert']
     shard_strategy = shard_strategy()
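Note on the hunk above: pytest.mark.parametrize expands arguments at collection time, so it cannot reach code that runs inside an mp.spawn worker; colossalai.testing.parameterize expands them at call time instead, so one call to _run_test_sharded_optim_v2() inside each worker exhausts the full Cartesian product of cpu_offload, use_cpuadam, and shard_strategy. A minimal sketch of those call-time semantics, not the library's actual implementation:

    from functools import wraps

    def parameterize(argument, values):
        # Call-time expansion: run the wrapped function once per value,
        # passed as a keyword argument. Stacking several of these
        # decorators enumerates the Cartesian product of all arguments.
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                for value in values:
                    kwargs[argument] = value
                    func(*args, **kwargs)
            return wrapper
        return decorator

This is what lets the next hunk shrink the pytest surface to a single world_size parameter.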
@@ -76,36 +78,18 @@ def _run_dist(rank, world_size, port, cpu_offload, shard_strategy, use_cpuadam):
     assert not has_inf_or_nan(param)


-# use_cpuadam = True can be used with cpu_offload = False
-@pytest.mark.dist
-@pytest.mark.parametrize("world_size", [1, 2])
-@pytest.mark.parametrize("cpu_offload", [False])
-@pytest.mark.parametrize("use_cpuadam", [False])
-@pytest.mark.parametrize("shard_strategy", [TensorShardStrategy, BucketTensorShardStrategy])
-def test_sharded_optim_v2(world_size, cpu_offload, shard_strategy, use_cpuadam):
-    run_func = partial(_run_dist,
-                       world_size=world_size,
-                       port=free_port(),
-                       cpu_offload=cpu_offload,
-                       shard_strategy=shard_strategy,
-                       use_cpuadam=use_cpuadam)
-    mp.spawn(run_func, nprocs=world_size)
+def _run_dist(rank, world_size, port):
+    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+    _run_test_sharded_optim_v2()


+# use_cpuadam = True can be used with cpu_offload = False
 @pytest.mark.dist
 @pytest.mark.parametrize("world_size", [1, 2])
-@pytest.mark.parametrize("cpu_offload", [True])
-@pytest.mark.parametrize("shard_strategy", [TensorShardStrategy, BucketTensorShardStrategy])
-@pytest.mark.parametrize("use_cpuadam", [True, False])
-def test_sharded_optim_v2_cpu_adam(world_size, cpu_offload, shard_strategy, use_cpuadam):
-    run_func = partial(_run_dist,
-                       world_size=world_size,
-                       port=free_port(),
-                       cpu_offload=cpu_offload,
-                       shard_strategy=shard_strategy,
-                       use_cpuadam=use_cpuadam)
+def test_sharded_optim_v2(world_size):
+    run_func = partial(_run_dist, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
-    test_sharded_optim_v2_cpu_adam(world_size=2, cpu_offload=True, shard_strategy=TensorShardStrategy, use_cpuadam=True)
+    test_sharded_optim_v2(world_size=2)
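The launcher pattern in this diff relies on torch.multiprocessing.spawn calling fn(i, *args) with the process index i as its first positional argument: binding every other parameter with functools.partial leaves exactly the rank slot open. A self-contained sketch of that pattern, where the hypothetical _worker and the hard-coded example port stand in for colossalai.launch plus the test body and for free_port():

    from functools import partial

    import torch.multiprocessing as mp

    def _worker(rank, world_size, port):
        # spawn supplies rank; world_size and port were bound by partial.
        print(f'rank {rank}/{world_size}, rendezvous port {port}')

    if __name__ == '__main__':
        # 29500 is an arbitrary example port; the real test calls free_port().
        run_func = partial(_worker, world_size=2, port=29500)
        mp.spawn(run_func, nprocs=2)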