ColossalAI/tests/test_utils/test_checkpoint/test_checkpoint_3d.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import pprint
from functools import partial

import colossalai.nn as col_nn
import pytest
import torch
import torch.multiprocessing as mp
import torch.nn as nn
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
from colossalai.logging import disable_existing_loggers
from colossalai.utils import free_port, get_current_device, is_using_pp
from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus


def build_pipeline(model):
    from colossalai.pipeline.utils import partition_uniform

    pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
    depth = len(model)
    start, end = partition_uniform(depth, pipeline_size, 1)[pipeline_rank][0]
    layers = []
    for i in range(depth):
        if start <= i < end:
            layers.append(model[i])
        else:
            layers.append(nn.Identity())
    return nn.Sequential(*tuple(layers))


def check_equal(A, B):
    assert torch.allclose(A, B, rtol=1e-3, atol=1e-2)

def check_checkpoint_3d(rank, world_size, port):
    config = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=8, mode="3d")),)

    disable_existing_loggers()
    launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")

    m1 = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 4))
    sd1 = m1.state_dict()
    if gpc.get_global_rank() == 0:
        print(f"Rank {gpc.get_global_rank()}:\n{pprint.pformat(sd1)}\n")
    save_checkpoint("test.pt", 0, m1)

    m2 = nn.Sequential(col_nn.Linear(4, 8), col_nn.Linear(8, 4))
    if is_using_pp():
        m2 = build_pipeline(m2)

    load_checkpoint("test.pt", m2)
    sd2 = m2.state_dict()
    if is_using_pp() and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
        sd2 = gather_pipeline_parallel_state_dict(sd2)
    print(f"Rank {gpc.get_global_rank()}:\n{pprint.pformat(sd2)}\n")

    if gpc.get_global_rank() == 0:
        for k, v in sd1.items():
            assert k in sd2
            check_equal(v, sd2[k].to(torch.device("cpu")))


@pytest.mark.dist
@skip_if_not_enough_gpus(min_gpus=8)
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_checkpoint_3d():
    world_size = 8
    run_func = partial(check_checkpoint_3d, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == "__main__":
    test_checkpoint_3d()
[model checkpoint] added unit tests for checkpoint save/load (#599) 3 years ago			`#!/usr/bin/env python`
			`# -- encoding: utf-8 --`

			`import pprint`
			`from functools import partial`

			`import colossalai.nn as col_nn`
			`import pytest`
			`import torch`
			`import torch.multiprocessing as mp`
			`import torch.nn as nn`
			`from colossalai.context.parallel_mode import ParallelMode`
			`from colossalai.core import global_context as gpc`
			`from colossalai.initialize import launch`
			`from colossalai.logging import disable_existing_loggers`
			`from colossalai.utils import free_port, get_current_device, is_using_pp`
			`from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint`
[test] skip tests when not enough GPUs are detected (#1090) * [test] skip tests when not enough GPUs are detected * polish code * polish code 3 years ago			`from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus`
[model checkpoint] added unit tests for checkpoint save/load (#599) 3 years ago

			`def build_pipeline(model):`
[pipeline] refactor the pipeline module (#1087) * [pipeline] refactor the pipeline module * polish code 3 years ago			`from colossalai.pipeline.utils import partition_uniform`
[model checkpoint] added unit tests for checkpoint save/load (#599) 3 years ago
			`pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)`
			`pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)`
			`depth = len(model)`
			`start, end = partition_uniform(depth, pipeline_size, 1)[pipeline_rank][0]`
			`layers = []`
			`for i in range(depth):`
			`if start <= i < end:`
			`layers.append(model[i])`
			`else:`
			`layers.append(nn.Identity())`
			`return nn.Sequential(*tuple(layers))`


			`def check_equal(A, B):`
			`assert torch.allclose(A, B, rtol=1e-3, atol=1e-2)`

			`def check_checkpoint_3d(rank, world_size, port):`
[test] added missing decorators to model checkpointing tests 3 years ago			`config = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=8, mode="3d")),)`
[model checkpoint] added unit tests for checkpoint save/load (#599) 3 years ago
			`disable_existing_loggers()`
			`launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")`

			`m1 = nn.Sequential(nn.Linear(4, 8), nn.Linear(8, 4))`
			`sd1 = m1.state_dict()`
			`if gpc.get_global_rank() == 0:`
			`print(f"Rank {gpc.get_global_rank()}:\n{pprint.pformat(sd1)}\n")`
			`save_checkpoint("test.pt", 0, m1)`

			`m2 = nn.Sequential(col_nn.Linear(4, 8), col_nn.Linear(8, 4))`
			`if is_using_pp():`
			`m2 = build_pipeline(m2)`

			`load_checkpoint("test.pt", m2)`
			`sd2 = m2.state_dict()`
			`if is_using_pp() and gpc.get_local_rank(ParallelMode.TENSOR) == 0:`
			`sd2 = gather_pipeline_parallel_state_dict(sd2)`
			`print(f"Rank {gpc.get_global_rank()}:\n{pprint.pformat(sd2)}\n")`

			`if gpc.get_global_rank() == 0:`
			`for k, v in sd1.items():`
			`assert k in sd2`
			`check_equal(v, sd2[k].to(torch.device("cpu")))`


			`@pytest.mark.dist`
[test] skip tests when not enough GPUs are detected (#1090) * [test] skip tests when not enough GPUs are detected * polish code * polish code 3 years ago			`@skip_if_not_enough_gpus(min_gpus=8)`
[test] added missing decorators to model checkpointing tests 3 years ago			`@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".Address already in use.")`
[model checkpoint] added unit tests for checkpoint save/load (#599) 3 years ago			`def test_checkpoint_3d():`
			`world_size = 8`
			`run_func = partial(check_checkpoint_3d, world_size=world_size, port=free_port())`
			`mp.spawn(run_func, nprocs=world_size)`


			`if __name__ == "__main__":`
			`test_checkpoint_3d()`