ColossalAI/tests/test_zero_data_parallel/test_zero_engine.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from functools import partial

import colossalai
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from colossalai.core import global_context as gpc
from colossalai.utils import free_port
from colossalai.zero.init_ctx import ZeroInitContext
from colossalai.zero.sharded_model.utils import col_model_deepcopy
from colossalai.zero.sharded_optim._utils import has_inf_or_nan
from colossalai.testing import rerun_on_exception
from tests.components_to_test.registry import non_distributed_component_funcs
from torch.nn.parallel import DistributedDataParallel as DDP

from common import (MP_PARALLEL_CONFIG, ZERO_PARALLEL_CONFIG, check_params, check_sharded_model_params)


def run_dist(rank, world_size, port, parallel_config):
    """Per-process worker: train each test model with the ColossalAI engine and a PyTorch reference.

    The process is launched through ``colossalai.launch`` on ``localhost`` with the
    ``nccl`` backend. For every test model, the ColossalAI engine and a plain
    PyTorch copy of the model run the same training steps on the same batches,
    and their parameters are compared afterwards.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: number of processes, forwarded to ``colossalai.launch``.
        port: port forwarded to ``colossalai.launch``.
        parallel_config: config forwarded to ``colossalai.launch``; it also
            selects which parameter check runs after training.
    """
    colossalai.launch(config=parallel_config,
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=port,
                      backend='nccl')

    for model_name in ('repeated_computed_layers', 'resnet18', 'bert'):
        components = non_distributed_component_funcs.get_callable(model_name)()
        model_builder, train_dataloader, _, optimizer_class, criterion = components

        # Build the ColossalAI model inside the ZeRO init context.
        zero_init_ctx = ZeroInitContext(convert_fp16=hasattr(gpc.config, 'fp16'),
                                        target_device=torch.cuda.current_device(),
                                        shard_strategy=gpc.config.zero.model_config.shard_strategy,
                                        shard_param=True)
        with zero_init_ctx:
            colo_model = model_builder(checkpoint=True)

        colo_optimizer = optimizer_class(colo_model.parameters(), lr=1e-3)
        engine, train_dataloader, _, _ = colossalai.initialize(colo_model,
                                                               optimizer=colo_optimizer,
                                                               criterion=criterion,
                                                               train_dataloader=train_dataloader)

        # Reference model: created in half precision for the copy step, then
        # moved to CUDA and cast to fp32.
        # NOTE(review): col_model_deepcopy presumably copies the engine model's
        # parameters into torch_model -- confirm against its implementation.
        torch_model = model_builder(checkpoint=True).half()
        col_model_deepcopy(engine.model, torch_model)
        torch_model = torch_model.cuda().float()

        engine.train()
        torch_optimizer = optimizer_class(torch_model.parameters(), lr=1e-3)

        # Wrap the reference in DDP only when there is more than one process.
        if dist.get_world_size() > 1:
            torch_model = DDP(torch_model)

        for step, (data, label) in enumerate(train_dataloader):
            # Only the first five batches are trained on.
            if step > 4:
                break

            data = data.cuda()
            label = label.cuda()

            engine.zero_grad()
            torch_optimizer.zero_grad()

            if not criterion:
                # Without a criterion the models are called with (data, label)
                # and their output is used directly as the loss.
                loss = engine(data, label)
                torch_loss = torch_model(data, label)
            else:
                output = engine(data)
                loss = engine.criterion(output, label)

                torch_output = torch_model(data)
                torch_loss = engine.criterion(torch_output, label)

            engine.backward(loss)
            engine.step()

            torch_loss.backward()

            # The reference gradients must not contain inf/NaN before stepping.
            for ref_param in torch_model.parameters():
                if ref_param.grad is None:
                    continue
                assert not has_inf_or_nan(ref_param.grad)

            torch_optimizer.step()

        # Pick the parameter comparison that matches the parallel config in use.
        if parallel_config == MP_PARALLEL_CONFIG:
            check_params(torch_model, colo_model, loose=True)
        elif parallel_config == ZERO_PARALLEL_CONFIG:
            check_sharded_model_params(torch_model, colo_model, loose=True)


# FIXME: test_mp_engine below is currently skipped; re-enable it in the next PR


@pytest.mark.skip
@pytest.mark.dist
@pytest.mark.parametrize("world_size", [2, 4])
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_mp_engine(world_size):
    """Spawn ``world_size`` processes that run ``run_dist`` with ``MP_PARALLEL_CONFIG``.

    This test is currently skipped via ``pytest.mark.skip``.
    """
    # mp.spawn calls run_dist(rank, *args), so the tuple supplies
    # (world_size, port, parallel_config) in run_dist's parameter order.
    spawn_args = (world_size, free_port(), MP_PARALLEL_CONFIG)
    mp.spawn(run_dist, args=spawn_args, nprocs=world_size)


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_zero_engine(world_size):
    """Spawn ``world_size`` processes that run ``run_dist`` with ``ZERO_PARALLEL_CONFIG``."""
    # mp.spawn calls run_dist(rank, *args), so the tuple supplies
    # (world_size, port, parallel_config) in run_dist's parameter order.
    spawn_args = (world_size, free_port(), ZERO_PARALLEL_CONFIG)
    mp.spawn(run_dist, args=spawn_args, nprocs=world_size)


if __name__ == '__main__':
    # Direct (non-pytest) entry point: runs the ZeRO engine test with 4 processes.
    # NOTE(review): 4 is not one of the pytest-parametrized world sizes ([1, 2]).
    test_zero_engine(world_size=4)
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`#!/usr/bin/env python`
			`# -- encoding: utf-8 --`

			`from functools import partial`

			`import colossalai`
update sharded optim and fix zero init ctx (#457) 2022-03-18 07:44:47 +00:00			`import pytest`
			`import torch`
			`import torch.distributed as dist`
			`import torch.multiprocessing as mp`
			`from colossalai.core import global_context as gpc`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`from colossalai.utils import free_port`
update sharded optim and fix zero init ctx (#457) 2022-03-18 07:44:47 +00:00			`from colossalai.zero.init_ctx import ZeroInitContext`
			`from colossalai.zero.sharded_model.utils import col_model_deepcopy`
[test] make zero engine test really work (#447) 2022-03-17 09:24:25 +00:00			`from colossalai.zero.sharded_optim._utils import has_inf_or_nan`
[test] fixed rerun_on_exception and adapted test cases (#487) 2022-03-25 09:25:12 +00:00			`from colossalai.testing import rerun_on_exception`
update sharded optim and fix zero init ctx (#457) 2022-03-18 07:44:47 +00:00			`from tests.components_to_test.registry import non_distributed_component_funcs`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`from torch.nn.parallel import DistributedDataParallel as DDP`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00
[zero] sharded model support the reuse of fp16 shard (#495) * sharded model supports reuse fp16 shard * rename variable * polish code * polish code * polish code 2022-03-23 06:59:59 +00:00			`from common import (MP_PARALLEL_CONFIG, ZERO_PARALLEL_CONFIG, check_params, check_sharded_model_params)`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00

[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`def run_dist(rank, world_size, port, parallel_config):`
			`colossalai.launch(config=parallel_config,`
[unitest] polish zero config in unittest (#438) 2022-03-17 02:20:53 +00:00			`rank=rank,`
			`world_size=world_size,`
			`host='localhost',`
			`port=port,`
			`backend='nccl')`

			`test_models = ['repeated_computed_layers', 'resnet18', 'bert']`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`for model_name in test_models:`
			`get_components_func = non_distributed_component_funcs.get_callable(model_name)`
			`model_builder, train_dataloader, _, optimizer_class, criterion = get_components_func()`
update sharded optim and fix zero init ctx (#457) 2022-03-18 07:44:47 +00:00			`with ZeroInitContext(convert_fp16=hasattr(gpc.config, 'fp16'),`
			`target_device=torch.cuda.current_device(),`
[zero] Update initialize for ZeRO (#458) * polish code * shard strategy receive pg in shard() / gather() * update zero engine * polish code 2022-03-18 08:18:31 +00:00			`shard_strategy=gpc.config.zero.model_config.shard_strategy,`
update sharded optim and fix zero init ctx (#457) 2022-03-18 07:44:47 +00:00			`shard_param=True):`
			`colo_model = model_builder(checkpoint=True)`

[zero] Update initialize for ZeRO (#458) * polish code * shard strategy receive pg in shard() / gather() * update zero engine * polish code 2022-03-18 08:18:31 +00:00			`colo_optimizer = optimizer_class(colo_model.parameters(), lr=1e-3)`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`engine, train_dataloader, _, _ = colossalai.initialize(colo_model,`
[zero] Update initialize for ZeRO (#458) * polish code * shard strategy receive pg in shard() / gather() * update zero engine * polish code 2022-03-18 08:18:31 +00:00			`optimizer=colo_optimizer,`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`criterion=criterion,`
			`train_dataloader=train_dataloader)`
[zero] Update initialize for ZeRO (#458) * polish code * shard strategy receive pg in shard() / gather() * update zero engine * polish code 2022-03-18 08:18:31 +00:00			`torch_model = model_builder(checkpoint=True).half()`
			`col_model_deepcopy(engine.model, torch_model)`
			`torch_model = torch_model.cuda().float()`

[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`engine.train()`
[test] make zero engine test really work (#447) 2022-03-17 09:24:25 +00:00			`torch_optimizer = optimizer_class(torch_model.parameters(), lr=1e-3)`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`if dist.get_world_size() > 1:`
			`torch_model = DDP(torch_model)`

[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`i = 0`
			`for data, label in train_dataloader:`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`if i > 4:`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`break`

			`data, label = data.cuda(), label.cuda()`

			`engine.zero_grad()`
			`torch_optimizer.zero_grad()`

			`if criterion:`
			`output = engine(data)`
			`loss = engine.criterion(output, label)`

[unitest] polish zero config in unittest (#438) 2022-03-17 02:20:53 +00:00			`torch_output = torch_model(data)`
			`torch_loss = engine.criterion(torch_output, label)`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`else:`
			`loss = engine(data, label)`
			`torch_loss = torch_model(data, label)`

			`engine.backward(loss)`
			`engine.step()`

			`torch_loss.backward()`
[test] make zero engine test really work (#447) 2022-03-17 09:24:25 +00:00
			`for param in torch_model.parameters():`
			`if param.grad is not None:`
			`assert not has_inf_or_nan(param.grad)`

[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`torch_optimizer.step()`
			`i += 1`

[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`if parallel_config == MP_PARALLEL_CONFIG:`
			`check_params(torch_model, colo_model, loose=True)`
[test] make zero engine test really work (#447) 2022-03-17 09:24:25 +00:00			`elif parallel_config == ZERO_PARALLEL_CONFIG:`
[zero] sharded model support the reuse of fp16 shard (#495) * sharded model supports reuse fp16 shard * rename variable * polish code * polish code * polish code 2022-03-23 06:59:59 +00:00			`check_sharded_model_params(torch_model, colo_model, loose=True)`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00

update sharded optim and fix zero init ctx (#457) 2022-03-18 07:44:47 +00:00			`# FIXME: enable this test in next PR`


			`@pytest.mark.skip`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`@pytest.mark.dist`
			`@pytest.mark.parametrize("world_size", [2, 4])`
[test] fixed rerun_on_exception and adapted test cases (#487) 2022-03-25 09:25:12 +00:00			`@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".Address already in use.")`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`def test_mp_engine(world_size):`
			`run_func = partial(run_dist, world_size=world_size, port=free_port(), parallel_config=MP_PARALLEL_CONFIG)`
			`mp.spawn(run_func, nprocs=world_size)`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00

			`@pytest.mark.dist`
			`@pytest.mark.parametrize("world_size", [1, 2])`
[test] fixed rerun_on_exception and adapted test cases (#487) 2022-03-25 09:25:12 +00:00			`@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".Address already in use.")`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`def test_zero_engine(world_size):`
			`run_func = partial(run_dist, world_size=world_size, port=free_port(), parallel_config=ZERO_PARALLEL_CONFIG)`
[refactory] refactory the initialize method for new zero design (#431) 2022-03-16 11:29:37 +00:00			`mp.spawn(run_func, nprocs=world_size)`


			`if __name__ == '__main__':`
[hotfix] fix initialize bug with zero (#442) 2022-03-17 05:16:22 +00:00			`test_zero_engine(world_size=4)`