# ColossalAI/tests/test_zero/test_gemini/test_chunkv2.py

import pytest
import torch
import torch.distributed as dist
from torch.distributed.distributed_c10d import _get_default_group

import colossalai
from colossalai.tensor import ColoParameter
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device
from colossalai.zero.gemini import TensorState
from colossalai.zero.gemini.chunk import Chunk


def dist_sum(x):
    """All-reduce the scalar ``x`` over the default process group.

    ``x`` is wrapped in a one-element tensor placed on the current device,
    reduced in place with ``dist.all_reduce`` (called with its default
    reduction op), and the resulting value is returned as a Python number.
    """
    device = get_current_device()
    reduced = torch.tensor([x], device=device)
    dist.all_reduce(reduced)
    return reduced.item()


def add_param(param_list, param_cp_list, *args, **kwargs):
    """Create a random ``ColoParameter`` and record it alongside a clone.

    All positional and keyword arguments are forwarded unchanged to
    ``torch.randn``. The new parameter is appended to ``param_list`` and the
    result of ``param.clone()`` is appended to ``param_cp_list``; both lists
    are mutated in place and nothing is returned.
    """
    values = torch.randn(*args, **kwargs)
    new_param = ColoParameter(values)
    param_list.append(new_param)
    param_cp_list.append(new_param.clone())


def check_equal(param, param_cp):
    """Return ``True`` when ``param`` and ``param_cp`` hold identical data.

    If the two tensors report different devices, ``param.data`` is first
    moved to ``param_cp``'s device so that ``torch.equal`` can compare them.
    """
    same_device = param.device == param_cp.device
    candidate = param.data if same_device else param.data.to(param_cp.device)
    return torch.equal(candidate, param_cp.data)


@parameterize('init_device', [None, torch.device('cpu')])
@parameterize('keep_gathered', [True, False])
@parameterize('pin_memory', [True, False])
def exam_chunk_basic(init_device, keep_gathered, pin_memory):
    """Walk a ``Chunk`` through append, close, access, state changes and reduce.

    Must be called inside an initialised distributed process (it queries the
    world size and the default process group). The three arguments are
    forwarded to the ``Chunk`` constructor and are supplied by the
    ``parameterize`` decorators above.

    Args:
        init_device: value passed as ``Chunk(init_device=...)``.
        keep_gathered: value passed as ``Chunk(keep_gathered=...)``; also
            selects which set of post-close / post-reduce assertions applies.
        pin_memory: value passed as ``Chunk(pin_memory=...)``.

    Raises:
        AssertionError: if any checked chunk property or tensor content
            differs from what the test expects.
    """
    world_size = dist.get_world_size()
    pg = _get_default_group()
    my_chunk = Chunk(chunk_size=1024,
                     process_group=pg,
                     dtype=torch.float32,
                     init_device=init_device,
                     cpu_shard_init=True,
                     keep_gathered=keep_gathered,
                     pin_memory=pin_memory)

    param_list = []
    param_cp_list = []

    # Element counts: 8*8*8 + 4*4 + 4*8*2 + 1*1*5 = 512 + 16 + 64 + 5 = 597.
    add_param(param_list, param_cp_list, 8, 8, 8, device='cuda')
    add_param(param_list, param_cp_list, 4, 4)
    add_param(param_list, param_cp_list, 4, 8, 2, device='cuda')
    add_param(param_list, param_cp_list, 1, 1, 5)

    for param in param_list:
        my_chunk.append_tensor(param)
    assert my_chunk.utilized_size == 597
    # The comparison result must be asserted; a bare call would silently
    # ignore any mismatch between a parameter and its saved clone.
    for param, param_cp in zip(param_list, param_cp_list):
        assert check_equal(param, param_cp), "param differs from its clone after append_tensor"
    my_chunk.close_chunk()

    if keep_gathered:
        assert my_chunk.cuda_global_chunk.size(0) == 1024
        assert my_chunk.device_type == 'cuda'
        assert not my_chunk.can_move
    else:
        assert my_chunk.cpu_shard.size(0) == 1024 // world_size
        assert my_chunk.device_type == 'cpu'
        assert my_chunk.can_move
        my_chunk.shard_move(get_current_device())

    # Summed over all ranks, valid_end must add up to the utilized size.
    assert dist_sum(my_chunk.valid_end) == my_chunk.utilized_size
    flag = my_chunk.has_inf_or_nan
    assert not flag, "has_inf_or_nan is {}".format(flag)

    my_chunk.access_chunk()
    assert my_chunk.device_type == 'cuda'
    for param, param_cp in zip(param_list, param_cp_list):
        assert check_equal(param, param_cp), "param differs from its clone after access_chunk"

    # Moving one tensor HOLD -> COMPUTE must update both counters.
    assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4
    my_chunk.tensor_trans_state(param_list[0], TensorState.COMPUTE)
    assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 3
    assert my_chunk.tensor_state_cnter[TensorState.COMPUTE] == 1
    assert not my_chunk.can_release

    # Drive every tensor through COMPUTE -> HOLD_AFTER_BWD -> READY_FOR_REDUCE.
    for param in param_list:
        my_chunk.tensor_trans_state(param, TensorState.COMPUTE)
        my_chunk.tensor_trans_state(param, TensorState.HOLD_AFTER_BWD)
        my_chunk.tensor_trans_state(param, TensorState.READY_FOR_REDUCE)

    assert my_chunk.tensor_state_cnter[TensorState.READY_FOR_REDUCE] == 4
    assert my_chunk.can_reduce
    my_chunk.reduce()
    assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4

    if keep_gathered:
        assert my_chunk.cuda_global_chunk.size(0) == 1024
        assert my_chunk.device_type == 'cuda'
        assert not my_chunk.can_move
    else:
        assert my_chunk.cuda_shard.size(0) == 1024 // world_size
        assert my_chunk.device_type == 'cuda'
        assert my_chunk.can_move


def run_dist(rank, world_size, port):
    """Per-process entry point: launch ColossalAI, then run the chunk exam.

    ``rank``, ``world_size`` and ``port`` are forwarded to
    ``colossalai.launch`` together with an empty config, host ``localhost``
    and the ``nccl`` backend. ``exam_chunk_basic`` is then called with no
    arguments.
    """
    launch_kwargs = dict(
        config={},
        rank=rank,
        world_size=world_size,
        host='localhost',
        port=port,
        backend='nccl',
    )
    colossalai.launch(**launch_kwargs)
    exam_chunk_basic()


@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2, 4])
@rerun_if_address_is_in_use()
def test_chunk_function(world_size):
    """Pytest entry point: hand ``run_dist`` and ``world_size`` to ``spawn``.

    Parametrized over world sizes 1, 2 and 4.
    NOTE(review): presumably ``spawn`` starts ``world_size`` worker processes
    and supplies ``rank``/``world_size``/``port`` to ``run_dist`` -- confirm
    against ``colossalai.testing.spawn``.
    """
    spawn(run_dist, world_size)


# When executed directly as a script, run the test once with world_size=4.
if __name__ == '__main__':
    test_chunk_function(4)
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`import pytest`
			`import torch`
			`import torch.distributed as dist`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`from 
torch.distributed.distributed_c10d import _get_default_group`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00
			`import colossalai`
			`from colossalai.tensor import ColoParameter`
[test] refactor tests with spawn (#3452) * [test] added spawn decorator * polish code * polish code * polish code * polish code * polish code * polish code 2023-04-06 06:51:35 +00:00			`from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn`
			`from colossalai.utils import get_current_device`
[zero] reorganize zero/gemini folder structure (#3424) * [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import 2023-04-04 05:48:16 +00:00			`from colossalai.zero.gemini import TensorState`
			`from colossalai.zero.gemini.chunk import Chunk`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00

			`def dist_sum(x):`
			`temp = torch.tensor([x], device=get_current_device())`
			`dist.all_reduce(temp)`
			`return temp.item()`


			`def add_param(param_list, param_cp_list, *args, **kwargs):`
			`param = ColoParameter(torch.randn(*args, **kwargs))`
			`param_list.append(param)`
			`param_cp_list.append(param.clone())`


[CI] fix typo with tests/ etc. (#3727) * fix spelling error with examples/comminity/ * fix spelling error with tests/ * fix some spelling error with tests/ colossalai/ etc. * fix spelling error with tests/ etc. date:2023.5.10 2023-05-11 08:30:58 +00:00			`def check_equal(param, param_cp):`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`if param.device != param_cp.device:`
			`temp = param.data.to(param_cp.device)`
			`else:`
			`temp = param.data`
			`return torch.equal(temp, param_cp.data)`


			`@parameterize('init_device', [None, torch.device('cpu')])`
			`@parameterize('keep_gathered', [True, False])`
			`@parameterize('pin_memory', [True, False])`
			`def exam_chunk_basic(init_device, keep_gathered, pin_memory):`
			`world_size = torch.distributed.get_world_size()`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`pg = 
_get_default_group()`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`my_chunk = Chunk(chunk_size=1024,`
			`process_group=pg,`
			`dtype=torch.float32,`
			`init_device=init_device,`
			`cpu_shard_init=True,`
			`keep_gathered=keep_gathered,`
			`pin_memory=pin_memory)`

			`param_list = []`
			`param_cp_list = []`

			`add_param(param_list, param_cp_list, 8, 8, 8, device='cuda')`
			`add_param(param_list, param_cp_list, 4, 4)`
			`add_param(param_list, param_cp_list, 4, 8, 2, device='cuda')`
			`add_param(param_list, param_cp_list, 1, 1, 5)`

			`for param in param_list:`
			`my_chunk.append_tensor(param)`
			`assert my_chunk.utilized_size == 597`
			`for param, param_cp in zip(param_list, param_cp_list):`
[CI] fix typo with tests/ etc. (#3727) * fix spelling error with examples/comminity/ * fix spelling error with tests/ * fix some spelling error with tests/ colossalai/ etc. * fix spelling error with tests/ etc. date:2023.5.10 2023-05-11 08:30:58 +00:00			`check_equal(param, param_cp)`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`my_chunk.close_chunk()`

			`if keep_gathered is False:`
			`assert my_chunk.cpu_shard.size(0) == 1024 // world_size`
			`assert my_chunk.device_type == 'cpu'`
			`assert my_chunk.can_move`
			`my_chunk.shard_move(get_current_device())`
			`else:`
[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00			`assert my_chunk.cuda_global_chunk.size(0) == 1024`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`assert my_chunk.device_type == 'cuda'`
			`assert not my_chunk.can_move`

			`assert dist_sum(my_chunk.valid_end) == my_chunk.utilized_size`
			`flag = my_chunk.has_inf_or_nan`
			`assert not flag, "has_inf_or_nan is {}".format(flag)`

			`my_chunk.access_chunk()`
			`assert my_chunk.device_type == 'cuda'`
			`for param, param_cp in zip(param_list, param_cp_list):`
[CI] fix typo with tests/ etc. (#3727) * fix spelling error with examples/comminity/ * fix spelling error with tests/ * fix some spelling error with tests/ colossalai/ etc. * fix spelling error with tests/ etc. date:2023.5.10 2023-05-11 08:30:58 +00:00			`check_equal(param, param_cp)`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00
[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00			`assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`my_chunk.tensor_trans_state(param_list[0], TensorState.COMPUTE)`
[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00			`assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 3`
			`assert my_chunk.tensor_state_cnter[TensorState.COMPUTE] == 1`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`assert not my_chunk.can_release`

			`for param in param_list:`
			`my_chunk.tensor_trans_state(param, TensorState.COMPUTE)`
[zero] fix error for BEiT models (#2169) * [zero] fix error for BEiT models * [ColoParameter] add unpack operation for tuple arguments * fix bugs * fix chunkv2 unit testing * add assertion for gradient state 2022-12-26 07:03:54 +00:00			`my_chunk.tensor_trans_state(param, TensorState.HOLD_AFTER_BWD)`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`my_chunk.tensor_trans_state(param, TensorState.READY_FOR_REDUCE)`

[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00			`assert my_chunk.tensor_state_cnter[TensorState.READY_FOR_REDUCE] == 4`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`assert my_chunk.can_reduce`
			`my_chunk.reduce()`
[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00			`assert my_chunk.tensor_state_cnter[TensorState.HOLD] == 4`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00
			`if keep_gathered is False:`
			`assert my_chunk.cuda_shard.size(0) == 1024 // world_size`
			`assert my_chunk.device_type == 'cuda'`
			`assert my_chunk.can_move`
			`else:`
[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00			`assert my_chunk.cuda_global_chunk.size(0) == 1024`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00			`assert my_chunk.device_type == 'cuda'`
			`assert not my_chunk.can_move`


			`def run_dist(rank, world_size, port):`
			`colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')`
			`exam_chunk_basic()`


			`@pytest.mark.dist`
			`@pytest.mark.parametrize('world_size', [1, 2, 4])`
			`@rerun_if_address_is_in_use()`
			`def test_chunk_function(world_size):`
[test] refactor tests with spawn (#3452) * [test] added spawn decorator * polish code * polish code * polish code * polish code * polish code * polish code 2023-04-06 06:51:35 +00:00			`spawn(run_dist, world_size)`
[hotfix] fix zero's incompatibility with checkpoint in torch-1.12 (#1786) * [hotfix] fix zero's incompatibility with checkpoint in torch-1.12 * [zero] add cpu shard init * [zero] add tiny example test * [colo_tensor] fix bugs for torch-1.11 2022-11-02 08:11:34 +00:00

			`if __name__ == '__main__':`
			`test_chunk_function(4)`