ColossalAI/tests/test_zero_data_parallel/test_shard_param.py
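
"""Tests for ZeRO sharding utilities: ShardedTensor shard/gather round-trips with
TensorShardStrategy / BucketTensorShardStrategy, and ShardedParamV2 memory accounting
via get_memory_usage()."""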


from copy import deepcopy
from functools import partial
import colossalai
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.testing import parameterize
from colossalai.utils import free_port
from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardStrategy)
from colossalai.zero.sharded_param import ShardedTensor
from colossalai.zero.sharded_param.sharded_param import ShardedParamV2
from colossalai.testing import rerun_on_exception
from tests.test_zero_data_parallel.common import CONFIG, allclose


@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
def run_shard_tensor_with_strategy(shard_strategy_class, world_size):
    t = ShardedTensor(tensor=torch.randn(world_size * 2, 3))
    assert list(t.origin_shape) == [world_size * 2, 3]
    assert list(t.shape) == [world_size * 2, 3]

    shard_strategy = shard_strategy_class()

    # test shard strategy: shard() keeps a flattened 1/world_size slice of the tensor on each rank
    # (world_size * 2 * 3 / world_size = 6 elements here), gather() restores the original shape
    shard_strategy.shard([t])
    assert list(t.shape) == [6], f"{list(t.shape)} vs 6"
    shard_strategy.gather([t])
    assert list(t.shape) == [world_size * 2, 3], f"{list(t.shape)} vs {[world_size * 2, 3]}"


def _run_shard_tensor(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
    run_shard_tensor_with_strategy(world_size=world_size)


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_shard_tensor(world_size):
    run_func = partial(_run_shard_tensor, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


def _run_shard_param_v2(rank, world_size, port):
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')

    param = torch.nn.Parameter(torch.randn(2, 3))
    param_ref = deepcopy(param)
    sparam = ShardedParamV2(param=param, process_group=None)
    allclose(sparam.sharded_data_tensor.payload, param_ref.data)

    # Test get_memory_usage(): it returns a (cuda_bytes, cpu_bytes) pair; the fp32 payload and
    # the fp32 grad each occupy 2 * 3 elements * 4 bytes on CPU
    sparam.fp32_grad = torch.randn(2, 3)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2, f"cpu_mem_use: {cpu_mem_use}"
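
    # remove_torch_payload() replaces param.data with a 1-element dummy tensor kept on CPU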
    sparam.remove_torch_payload()
    assert (param.data.numel() == 1)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    # 4 is size of dummy tensor of param.data
    assert cpu_mem_use == 2 * 3 * 4 * 2 + 4
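
    # a fp16 grad allocated on CUDA should add 2 * 3 elements * 2 bytes of CUDA memory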
    sparam.fp16_grad = torch.randn(2, 3).cuda().half()
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2 + 4
    assert cuda_mem_use == 2 * 3 * 2
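
    # clearing the fp16 grad should bring CUDA usage back to zero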
    sparam.fp16_grad = None
    sparam.fp32_grad = torch.randn(2, 3)
    sparam.remove_torch_payload()
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2 + 4
    assert cuda_mem_use == 0
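
    # once param.data points at the sharded payload again, the 4-byte dummy is no longer counted
    # and the torch grad adds another 2 * 3 * 4 bytes on CPU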
    # append a grad to torch param
    param.data = sparam.sharded_data_tensor.payload
    param.grad = torch.randn(2, 3)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2 + 2 * 3 * 4, f"cpu_mem_use {cpu_mem_use}"
    assert cuda_mem_use == 0

    # reuse torch grad for sparam: the shared tensor should only be counted once
    sparam.fp32_grad = param.grad
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2
    assert cuda_mem_use == 0


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_shard_param_v2(world_size):
    run_func = partial(_run_shard_param_v2, world_size=world_size, port=free_port())
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    # test_shard_tensor(2)
    test_shard_param_v2(2)