# ColossalAI/tests/test_zero/test_shard_param.py

from copy import deepcopy
from functools import partial

import colossalai
import pytest
import torch
import torch.multiprocessing as mp
from colossalai.testing import parameterize, rerun_if_address_is_in_use
from colossalai.utils import free_port
from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardStrategy)
from colossalai.zero.sharded_param import ShardedTensor
from colossalai.zero.sharded_param.sharded_param import ShardedParamV2
from tests.test_zero.common import CONFIG, allclose
from colossalai.gemini.stateful_tensor import StatefulTensor


@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])
def run_shard_tensor_with_strategy(shard_strategy_class, world_size):
    """Shard and re-gather a ``ShardedTensor`` with the given strategy class.

    The tensor is created with shape ``[world_size * 2, 3]``; the shape it
    reports is asserted before sharding, after ``shard`` and after ``gather``.

    Args:
        shard_strategy_class: strategy class, instantiated with no arguments.
        world_size: number of processes; sets the first tensor dimension.
    """
    full_shape = [world_size * 2, 3]
    t = ShardedTensor(tensor=torch.randn(*full_shape))

    # A freshly wrapped tensor reports the full shape both as its origin
    # shape and as its current shape.
    assert list(t.origin_shape) == full_shape
    assert list(t.shape) == full_shape

    strategy = shard_strategy_class()

    # After sharding the test expects a flat 6-element tensor, i.e.
    # world_size * 2 * 3 elements divided by world_size
    # (presumably one equal shard per rank).
    strategy.shard([t])
    assert list(t.shape) == [6], f"{list(t.shape)} vs 6"

    # Gathering must bring the tensor back to its full shape.
    strategy.gather([t])
    assert list(t.shape) == full_shape, f"{list(t.shape)} vs {[world_size * 2, 3]}"


def _run_shard_tensor(rank, world_size, port):
    """Per-process entry point: launch colossalai, then run the shard/gather checks.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: total number of processes.
        port: port forwarded to ``colossalai.launch``.
    """
    launch_options = {
        'config': CONFIG,
        'rank': rank,
        'world_size': world_size,
        'host': 'localhost',
        'port': port,
        'backend': 'nccl',
    }
    colossalai.launch(**launch_options)
    run_shard_tensor_with_strategy(world_size=world_size)


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_shard_tensor(world_size):
    """Spawn ``world_size`` processes that each run ``_run_shard_tensor``.

    Args:
        world_size: number of processes to spawn (parametrized as 1 and 2).
    """
    # One port is picked up front and shared by every spawned process.
    port = free_port()
    worker = partial(_run_shard_tensor, world_size=world_size, port=port)
    mp.spawn(worker, nprocs=world_size)


def _run_shard_param_v2(rank, world_size, port):
    """Per-process entry point checking ``ShardedParamV2`` payload and memory accounting.

    Launches colossalai for this rank, wraps a 2x3 parameter in
    ``ShardedParamV2``, checks that its data payload matches a copy of the
    original parameter, and then asserts the CPU/CUDA byte counts returned by
    ``get_memory_usage()`` as the saved grad, ``param.data`` and ``param.grad``
    are changed.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: total number of processes.
        port: port forwarded to ``colossalai.launch``.
    """
    colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')

    param = torch.nn.Parameter(torch.randn(2, 3))
    param_ref = deepcopy(param)
    sparam = ShardedParamV2(param=param)

    # The comparison result has to be asserted: a bare call discards it, so
    # the payload check could never fail.
    assert allclose(sparam.data_payload, param_ref.data)

    # Test get memory usage
    # Expected sizes are written as 2 * 3 elements * 4 bytes per element
    # (presumably float32, the default dtype of torch.randn), times the number
    # of tensors counted.
    sparam.saved_grad = StatefulTensor(torch.randn(2, 3))
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2, f"cpu_mem_use: {cpu_mem_use}"

    sparam.set_data_none()
    assert (param.data.numel() == 0)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    # param.data is empty now, so the expected CPU usage is unchanged
    assert cpu_mem_use == 2 * 3 * 4 * 2

    # Replacing the saved grad with a new tensor of the same size must leave
    # the reported usage unchanged.
    sparam.saved_grad = StatefulTensor(torch.randn(2, 3))
    sparam.set_data_none()
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2
    assert cuda_mem_use == 0

    # append a grad to torch param
    # The separate param.grad tensor is expected to add one more 2x3 tensor.
    param.data = sparam.data_payload
    param.grad = torch.randn(2, 3)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2 + 2 * 3 * 4, f"cpu_mem_use {cpu_mem_use}"
    assert cuda_mem_use == 0

    # reuse torch grad for sparam
    # The saved grad now wraps param.grad itself; the expected total drops
    # back to two tensors (presumably the shared tensor is counted once).
    sparam.saved_grad = StatefulTensor(param.grad)
    cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()
    assert cpu_mem_use == 2 * 3 * 4 * 2
    assert cuda_mem_use == 0


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [1, 2])
@rerun_if_address_is_in_use()
def test_shard_param_v2(world_size):
    """Spawn ``world_size`` processes that each run ``_run_shard_param_v2``.

    Args:
        world_size: number of processes to spawn (parametrized as 1 and 2).
    """
    # One port is picked up front and shared by every spawned process.
    port = free_port()
    worker = partial(_run_shard_param_v2, world_size=world_size, port=port)
    mp.spawn(worker, nprocs=world_size)


if __name__ == '__main__':
    # Direct-run entry point: only the ShardedParamV2 test is executed, with
    # world_size=2; the shard-tensor test below is left disabled.
    # test_shard_tensor(2)
    test_shard_param_v2(2)
[zero] yet an improved sharded param (#311) 3 years ago			`from copy import deepcopy`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 3 years ago			`from functools import partial`

[zero] Update sharded model v2 using sharded param v2 (#323) 3 years ago			`import colossalai`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 3 years ago			`import pytest`
			`import torch`
			`import torch.multiprocessing as mp`
[test] refactored with the new rerun decorator (#763) * [test] refactored with the new rerun decorator * polish test case 3 years ago			`from colossalai.testing import parameterize, rerun_if_address_is_in_use`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 3 years ago			`from colossalai.utils import free_port`
polish unit test 3 years ago			`from colossalai.zero.shard_utils import (BucketTensorShardStrategy, TensorShardStrategy)`
[zero] polish sharded param name (#484) * [zero] polish sharded param name * polish code * polish * polish code * polish * polsih * polish 3 years ago			`from colossalai.zero.sharded_param import ShardedTensor`
[zero] Update sharded model v2 using sharded param v2 (#323) 3 years ago			`from colossalai.zero.sharded_param.sharded_param import ShardedParamV2`
[utils] correct cpu memory used and capacity in the context of multi-process (#726) 3 years ago			`from tests.test_zero.common import CONFIG, allclose`
[gemini] add GeminiMemoryManger (#832) * refactor StatefulTensor, tensor utilities * add unitest for GeminiMemoryManager 3 years ago			`from colossalai.gemini.stateful_tensor import StatefulTensor`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 3 years ago
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 3 years ago
[zero] Update initialize for ZeRO (#458) * polish code * shard strategy receive pg in shard() / gather() * update zero engine * polish code 3 years ago			`@parameterize("shard_strategy_class", [TensorShardStrategy, BucketTensorShardStrategy])`
			`def run_shard_tensor_with_strategy(shard_strategy_class, world_size):`
[zero] sharded tensor (#305) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code 3 years ago			`t = ShardedTensor(tensor=torch.randn(world_size * 2, 3))`
[zero] polish shard strategy (#310) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code * add shard stratgy * move shard and gather logic to shard strategy from shard tensor. * polish code 3 years ago			`assert list(t.origin_shape) == [world_size * 2, 3]`
[zero] sharded tensor (#305) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code 3 years ago			`assert list(t.shape) == [world_size * 2, 3]`
[zero] polish shard strategy (#310) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code * add shard stratgy * move shard and gather logic to shard strategy from shard tensor. * polish code 3 years ago
[zero] Update initialize for ZeRO (#458) * polish code * shard strategy receive pg in shard() / gather() * update zero engine * polish code 3 years ago			`shard_strategy = shard_strategy_class()`
[zero] sharded tensor (#305) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code 3 years ago
[zero] a shard strategy in granularity of tensor (#307) 3 years ago			`# test shard strategy`
			`shard_strategy.shard([t])`
[zero] polish shard strategy (#310) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code * add shard stratgy * move shard and gather logic to shard strategy from shard tensor. * polish code 3 years ago			`assert list(t.shape) == [6], f"{list(t.shape)} vs 6"`
[zero] a shard strategy in granularity of tensor (#307) 3 years ago			`shard_strategy.gather([t])`
[zero] polish shard strategy (#310) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code * add shard stratgy * move shard and gather logic to shard strategy from shard tensor. * polish code 3 years ago			`assert list(t.shape) == [world_size * 2, 3], f"{list(t.shape)} vs {[world_size * 2, 3]}"`
[zero] sharded tensor (#305) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code 3 years ago

[test] optimized zero data parallel test (#452) 3 years ago			`def _run_shard_tensor(rank, world_size, port):`
			`colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')`
			`run_shard_tensor_with_strategy(world_size=world_size)`


[zero] sharded tensor (#305) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code 3 years ago			`@pytest.mark.dist`
using pytest parametrize 3 years ago			`@pytest.mark.parametrize("world_size", [1, 2])`
[test] refactored with the new rerun decorator (#763) * [test] refactored with the new rerun decorator * polish test case 3 years ago			`@rerun_if_address_is_in_use()`
[test] optimized zero data parallel test (#452) 3 years ago			`def test_shard_tensor(world_size):`
			`run_func = partial(_run_shard_tensor, world_size=world_size, port=free_port())`
[zero] sharded tensor (#305) * init shard param from shape tuple * add more unitest for shard param * add set_payload method for ShardedParam * [zero] add shareded tensor class * polish code 3 years ago			`mp.spawn(run_func, nprocs=world_size)`


[zero] yet an improved sharded param (#311) 3 years ago			`def _run_shard_param_v2(rank, world_size, port):`
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 3 years ago			`colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')`

[zero] yet an improved sharded param (#311) 3 years ago			`param = torch.nn.Parameter(torch.randn(2, 3))`
			`param_ref = deepcopy(param)`
[zero] label state for param fp16 and grad (#551) 3 years ago			`sparam = ShardedParamV2(param=param)`
[zero] yet an improved sharded param (#311) 3 years ago
[zero] refactor ShardedParamV2 for convenience (#742) 3 years ago			`allclose(sparam.data_payload, param_ref.data)`
[zero] update zero context init with the updated test utils (#327) 3 years ago
[zero] get memory usage for sharded param (#536) 3 years ago			`# Test get memory usage`
[zero] hijack p.grad in sharded model (#554) * hijack p.grad in sharded model * polish comments * polish comments 3 years ago			`sparam.saved_grad = StatefulTensor(torch.randn(2, 3))`
[zero] get memory usage for sharded param (#536) 3 years ago			`cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()`
[zero] improve the accuracy of get_memory_usage of sharded param (#538) 3 years ago			`assert cpu_mem_use == 2 * 3 * 4 * 2, f"cpu_mem_use: {cpu_mem_use}"`

[zero] refactor ShardedParamV2 for convenience (#742) 3 years ago			`sparam.set_data_none()`
[zero] adapt zero hooks for unsharded module (#699) 3 years ago			`assert (param.data.numel() == 0)`
[zero] improve the accuracy of get_memory_usage of sharded param (#538) 3 years ago			`cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()`
			`# 4 is size of dummy tensor of param.data`
[zero] adapt zero hooks for unsharded module (#699) 3 years ago			`assert cpu_mem_use == 2 * 3 * 4 * 2`
[zero] get memory usage for sharded param (#536) 3 years ago
[zero] hijack p.grad in sharded model (#554) * hijack p.grad in sharded model * polish comments * polish comments 3 years ago			`sparam.saved_grad = StatefulTensor(torch.randn(2, 3))`
[zero] refactor ShardedParamV2 for convenience (#742) 3 years ago			`sparam.set_data_none()`
[zero] get memory usage for sharded param (#536) 3 years ago			`cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()`
[zero] adapt zero hooks for unsharded module (#699) 3 years ago			`assert cpu_mem_use == 2 * 3 * 4 * 2`
[zero] improve the accuracy of get_memory_usage of sharded param (#538) 3 years ago			`assert cuda_mem_use == 0`

			`# append a grad to torch param`
[zero] refactor ShardedParamV2 for convenience (#742) 3 years ago			`param.data = sparam.data_payload`
[zero] improve the accuracy of get_memory_usage of sharded param (#538) 3 years ago			`param.grad = torch.randn(2, 3)`
			`cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()`
			`assert cpu_mem_use == 2 * 3 * 4 * 2 + 2 * 3 * 4, f"cpu_mem_use {cpu_mem_use}"`
			`assert cuda_mem_use == 0`

			`# reuse torch grad for sparam`
[zero] hijack p.grad in sharded model (#554) * hijack p.grad in sharded model * polish comments * polish comments 3 years ago			`sparam.saved_grad = StatefulTensor(param.grad)`
[zero] improve the accuracy of get_memory_usage of sharded param (#538) 3 years ago			`cuda_mem_use, cpu_mem_use = sparam.get_memory_usage()`
[zero] get memory usage for sharded param (#536) 3 years ago			`assert cpu_mem_use == 2 * 3 * 4 * 2`
			`assert cuda_mem_use == 0`

Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 3 years ago
[zero] yet an improved sharded param (#311) 3 years ago			`@pytest.mark.dist`
using pytest parametrize 3 years ago			`@pytest.mark.parametrize("world_size", [1, 2])`
[test] refactored with the new rerun decorator (#763) * [test] refactored with the new rerun decorator * polish test case 3 years ago			`@rerun_if_address_is_in_use()`
using pytest parametrize 3 years ago			`def test_shard_param_v2(world_size):`
[zero] yet an improved sharded param (#311) 3 years ago			`run_func = partial(_run_shard_param_v2, world_size=world_size, port=free_port())`
			`mp.spawn(run_func, nprocs=world_size)`
Polish sharded parameter (#297) * init shard param from shape tuple * add more unitest for shard param * add more unittests to shareded param 3 years ago
[zero] yet an improved sharded param (#311) 3 years ago
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 3 years ago			`if __name__ == '__main__':`
[zero] get memory usage for sharded param (#536) 3 years ago			`# test_shard_tensor(2)`
using pytest parametrize 3 years ago			`test_shard_param_v2(2)`