ColossalAI/colossalai/zero/legacy/sharded_model/_utils.py

from typing import Any, Callable, List, Tuple, Union

import torch
import torch.nn.functional as F

from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor


def get_gradient_predivide_factor(world_size: int) -> float:
    """Return a power-of-two factor derived from ``world_size``.

    Starting from 1, the candidate is doubled for as long as it divides
    ``world_size`` evenly and ``world_size / candidate`` is still strictly
    greater than the candidate itself.

    Args:
        world_size: Number of participating ranks.

    Returns:
        The final candidate, converted to ``float``.
    """
    candidate = 1
    while True:
        if world_size % candidate != 0:
            # The candidate no longer divides world_size evenly.
            break
        if not (world_size / candidate > candidate):
            # The candidate has caught up with the remaining quotient.
            break
        candidate *= 2
    return float(candidate)


def free_storage(data: torch.Tensor) -> None:
    """Shrink the storage backing ``data`` to zero elements.

    Does nothing when the storage is already empty.

    Args:
        data: Tensor whose underlying storage should be released.

    Raises:
        AssertionError: If the storage is non-empty and ``data`` does not
            start at offset 0 of that storage.
    """
    backing = data.storage()
    if not backing.size() > 0:
        return
    # The storage is resized in place, so the tensor is required to start at
    # the very beginning of it (i.e. it is expected to own the whole storage).
    assert data.storage_offset() == 0
    backing.resize_(0)


@torch.no_grad()
def alloc_storage(data: torch.Tensor, size: torch.Size) -> None:
    """Grow the (empty) storage of ``data`` to hold ``size.numel()`` elements.

    Returns immediately when the storage already has exactly that many
    elements.

    Args:
        data: Tensor whose storage should be (re)allocated.
        size: Target shape; only its element count is used.

    Raises:
        AssertionError: If the storage is neither empty nor already of the
            requested element count.
    """
    wanted = size.numel()
    current = data.storage().size()
    if current != wanted:
        # Only a previously freed (zero-sized) storage may be re-grown.
        assert current == 0
        data.storage().resize_(wanted)


def cast_tensor_to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    """Convert a float32 tensor to float16, leaving other dtypes untouched.

    A ``StatefulTensor`` argument is unwrapped to its ``payload`` first.

    Args:
        tensor: Tensor (or ``StatefulTensor``) to convert.

    Returns:
        ``tensor.half()`` when the (unwrapped) tensor is float32, otherwise
        the (unwrapped) tensor itself.
    """
    payload = tensor.payload if isinstance(tensor, StatefulTensor) else tensor
    is_fp32 = torch.is_floating_point(payload) and payload.dtype is torch.float32
    return payload.half() if is_fp32 else payload


def cast_tensor_to_fp32(tensor: Union[torch.Tensor, StatefulTensor]) -> torch.Tensor:
    """Convert a float16/bfloat16 tensor to float32, leaving other dtypes untouched.

    A ``StatefulTensor`` argument is unwrapped to its ``payload`` first.

    Args:
        tensor: Tensor or ``StatefulTensor`` to convert.

    Returns:
        ``tensor.float()`` when the (unwrapped) tensor is float16 or bfloat16,
        otherwise the (unwrapped) tensor itself.
    """
    payload = tensor.payload if isinstance(tensor, StatefulTensor) else tensor
    is_half_precision = torch.is_floating_point(payload) and payload.dtype in (torch.float16, torch.bfloat16)
    return payload.float() if is_half_precision else payload


def cast_tensor_to_bf16(tensor: torch.Tensor) -> torch.Tensor:
    """Convert a float32 tensor to bfloat16, leaving other dtypes untouched.

    A ``StatefulTensor`` argument is unwrapped to its ``payload`` first.

    Args:
        tensor: Tensor (or ``StatefulTensor``) to convert.

    Returns:
        ``tensor.bfloat16()`` when the (unwrapped) tensor is float32,
        otherwise the (unwrapped) tensor itself.
    """
    payload = tensor.payload if isinstance(tensor, StatefulTensor) else tensor
    is_fp32 = torch.is_floating_point(payload) and payload.dtype is torch.float32
    return payload.bfloat16() if is_fp32 else payload


def apply_to_tensors(x: Any, fn: Callable):
    """Recursively apply ``fn`` to every tensor found inside ``x``.

    Lists, tuples and dicts are traversed and rebuilt with the transformed
    contents; anything else that is not a tensor is returned unchanged.
    Namedtuples are rebuilt with their own class so that field access keeps
    working on the result; other tuples are returned as plain ``tuple``.

    Args:
        x: A tensor, a (possibly nested) list/tuple/dict of values, or any
            other object.
        fn: Callable invoked on each tensor; its return value replaces the
            tensor in the result.

    Returns:
        A structure mirroring ``x`` with each tensor replaced by ``fn(tensor)``.
        Dict values are transformed, keys are kept as they are.
    """
    if torch.is_tensor(x):
        return fn(x)
    if isinstance(x, list):
        return [apply_to_tensors(item, fn) for item in x]
    if isinstance(x, tuple):
        converted = (apply_to_tensors(item, fn) for item in x)
        if hasattr(x, "_fields"):
            # namedtuple classes take their fields as positional arguments, so
            # rebuild through the original class instead of degrading the
            # value to a plain tuple.
            return type(x)(*converted)
        return tuple(converted)
    if isinstance(x, dict):
        return {key: apply_to_tensors(val, fn) for key, val in x.items()}
    return x


def cast_float_arguments(fn: Callable, *args: Any, **kwargs: Any) -> Tuple[Any, Any]:
    """Apply ``fn`` to every tensor in the positional and keyword arguments.

    Args:
        fn: Callable applied to each tensor via ``apply_to_tensors``.
        *args: Positional arguments to transform.
        **kwargs: Keyword arguments to transform.

    Returns:
        A pair ``(new_args, new_kwargs)`` holding the transformed positional
        arguments and the transformed keyword arguments.
    """
    new_args = apply_to_tensors(args, fn)
    new_kwargs = apply_to_tensors(kwargs, fn)
    return new_args, new_kwargs


def chunk_and_pad(tensor: torch.Tensor, num_chunks: int) -> List[torch.Tensor]:
    """Split the flattened ``tensor`` into exactly ``num_chunks`` equal-length pieces.

    The last piece produced by ``chunk`` is right-padded with zeros up to the
    length of the first piece, and all-zero pieces are appended when ``chunk``
    yields fewer than ``num_chunks`` pieces.

    Args:
        tensor: Tensor to split; it is flattened first.
        num_chunks: Number of pieces to return.

    Returns:
        A list of ``num_chunks`` 1-D tensors of identical length.
    """
    pieces = list(torch.flatten(tensor).chunk(num_chunks))

    # The trailing piece may be shorter than the others; zero-pad it on the right.
    shortfall = pieces[0].numel() - pieces[-1].numel()
    if shortfall > 0:
        pieces[-1] = F.pad(pieces[-1], [0, shortfall])

    # torch.chunk may return fewer than num_chunks pieces; fill up with zeros.
    missing = num_chunks - len(pieces)
    if missing > 0:
        pieces.extend([torch.zeros_like(pieces[0]) for _ in range(missing)])
    return pieces
[zero] reorganize zero/gemini folder structure (#3424) * [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import 2023-04-04 05:48:16 +00:00			`from typing import Any, Callable, List, Tuple, Union`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00
			`import torch`
			`import torch.nn.functional as F`
[zero] reorganize zero/gemini folder structure (#3424) * [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import 2023-04-04 05:48:16 +00:00
			`from colossalai.zero.legacy.gemini.stateful_tensor import StatefulTensor`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00

			`def get_gradient_predivide_factor(world_size: int) -> float:`
			`factor: int = 1`
			`while world_size % factor == 0 and world_size / factor > factor:`
			`factor *= 2`
			`return float(factor)`


			`def free_storage(data: torch.Tensor) -> None:`
			`"""Free underlying storage of a Tensor."""`
			`if data.storage().size() > 0:`
			`# Since we're modifying the Tensor's Storage directly, make sure the Tensor`
			`# is the sole occupant of the Storage.`
			`assert data.storage_offset() == 0`
			`data.storage().resize_(0)`


			`@torch.no_grad()`
			`def alloc_storage(data: torch.Tensor, size: torch.Size) -> None:`
			`"""Allocate storage for a tensor."""`
[zero] Update sharded model v2 using sharded param v2 (#323) 2022-03-08 10:18:06 +00:00			`if data.storage().size() == size.numel(): # no need to reallocate`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`return`
			`assert data.storage().size() == 0`
			`data.storage().resize_(size.numel())`


[zero] Update sharded model v2 using sharded param v2 (#323) 2022-03-08 10:18:06 +00:00			`def cast_tensor_to_fp16(tensor: torch.Tensor) -> torch.Tensor:`
[zero] add stateful tensor (#549) 2022-03-30 05:51:37 +00:00			`if isinstance(tensor, StatefulTensor):`
			`tensor = tensor.payload`
[zero] Update sharded model v2 using sharded param v2 (#323) 2022-03-08 10:18:06 +00:00			`if torch.is_floating_point(tensor) and tensor.dtype is torch.float32:`
			`return tensor.half()`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`return tensor`


[zero] add stateful tensor (#549) 2022-03-30 05:51:37 +00:00			`def cast_tensor_to_fp32(tensor: Union[torch.Tensor, StatefulTensor]) -> torch.Tensor:`
			`if isinstance(tensor, StatefulTensor):`
			`tensor = tensor.payload`

[bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero 2023-06-05 07:58:31 +00:00			`if torch.is_floating_point(tensor) and tensor.dtype in (torch.float16, torch.bfloat16):`
[zero] Update sharded model v2 using sharded param v2 (#323) 2022-03-08 10:18:06 +00:00			`return tensor.float()`
Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`return tensor`


[bf16] add bf16 support (#3882) * [bf16] add bf16 support for fused adam (#3844) * [bf16] fused adam kernel support bf16 * [test] update fused adam kernel test * [test] update fused adam test * [bf16] cpu adam and hybrid adam optimizers support bf16 (#3860) * [bf16] implement mixed precision mixin and add bf16 support for low level zero (#3869) * [bf16] add mixed precision mixin * [bf16] low level zero optim support bf16 * [text] update low level zero test * [text] fix low level zero grad acc test * [bf16] add bf16 support for gemini (#3872) * [bf16] gemini support bf16 * [test] update gemini bf16 test * [doc] update gemini docstring * [bf16] add bf16 support for plugins (#3877) * [bf16] add bf16 support for legacy zero (#3879) * [zero] init context support bf16 * [zero] legacy zero support bf16 * [test] add zero bf16 test * [doc] add bf16 related docstring for legacy zero 2023-06-05 07:58:31 +00:00			`def cast_tensor_to_bf16(tensor: torch.Tensor) -> torch.Tensor:`
			`if isinstance(tensor, StatefulTensor):`
			`tensor = tensor.payload`
			`if torch.is_floating_point(tensor) and tensor.dtype is torch.float32:`
			`return tensor.bfloat16()`
			`return tensor`


Feature/zero (#279) * add zero1 (#209) * add zero1 * add test zero1 * update zero stage 1 develop (#212) * Implement naive zero3 (#240) * naive zero3 works well * add zero3 param manager * add TODOs in comments * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * fix bugs of hook and add unit tests (#252) * add gather full param ctx * fix sub module streams * add offload * fix bugs of hook and add unit tests * polish code and add state dict hook * fix bug * update unit test * refactor reconstructed zero code * clip_grad support zero3 and add unit test * add unit test for Zero3ParameterManager * [WIP] initialize the shard param class * [WIP] Yet another sharded model implementation (#274) * [WIP] initialize the shard param class * [WIP] Yes another implementation of shardModel. Using a better hook method. * torch.concat -> torch.cat * fix test_zero_level_1.py::test_zero_level_1 unitest * remove deepspeed implementation and refactor for the reconstructed zero module * polish zero dp unittests Co-authored-by: ver217 <lhx0217@gmail.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> 2022-03-01 10:17:01 +00:00			`def apply_to_tensors(x: Any, fn: Callable):`
			`if torch.is_tensor(x):`
			`return fn(x)`
			`elif isinstance(x, list):`
			`return [apply_to_tensors(t, fn) for t in x]`
			`elif isinstance(x, tuple):`
			`return tuple(apply_to_tensors(t, fn) for t in x)`
			`elif isinstance(x, dict):`
			`return {key: apply_to_tensors(val, fn) for key, val in x.items()}`
			`else:`
			`return x`


			`def cast_float_arguments(fn: Callable, *args: Any, **kwargs: Any) -> Tuple[Any, Any]:`
			`return apply_to_tensors(args, fn), apply_to_tensors(kwargs, fn)`


			`def chunk_and_pad(tensor: torch.Tensor, num_chunks: int) -> List[torch.Tensor]:`
			`"""Chunk a given Tensor into num_chunks parts and add any necessary padding."""`
			`chunks = list(torch.flatten(tensor).chunk(num_chunks))`
			`# torch.chunk may return fewer than num_chunks chunks, pad accordingly.`
			`num_pad_for_partial_chunk = chunks[0].numel() - chunks[-1].numel()`
			`if num_pad_for_partial_chunk > 0:`
			`chunks[-1] = F.pad(chunks[-1], [0, num_pad_for_partial_chunk])`
			`if len(chunks) < num_chunks:`
			`chunks.extend([torch.zeros_like(chunks[0]) for _ in range(num_chunks - len(chunks))])`
			`return chunks`