# ColossalAI/colossalai/zero/gemini/utils.py

from collections import OrderedDict
from copy import copy
from typing import Optional, Set

import torch
import torch.distributed as dist
import torch.nn as nn

from colossalai.utils import get_current_device

from .chunk import Chunk


def get_temp_total_chunk_on_cuda(chunk: Chunk):
    """Return the complete content of ``chunk`` as a single tensor.

    When the chunk is already gathered, its existing ``cuda_global_chunk`` is
    returned as-is. Otherwise a temporary buffer of ``chunk.chunk_size`` elements
    is allocated on the current device and filled by an all-gather of the shards
    over ``chunk.torch_pg``.

    Args:
        chunk (Chunk): the chunk whose full content is requested

    Returns:
        ``chunk.cuda_global_chunk`` if the chunk is gathered, otherwise a newly
        allocated temporary tensor holding the gathered content
    """
    if chunk.is_gathered:
        return chunk.cuda_global_chunk

    local_shard = chunk.cuda_shard
    device = get_current_device()
    if local_shard is None:
        # no shard on the accelerator yet: move the cpu shard over for the collective
        local_shard = chunk.cpu_shard.to(device)

    gathered = torch.zeros(chunk.chunk_size, dtype=chunk.dtype, device=device)
    # torch.chunk returns views of `gathered`, so all_gather writes straight into it
    # NOTE(review): assumes chunk_size splits evenly into pg_size pieces -- confirm
    slots = list(torch.chunk(input=gathered, chunks=chunk.pg_size, dim=0))
    dist.all_gather(tensor_list=slots, tensor=local_shard, group=chunk.torch_pg)

    return gathered


def _get_dfs_module_list(module: nn.Module, memo: Optional[Set[nn.Module]] = None, prefix: str = ''):
    """Get a dfs module list of the given module. Its order is same as the order of creations of modules.
    """
    if memo is None:
        memo = set()
    if module not in memo:
        for name, submodule in module._modules.items():
            if submodule is None:
                continue
            submodule_prefix = prefix + ('.' if prefix else '') + name
            for m in _get_dfs_module_list(submodule, memo, submodule_prefix):
                yield m

        memo.add(module)
        yield prefix, module


def _get_shallow_copy_model(model: nn.Module):
    """Build a module tree that mirrors ``model`` using shallow copies.

    Every module in the tree is replaced by a ``copy.copy`` of itself, so each
    new module is a distinct object while its attributes still refer to the same
    objects as the original's. Only the child registry (``_modules``) is rebuilt,
    so that the copies point at copied children instead of the original ones.

    Args:
        model (nn.Module): root module to mirror

    Returns:
        nn.Module: the shallow copy corresponding to ``model``
    """
    replicas = {}
    # children come before parents in this traversal, so every child already
    # has a replica by the time its parent is processed
    for _, original in _get_dfs_module_list(model):
        replica = copy(original)
        replica._modules = OrderedDict()
        for child_name, child in original._modules.items():
            if child is not None:
                setattr(replica, child_name, replicas[child])
        replicas[original] = replica
    return replicas[model]


def get_static_torch_model(zero_ddp_model,
                           device=torch.device("cpu"),
                           dtype=torch.float32,
                           only_rank_0=True) -> torch.nn.Module:
    """Build a plain torch.nn.Module snapshot of a GeminiDDP model.

    The wrapped module is mirrored with shallow copies, then each copied module
    gets a fresh parameter table filled with new ``torch.nn.Parameter`` objects
    created from the GeminiDDP state dict. The original GeminiDDP model is not
    modified, so it can still be used for further training.
    The returned model should not be trained, this can cause unexpected errors.

    Args:
        zero_ddp_model (GeminiDDP): a zero ddp model
        device (torch.device): the device of the final torch model
        dtype (torch.dtype): the dtype of the final torch model
        only_rank_0 (bool): if True, only rank0 has the converted torch model

    Returns:
        torch.nn.Module: a static torch model used for saving checkpoints or numeric checks
    """
    # NOTE(review): imported inside the function, presumably to avoid a circular import -- confirm
    from colossalai.zero.gemini.gemini_ddp import GeminiDDP
    assert isinstance(zero_ddp_model, GeminiDDP)

    gathered_state = zero_ddp_model.state_dict(only_rank_0=only_rank_0)
    source_model = zero_ddp_model.module
    static_model = _get_shallow_copy_model(source_model)

    # the rank is only queried when the conversion is restricted to rank 0
    should_convert = (not only_rank_0) or dist.get_rank() == 0
    if should_convert:
        # both traversals visit corresponding modules in the same order
        module_pairs = zip(_get_dfs_module_list(source_model), _get_dfs_module_list(static_model))
        for (module_prefix, source_module), (_, static_module) in module_pairs:
            # start from an empty parameter table so the copy stops sharing the source's
            static_module._parameters = OrderedDict()
            for local_name, _param in source_module.named_parameters(recurse=False):
                # key of this parameter in the state dict
                if module_prefix:
                    qualified_name = f"{module_prefix}.{local_name}"
                else:
                    qualified_name = local_name
                assert qualified_name in gathered_state, \
                    f"Can not find parameter `{qualified_name}` in the GeminiDDP module"
                converted = gathered_state[qualified_name].data.to(device=device, dtype=dtype)

                setattr(static_module, local_name, torch.nn.Parameter(converted))
    # every rank reaches this barrier, whether or not it converted anything
    dist.barrier()

    return static_model
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`from collections import OrderedDict`
			`from copy import copy`
			`from typing import Optional, Set`

[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00			`import torch`
			`import torch.distributed as dist`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`import torch.nn as nn`
[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00
			`from colossalai.utils import get_current_device`

[zero] reorganize zero/gemini folder structure (#3424) * [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import 2023-04-04 05:48:16 +00:00			`from .chunk import Chunk`

[NFC] polish comments for Chunk class (#2116) 2022-12-12 07:39:31 +00:00
			`def get_temp_total_chunk_on_cuda(chunk: Chunk):`
			`if chunk.is_gathered:`
			`return chunk.cuda_global_chunk`

			`if chunk.cuda_shard is not None:`
			`shard_temp = chunk.cuda_shard`
			`else:`
			`shard_temp = chunk.cpu_shard.to(get_current_device())`

			`total_temp = torch.zeros(chunk.chunk_size, dtype=chunk.dtype, device=get_current_device())`
			`gather_list = list(torch.chunk(input=total_temp, chunks=chunk.pg_size, dim=0))`
			`dist.all_gather(tensor_list=gather_list, tensor=shard_temp, group=chunk.torch_pg)`

			`return total_temp`
[Gemini] GeminiDPP convert to PyTorch Module. (#2151) 2022-12-20 02:19:36 +00:00

[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`def _get_dfs_module_list(module: nn.Module, memo: Optional[Set[nn.Module]] = None, prefix: str = ''):`
			`"""Get a dfs module list of the given module. Its order is same as the order of creations of modules.`
			`"""`
			`if memo is None:`
			`memo = set()`
			`if module not in memo:`
			`for name, submodule in module._modules.items():`
			`if submodule is None:`
			`continue`
			`submodule_prefix = prefix + ('.' if prefix else '') + name`
			`for m in _get_dfs_module_list(submodule, memo, submodule_prefix):`
			`yield m`

			`memo.add(module)`
			`yield prefix, module`
[Gemini] GeminiDPP convert to PyTorch Module. (#2151) 2022-12-20 02:19:36 +00:00

[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`def _get_shallow_copy_model(model: nn.Module):`
			`"""Get a shallow copy of the given model. Each submodule is different from the original submodule.`
			`But the new submodule and the old submodule share all attributes.`
			`"""`
[polish] polish code for get_static_torch_model (#2405) * [gemini] polish code * [testing] remove code * [gemini] make more robust 2023-01-09 09:41:38 +00:00			`old_to_new = dict()`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`for name, module in _get_dfs_module_list(model):`
			`new_module = copy(module)`
			`new_module._modules = OrderedDict()`
			`for subname, submodule in module._modules.items():`
			`if submodule is None:`
			`continue`
[polish] polish code for get_static_torch_model (#2405) * [gemini] polish code * [testing] remove code * [gemini] make more robust 2023-01-09 09:41:38 +00:00			`setattr(new_module, subname, old_to_new[submodule])`
			`old_to_new[module] = new_module`
			`return old_to_new[model]`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00

Fix state_dict key missing issue of the ZeroDDP (#2363) * Fix state_dict output for ZeroDDP duplicated parameters * Rewrite state_dict based on get_static_torch_model * Modify get_static_torch_model to be compatible with the lower version (ZeroDDP) 2023-01-09 06:35:14 +00:00			`def get_static_torch_model(zero_ddp_model,`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`device=torch.device("cpu"),`
			`dtype=torch.float32,`
			`only_rank_0=True) -> torch.nn.Module:`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`"""Get a static 
torch.nn.Module model from the given GeminiDDP module.`
			`You should notice that the original GeminiDDP model is not modified.`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`Thus, you can use the original model in further training.`
			`But you should not use the returned torch model to train, this can cause unexpected errors.`
[Gemini] GeminiDPP convert to PyTorch Module. (#2151) 2022-12-20 02:19:36 +00:00
			`Args:`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`zero_ddp_model 
(GeminiDDP): a zero ddp model`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`device (torch.device): the device of the final torch model`
			`dtype (torch.dtype): the dtype of the final torch model`
fix typo with colossalai/trainer utils zero (#3908) 2023-06-07 08:08:37 +00:00			`only_rank_0 (bool): if True, only rank0 has the converted torch model`
[Gemini] GeminiDPP convert to PyTorch Module. (#2151) 2022-12-20 02:19:36 +00:00
			`Returns:`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`torch.nn.Module: a static torch model used for saving checkpoints or numeric checks`
[Gemini] GeminiDPP convert to PyTorch Module. (#2151) 2022-12-20 02:19:36 +00:00			`"""`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`from 
colossalai.zero.gemini.gemini_ddp import GeminiDDP`
			`assert isinstance(zero_ddp_model, GeminiDDP)`
[Gemini] GeminiDPP convert to PyTorch Module. (#2151) 2022-12-20 02:19:36 +00:00
[hotfix] fix lightning error (#2529) 2023-01-31 02:40:39 +00:00			`state_dict = zero_ddp_model.state_dict(only_rank_0=only_rank_0)`
Fix state_dict key missing issue of the ZeroDDP (#2363) * Fix state_dict output for ZeroDDP duplicated parameters * Rewrite state_dict based on get_static_torch_model * Modify get_static_torch_model to be compatible with the lower version (ZeroDDP) 2023-01-09 06:35:14 +00:00			`colo_model = zero_ddp_model.module`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`torch_model = _get_shallow_copy_model(colo_model)`

			`if not only_rank_0 or dist.get_rank() == 0:`
			`for (name, colo_module), (_, torch_module) in \`
			`zip(_get_dfs_module_list(colo_model), _get_dfs_module_list(torch_model)):`
			`# clean the parameter list of the new torch module`
			`torch_module._parameters = OrderedDict()`
			`for sufix_param_name, param in colo_module.named_parameters(recurse=False):`
			`# get the full name of the parameter`
			`full_param_name = name + ('.' if name else '') + sufix_param_name`
[hotfix] fix lightning error (#2529) 2023-01-31 02:40:39 +00:00			`assert full_param_name in state_dict, \`
			f"Can not find parameter `{full_param_name}` in the GeminiDDP module"
			`state_param = state_dict[full_param_name]`
			`torch_param = torch.nn.Parameter(state_param.data.to(device=device, dtype=dtype))`
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00
			`setattr(torch_module, sufix_param_name, torch_param)`
			`dist.barrier()`
[Gemini] GeminiDPP convert to PyTorch Module. (#2151) 2022-12-20 02:19:36 +00:00
[gemini] add get static torch model (#2356) 2023-01-06 05:41:19 +00:00			`return torch_model`