ColossalAI/colossalai/zero/wrapper.py

from copy import copy
from typing import Dict, Optional

import torch
import torch.nn as nn

from .gemini import GeminiDDP


def zero_model_wrapper(model: nn.Module,
                       zero_stage: int = 1,
                       gemini_config: Optional[Dict] = None,
                       verbose: bool = False):
    """Prepare a training model for ZeRO DDP and tag it with the chosen ZeRO stage.

    For stages 1 and 2 the model object itself is returned; for stage 3 the model is wrapped in
    `GeminiDDP`. In every case the returned object carries a ``_colo_zero_stage`` attribute, which
    `zero_optim_wrapper` reads later to select the matching optimizer.

    Example:

        >>> with ColoInitContext():
        >>>     my_model = Bert()
        >>> my_optim = SGD(my_model.parameters(), lr = 1e-3)
        >>> zero_model = zero_model_wrapper(my_model, zero_stage=1)
        >>> zero_optim = zero_optim_wrapper(zero_model, my_optim)

    Args:
        model (nn.Module): The model used in ZeRO DDP.
        zero_stage (int, optional): The stage of ZeRO DDP (1, 2 or 3). See the ZeRO paper for details:
            https://arxiv.org/abs/1910.02054
        gemini_config (dict, optional): Keyword arguments forwarded to `GeminiDDP`. Only consulted when
            `zero_stage` is 3. The example below sets the device of the model, the placement policy of
            Gemini, and the hidden dimension that helps Gemini find a unified chunk size.

            Example:

                >>> config_dict = dict(device=torch.cuda.current_device(), hidden_dim=1024, placement_policy='auto')
                >>> model = zero_model_wrapper(model, zero_stage=3, gemini_config=config_dict)
        verbose (bool, optional): Forwarded to `GeminiDDP` as ``verbose`` when `zero_stage` is 3.

    Returns:
        The input `model` (stages 1 and 2) or a `GeminiDDP` instance wrapping it (stage 3).
    """
    assert zero_stage in [1, 2, 3], "The stage of ZeRO should be 1, 2 or 3"

    if zero_stage not in (1, 2):
        # Stage 3: parameters are managed by Gemini, so the model needs the GeminiDDP wrapper.
        gemini_kwargs = {} if gemini_config is None else gemini_config
        wrapped_model = GeminiDDP(model, **gemini_kwargs, verbose=verbose)
    else:
        # Stages 1 and 2 only change the optimizer, so the model is passed through untouched.
        wrapped_model = model

    # Remember the stage on the model so the optimizer wrapper can pick it up.
    wrapped_model._colo_zero_stage = zero_stage
    return wrapped_model


def zero_optim_wrapper(model: nn.Module,
                       optimizer: torch.optim.Optimizer,
                       initial_scale: float = 2**16,
                       growth_factor: float = 2,
                       backoff_factor: float = 0.5,
                       growth_interval: int = 1000,
                       hysteresis: int = 2,
                       min_scale: float = 1,
                       max_scale: float = 2**32,
                       max_norm: float = 0.0,
                       norm_type: float = 2.0,
                       optim_config: Optional[Dict] = None,
                       verbose: bool = False):
    """This wrapper function is used to wrap your training optimizer for ZeRO DDP.

    The ZeRO stage is read from the ``_colo_zero_stage`` attribute that `zero_model_wrapper` attaches
    to the model. Stages 1 and 2 produce a `LowLevelZeroOptimizer`; any other stage produces a
    `GeminiOptimizer`.

    Args:
        model (nn.Module): Your model wrapped by `zero_model_wrapper`
        optimizer (torch.optim.Optimizer): Your initialized optimizer
        initial_scale (float, optional): initial_scale used by DynamicGradScaler.
        growth_factor (float, optional): growth_factor used by DynamicGradScaler.
        backoff_factor (float, optional): backoff_factor used by DynamicGradScaler.
        growth_interval (int, optional): growth_interval used by DynamicGradScaler.
        hysteresis (int, optional): hysteresis used by DynamicGradScaler.
        min_scale (float, optional): min_scale used by DynamicGradScaler.
        max_scale (float, optional): max_scale used by DynamicGradScaler.
        max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do
            clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm.
        norm_type (float, optional): norm_type used for `clip_grad_norm`. Only 2.0 is accepted.
        optim_config (dict, optional): Extra keyword arguments for the ZeRO optimizer. The dictionary is
            shallow-copied, so the caller's object is not modified. Keys that collide with the explicit
            arguments above (e.g. ``initial_scale``) are overwritten by the explicit values.
            Example:

                >>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)
                >>> optim = zero_optim_wrapper(model, optim, optim_config=zero2_config)
        verbose (bool, optional): Whether to print the verbose info.

    Returns:
        A `LowLevelZeroOptimizer` when the model's stage is 1 or 2, otherwise a `GeminiOptimizer`.

    Raises:
        AssertionError: If `model` was not prepared by `zero_model_wrapper`, or if `norm_type` is not 2.0.
    """
    # The stage tag is set by `zero_model_wrapper`; without it there is no way to choose an optimizer.
    assert hasattr(model, "_colo_zero_stage"), "You should use `zero_model_wrapper` first"
    zero_stage = getattr(model, "_colo_zero_stage")

    assert norm_type == 2.0, "Current ZeRO optimizers only support 'norm_type=2'"

    # Work on a copy so the caller's `optim_config` is left untouched.
    config_dict = dict() if optim_config is None else copy(optim_config)

    # Explicit grad-scaler arguments always win over same-named keys in `optim_config`.
    config_dict['initial_scale'] = initial_scale
    config_dict['growth_factor'] = growth_factor
    config_dict['backoff_factor'] = backoff_factor
    config_dict['growth_interval'] = growth_interval
    config_dict['hysteresis'] = hysteresis
    config_dict['min_scale'] = min_scale
    config_dict['max_scale'] = max_scale

    if zero_stage in [1, 2]:
        # Imported lazily, as in the original code, rather than at module import time.
        from colossalai.zero.low_level import LowLevelZeroOptimizer

        # Stage 2 additionally partitions gradients; stage 1 does not.
        config_dict['partition_grad'] = zero_stage == 2
        config_dict['clip_grad_norm'] = max_norm
        return LowLevelZeroOptimizer(optimizer, **config_dict, verbose=verbose)
    else:
        from colossalai.zero.gemini.gemini_optimizer import GeminiOptimizer

        # The Gemini optimizer names its clipping threshold `clipping_norm`.
        config_dict['clipping_norm'] = max_norm
        return GeminiOptimizer(optimizer, model, **config_dict, verbose=verbose)
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`from copy import copy`
			`from typing import Dict, Optional`

			`import torch`
			`import torch.nn as nn`

[zero] reorganize zero/gemini folder structure (#3424) * [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import 2023-04-04 05:48:16 +00:00			`from .gemini import GeminiDDP`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00

[misc] add verbose arg for zero and op builder (#3552) * [misc] add print verbose * [gemini] add print verbose * [zero] add print verbose for low level * [misc] add print verbose for op builder 2023-04-17 03:25:35 +00:00			`def zero_model_wrapper(model: nn.Module,`
			`zero_stage: int = 1,`
			`gemini_config: Optional[Dict] = None,`
			`verbose: bool = False):`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`"""This wrapper function is used to wrap your training model for ZeRO DDP.`

			`Example:`

			`>>> with ColoInitContext():`
			`>>> my_model = Bert()`
			`>>> my_optim = SGD(my_model.parameters(), lr = 1e-3)`
			`>>> zero_model = zero_model_wrapper(my_model, zero_stage=1)`
			`>>> zero_optim = zero_optim_wrapper(zero_model, my_optim)`

			`Args:`
			`model (nn.Module): The model used in ZeRO DDP.`
			`zero_stage (int, optional): The stage of ZeRO DDP. You can find more information in ZeRO's paper.`
			`https://arxiv.org/abs/1910.02054`
			gemini_config (dict, optional): The configuration dictionary of `GeminiDDP`. `GeminiDDP` is enabled
[doc] Fix typo under colossalai and doc(#3618) * Fixed several spelling errors under colossalai * Fix the spelling error in colossalai and docs directory * Cautious Changed the spelling error under the example folder * Update runtime_preparation_pass.py revert autograft to autograd * Update search_chunk.py utile to until * Update check_installation.py change misteach to mismatch in line 91 * Update 1D_tensor_parallel.md revert to perceptron * Update 2D_tensor_parallel.md revert to perceptron in line 73 * Update 2p5D_tensor_parallel.md revert to perceptron in line 71 * Update 3D_tensor_parallel.md revert to perceptron in line 80 * Update README.md revert to resnet in line 42 * Update reorder_graph.py revert to indice in line 7 * Update p2p.py revert to megatron in line 94 * Update initialize.py revert to torchrun in line 198 * Update routers.py change to detailed in line 63 * Update routers.py change to detailed in line 146 * Update README.md revert random number in line 402 2023-04-26 03:38:43 +00:00			when the stage is set to 3. You can set the arguments of `GeminiDDP` in the gemini_config.
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`Here is an example where we set the device of the model, the placement policy of Gemini, and the`
			`size of hidden dimension to help Gemini find out a unified chunk size.`

			`Example:`

			`>>> config_dict = dict(device=torch.cuda.current_device(), hidden_dim=1024, placement_policy='auto')`
			`>>> model = zero_model_wrapper(model, zero_stage=3, gemini_config=config_dict)`
			`"""`
			`assert zero_stage in [1, 2, 3], "The stage of ZeRO should be 1, 2 or 3"`

			`if gemini_config is None:`
			`gemini_config = dict()`

			`if zero_stage in [1, 2]:`
[gemini] update the gpt example (#2527) 2023-01-30 09:58:05 +00:00			`wrapped_model = model`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`else:`
[misc] add verbose arg for zero and op builder (#3552) * [misc] add print verbose * [gemini] add print verbose * [zero] add print verbose for low level * [misc] add print verbose for op builder 2023-04-17 03:25:35 +00:00			`wrapped_model = GeminiDDP(model, **gemini_config, verbose=verbose)`
[gemini] update the gpt example (#2527) 2023-01-30 09:58:05 +00:00
			`setattr(wrapped_model, "_colo_zero_stage", zero_stage)`

			`return wrapped_model`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00

			`def zero_optim_wrapper(model: nn.Module,`
			`optimizer: torch.optim.Optimizer,`
			`initial_scale: float = 2**16,`
			`growth_factor: float = 2,`
			`backoff_factor: float = 0.5,`
			`growth_interval: int = 1000,`
			`hysteresis: int = 2,`
			`min_scale: float = 1,`
			`max_scale: float = 2**32,`
			`max_norm: float = 0.0,`
			`norm_type: float = 2.0,`
[misc] add verbose arg for zero and op builder (#3552) * [misc] add print verbose * [gemini] add print verbose * [zero] add print verbose for low level * [misc] add print verbose for op builder 2023-04-17 03:25:35 +00:00			`optim_config: Optional[Dict] = None,`
			`verbose: bool = False):`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`"""This wrapper function is used to wrap your training optimizer for ZeRO DDP.`

			`Args:`
			model (nn.Module): Your model wrapped by `zero_model_wrapper`
			`optimizer (torch.optim.Optimizer): Your initialized optimizer`
			`initial_scale (float, optional): initial_scale used by DynamicGradScaler.`
			`min_scale (float, optional): min_scale used by DynamicGradScaler.`
			`growth_factor (float, optional): growth_factor used by DynamicGradScaler.`
			`backoff_factor (float, optional): backoff_factor used by DynamicGradScaler.`
			`growth_interval (float, optional): growth_interval used by DynamicGradScaler.`
			`hysteresis (float, optional): hysteresis used by DynamicGradScaler.`
			`max_scale (int, optional): max_scale used by DynamicGradScaler.`
			max_norm (float, optional): max_norm used for `clip_grad_norm`. You should notice that you shall not do
			`clip_grad_norm by yourself when using ZeRO DDP. The ZeRO optimizer will take care of clip_grad_norm.`
			norm_type (float, optional): norm_type used for `clip_grad_norm`.
[doc] Fix typo under colossalai and doc(#3618) * Fixed several spelling errors under colossalai * Fix the spelling error in colossalai and docs directory * Cautious Changed the spelling error under the example folder * Update runtime_preparation_pass.py revert autograft to autograd * Update search_chunk.py utile to until * Update check_installation.py change misteach to mismatch in line 91 * Update 1D_tensor_parallel.md revert to perceptron * Update 2D_tensor_parallel.md revert to perceptron in line 73 * Update 2p5D_tensor_parallel.md revert to perceptron in line 71 * Update 3D_tensor_parallel.md revert to perceptron in line 80 * Update README.md revert to resnet in line 42 * Update reorder_graph.py revert to indice in line 7 * Update p2p.py revert to megatron in line 94 * Update initialize.py revert to torchrun in line 198 * Update routers.py change to detailed in line 63 * Update routers.py change to detailed in line 146 * Update README.md revert random number in line 402 2023-04-26 03:38:43 +00:00			`optim_config (dict, optional): The configuration used for the ZeRO optimizer.`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`Example:`

			`>>> zero2_config = dict(reduce_bucket_size=12 * 1024 * 1024, overlap_communication=True)`
			`>>> optim = zero_optim_wrapper(model, optim, optim_config=zero2_config)`
[misc] add verbose arg for zero and op builder (#3552) * [misc] add print verbose * [gemini] add print verbose * [zero] add print verbose for low level * [misc] add print verbose for op builder 2023-04-17 03:25:35 +00:00			`verbose (bool, optional): Whether to print the verbose info.`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`"""`
			assert hasattr(model, "_colo_zero_stage"), "You should use `zero_ddp_wrapper` first"
			`zero_stage = getattr(model, "_colo_zero_stage")`

			`assert norm_type == 2.0, "Current ZeRO optimizers only support 'norm_type=2'"`

			`if optim_config is None:`
			`config_dict = dict()`
			`else:`
			`config_dict = copy(optim_config)`

			`config_dict['initial_scale'] = initial_scale`
			`config_dict['growth_factor'] = growth_factor`
			`config_dict['backoff_factor'] = backoff_factor`
			`config_dict['growth_interval'] = growth_interval`
			`config_dict['hysteresis'] = hysteresis`
			`config_dict['min_scale'] = min_scale`
			`config_dict['max_scale'] = max_scale`

			`if zero_stage in [1, 2]:`
[zero] reorganize zero/gemini folder structure (#3424) * [zero] refactor low-level zero folder structure * [zero] fix legacy zero import path * [zero] fix legacy zero import path * [zero] remove useless import * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor gemini folder structure * [zero] refactor legacy zero import path * [zero] fix test import path * [zero] fix test * [zero] fix circular import * [zero] update import 2023-04-04 05:48:16 +00:00			`from colossalai.zero.low_level import LowLevelZeroOptimizer`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`config_dict['partition_grad'] = zero_stage == 2`
			`config_dict['clip_grad_norm'] = max_norm`
[misc] add verbose arg for zero and op builder (#3552) * [misc] add print verbose * [gemini] add print verbose * [zero] add print verbose for low level * [misc] add print verbose for op builder 2023-04-17 03:25:35 +00:00			`return LowLevelZeroOptimizer(optimizer, **config_dict, verbose=verbose)`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`else:`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`from 
colossalai.zero.gemini.gemini_optimizer import GeminiOptimizer`
[zero] add zero wrappers (#2523) * [zero] add zero wrappers * change names * add wrapper functions to init 2023-01-29 09:52:58 +00:00			`config_dict['clipping_norm'] = max_norm`
[gemini] improve compatibility and add static placement policy (#4479) * [gemini] remove distributed-related part from colotensor (#4379) * [gemini] remove process group dependency * [gemini] remove tp part from colo tensor * [gemini] patch inplace op * [gemini] fix param op hook and update tests * [test] remove useless tests * [test] remove useless tests * [misc] fix requirements * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [test] fix model zoo * [misc] update requirements * [gemini] refactor gemini optimizer and gemini ddp (#4398) * [gemini] update optimizer interface * [gemini] renaming gemini optimizer * [gemini] refactor gemini ddp class * [example] update gemini related example * [example] update gemini related example * [plugin] fix gemini plugin args * [test] update gemini ckpt tests * [gemini] fix checkpoint io * [example] fix opt example requirements * [example] fix opt example * [example] fix opt example * [example] fix opt example * [gemini] add static placement policy (#4443) * [gemini] add static placement policy * [gemini] fix param offload * [test] update gemini tests * [plugin] update gemini plugin * [plugin] update gemini plugin docstr * [misc] fix flash attn requirement * [test] fix gemini checkpoint io test * [example] update resnet example result (#4457) * [example] update bert example result (#4458) * [doc] update gemini doc (#4468) * [example] update gemini related examples (#4473) * [example] update gpt example * [example] update dreambooth example * [example] update vit * [example] update opt * [example] update palm * [example] update vit and opt benchmark * [hotfix] fix bert in model zoo (#4480) * [hotfix] fix bert in model zoo * [test] remove chatglm gemini test * [test] remove sam gemini test * [test] remove vit gemini test * [hotfix] fix opt tutorial example (#4497) * [hotfix] fix opt tutorial example * [hotfix] fix opt tutorial example 2023-08-24 01:29:25 +00:00			`return 
GeminiOptimizer(optimizer, model, **config_dict, verbose=verbose)`