import functools
from typing import Optional

import torch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.zero.shard_utils import BaseShardStrategy
from colossalai.zero.sharded_model._utils import cast_tensor_to_fp16
from colossalai.zero.sharded_param import ShardedParamV2
from torch.distributed import ProcessGroup


def _substitute_init_recursively(cls, func):
    """Recursively apply ``func`` to every existing subclass of ``cls``."""
    for subcls in cls.__subclasses__():
        _substitute_init_recursively(subcls, func)
        func(subcls)


class InsertPostInitMethodToModuleSubClasses(object):
    """Patch the ``__init__`` of every ``torch.nn.Module`` subclass so that
    ``self._post_init_method`` runs right after a module is constructed,
    for as long as the context is active.
    """

    def __init__(self):
        pass

    def __enter__(self):
        r"""
        Enter the context scope.
        """

        def preprocess_after(f):

            @functools.wraps(f)
            def wrapper(module: torch.nn.Module, *args, **kwargs):
                f(module, *args, **kwargs)
                self._post_init_method(module)

            return wrapper

        def _enable_class(cls):
            cls._old_init = cls.__init__
            cls.__init__ = preprocess_after(cls.__init__)

        # This function is called whenever a new subclass of torch.nn.Module is defined.
        def _init_subclass(cls, **kwargs):
            cls.__init__ = preprocess_after(cls.__init__)

        # Replace .__init__() for all existing subclasses of torch.nn.Module.
        # Execute self._post_init_method after the default init function.
        _substitute_init_recursively(torch.nn.modules.module.Module, _enable_class)

        # Hold on to the current __init_subclass__ for exit.
        torch.nn.modules.module.Module._old_init_subclass = (torch.nn.modules.module.Module.__init_subclass__)
        # Replace .__init__() for future subclasses of torch.nn.Module.
        torch.nn.modules.module.Module.__init_subclass__ = classmethod(_init_subclass)

        self._pre_context_exec()

    def __exit__(self, exc_type, exc_value, traceback):

        def _disable_class(cls):
            cls.__init__ = cls._old_init

        # Restore .__init__() for all existing subclasses of torch.nn.Module.
        _substitute_init_recursively(torch.nn.modules.module.Module, _disable_class)

        # Restore .__init_subclass__() so future subclasses of torch.nn.Module are untouched.
        torch.nn.modules.module.Module.__init_subclass__ = (torch.nn.modules.module.Module._old_init_subclass)

        self._post_context_exec()
        # Now that we cleaned up the metaclass injection, raise the exception.
        if exc_type is not None:
            return False

    # To be implemented by inheriting classes
    def _post_init_method(self, module):
        pass

    def _pre_context_exec(self):
        pass

    def _post_context_exec(self):
        pass


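# A hedged illustration (not part of the original file): subclasses only need to
# override the hook methods above. `PrintInitContext` is a hypothetical name used
# purely for this sketch.
#
#     class PrintInitContext(InsertPostInitMethodToModuleSubClasses):
#         def _post_init_method(self, module):
#             print(f'built {module.__class__.__name__}')
#
#     with PrintInitContext():
#         net = torch.nn.Linear(4, 4)    # prints "built Linear"
#
# Inside the context, every nn.Module constructor is wrapped so that
# `_post_init_method` fires once for each freshly built module.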
class ZeroInitContext(InsertPostInitMethodToModuleSubClasses):
    """A context manager to initialize the model.

    1. Convert the model parameters to fp16.
    2. Wrap each parameter of the module in a ShardedParamV2.
    3. Shard the parameter data according to ``shard_param``.

    Args:
        target_device (torch.device): The device where the parameter data resides after exiting the context.
        shard_strategy (BaseShardStrategy): The shard strategy instance.
        shard_param (bool, optional): Whether the parameters are sharded after exiting the context. Defaults to False.
        rm_torch_payload_on_the_fly (bool, optional): If set to ``True``, remove the tensor payload of ``param.data`` as soon as the module's init finishes.
            This reduces memory usage while the model is being initialized,
            but it is not suitable for all models, especially those whose ``__init__`` performs weight-initialization operations.
            If set to ``False``, the tensor payload of ``param.data`` is removed after the context exits.
            Use ``False`` when ``__init__`` needs to operate on the parameter tensors,
            e.g. torchvision's resnet18. Defaults to False.
        model_numel_tensor (torch.Tensor, optional): A tensor that accumulates the number of elements of the model. Defaults to ``torch.zeros(1, dtype=torch.long)``.
        dp_process_group (Optional[ProcessGroup], optional): The data parallel process group. Defaults to None.
    """

    def __init__(self,
                 target_device: torch.device,
                 shard_strategy: BaseShardStrategy,
                 shard_param: bool = False,
                 rm_torch_payload_on_the_fly: bool = False,
                 model_numel_tensor: torch.Tensor = torch.zeros(1, dtype=torch.long),
                 dp_process_group: Optional[ProcessGroup] = None):
        super().__init__()
        self.target_device = target_device
        self.shard_param = shard_param
        self.shard_strategy = shard_strategy
        self.rm_torch_payload_on_the_fly = rm_torch_payload_on_the_fly
        self.initialized_param_list = []
        self.model_numel_tensor = model_numel_tensor
        self.dp_process_group = dp_process_group or gpc.get_group(ParallelMode.DATA)

    def _pre_context_exec(self):
        """
        The callback function when entering the context.
        """
        self.logger = get_dist_logger("ZeroInitContext")

    def _post_context_exec(self):
        """The callback function when exiting the context.
        """
        if not self.rm_torch_payload_on_the_fly:
            for param in self.initialized_param_list:
                assert hasattr(param, 'col_attr')
                param.col_attr.remove_torch_payload()

        del self.initialized_param_list

    def _post_init_method(self, module: torch.nn.Module):
        """
        The function to call at the end of the constructor of each module.
        NOTE: the module may be passed to this function multiple times.
        """
        for param in module.parameters(recurse=False):
            # avoid adapting a param to ShardedParam twice
            if hasattr(param, 'col_attr'):
                continue

            self.model_numel_tensor += param.numel()

            target_device = self.target_device

            # convert to fp16
            param.data = param.data.to(torch.half)
            if param.grad is not None:
                param.grad = param.grad.to(torch.half)

            # move torch parameters to the target device
            param.data = param.data.to(target_device)
            if param.grad is not None:
                param.grad = param.grad.to(target_device)

            param.col_attr = ShardedParamV2(param, rm_torch_payload=self.rm_torch_payload_on_the_fly)

            self.initialized_param_list.append(param)
            if self.shard_param:
                self.shard_strategy.shard([param.col_attr.sharded_data_tensor], self.dp_process_group)

        # Buffers must be cast as well: with BatchNorm, for example, buffers may
        # still be on the CPU and in fp32.
        for buffer in module.buffers(recurse=False):
            buffer.data = buffer.data.to(device=torch.cuda.current_device())
            buffer.data = cast_tensor_to_fp16(buffer.data)