ColossalAI/colossalai/utils/model/lazy_init_context.py

258 lines
9.4 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# coding: utf-8
import torch
from colossalai.tensor import ColoParameter
import types
import inspect
import typing
from typing import List, Callable
from colossalai.utils.model.utils import substitute_init_recursively
import copy
class LazyInitContext():
    """
    A context to allow for lazy weight initialization of PyTorch modules. While the context is
    active, the ``__init__`` of ``torch.nn.Module`` classes is wrapped so that every constructor
    call is recorded and the module is created on the ``meta`` device. The recorded calls are
    replayed later by :meth:`lazy_init_parameters` to build the real weights.

    Note:
        This API is only experimental and subject to future changes.
        It should be integrated with meta tensor initialization in the future.

    Usage:
        with LazyInitContext() as ctx:
            model = nn.Linear(10, 10)
            model.weight.zero_()

        # make sure the weight is a meta tensor
        assert model.weight.is_meta

        # initialize weights
        ctx.lazy_init_parameters(model)

        # make sure the weight is not a meta tensor
        # and initialized correctly
        assert not model.weight.is_meta and torch.all(model.weight == 0)

    Args:
        extra_torch_tensor_func (List[str]): extra torch tensor functions related
            to value setting, such as `zero_` and `triu_`. `zero_` is pre-added by default.
    """

    # names of ``torch.Tensor`` value-setting methods that are always considered for patching
    tensor_set_value_func = ['zero_']

    def __init__(self, extra_torch_tensor_func: typing.Optional[List[str]] = None):
        self._intercepted_init_func_cache = []
        # Defined up-front so that calling ``lazy_init_parameters`` before the context has
        # exited fails on its descriptive assertion rather than with an AttributeError.
        self.module_rebuild_dict = {}
        self._nn_init_methods = self._get_nn_init_methods()
        self._torch_mod_cls = torch.nn.modules.module.Module

        func_names = list(self.tensor_set_value_func)
        if extra_torch_tensor_func:
            func_names.extend(extra_torch_tensor_func)
        # ``dict.fromkeys`` drops duplicates while keeping order. A duplicated name would be
        # patched twice, so the saved "original" would be the wrapper and unpatching would
        # never restore the real method.
        self._torch_tensor_funcs = tuple(dict.fromkeys(func_names))

    def _cache_func(self, func):
        """
        This method wraps ``func`` (a ``torch.nn.init`` function or a ``torch.Tensor`` method)
        so that the function call is cached instead of being executed.

        The wrapper returns the target tensor (the first positional argument, or the ``tensor``
        keyword argument), mirroring the convention that these functions return the tensor they
        operate on, so call sites that use the return value keep working.
        """

        def wrapped_init_func(*args, **kwargs):
            self._intercepted_init_func_cache.append(dict(func=func, args=args, kwargs=kwargs))
            if args:
                return args[0]
            return kwargs.get('tensor')

        return wrapped_init_func

    def _get_nn_init_methods(self):
        """
        This method looks for all available functions in the ``torch.nn.init``
        module.

        Returns:
            A list of ``(name, function)`` pairs. A function is kept when it is a plain Python
            function, its name is public and ends with ``_``, and at least one of its type
            hints is ``torch.Tensor``.
        """

        def _has_tensor_in_arg(func):
            hints = typing.get_type_hints(func)
            return any(hint is torch.Tensor for hint in hints.values())

        def _is_init_method(name, func):
            # the cheap name/type checks run first; type hints are resolved only for survivors
            return (isinstance(func, types.FunctionType) and not name.startswith('_') and name.endswith('_')
                    and _has_tensor_in_arg(func))

        candidates = ((name, getattr(torch.nn.init, name)) for name in dir(torch.nn.init))
        return [(name, func) for name, func in candidates if _is_init_method(name, func)]

    def _wrap_module_init(self, func):
        """
        This method wraps the calls to the `__init__` of ``torch.nn.Module`` and replaces
        the argument device with value 'meta' so that all modules are created as meta tensors.

        Every call is recorded (function, module, args, kwargs) so it can be replayed later.
        If ``func`` has no ``device`` parameter, the module is moved to ``meta`` after
        construction instead.
        """
        has_device = 'device' in inspect.signature(func).parameters

        def layer_lazy_init(module, *args, **kwargs):
            # Cache a *shallow* copy: it shields the cached kwargs from the ``device``
            # override below while keeping the identity of modules/tensors passed by keyword,
            # which ``lazy_init_parameters`` relies on to find them again.
            self._intercepted_init_func_cache.append(dict(func=func, module=module, args=args, kwargs=dict(kwargs)))
            if has_device:
                kwargs['device'] = 'meta'
            func(module, *args, **kwargs)
            if not has_device:
                module.to('meta')

        return layer_lazy_init

    def _get_tmp_origin_func_ref(self, name):
        """
        Generate a function name for consistency during caching and retrieving.
        """
        return f'_orig_{name}'

    def _patch_nn_init_funcs(self):
        """Replace every collected ``torch.nn.init`` function with its caching wrapper."""
        for name, func in self._nn_init_methods:
            setattr(torch.nn.init, name, self._cache_func(func))

    def _unpatch_nn_init_funcs(self):
        """Restore the original ``torch.nn.init`` functions."""
        for name, func in self._nn_init_methods:
            setattr(torch.nn.init, name, func)

    def _patch_submodule_init(self):
        """
        Wrap the ``__init__`` of module classes with :meth:`_wrap_module_init`, keeping the
        original under ``__orig_init__``.
        """

        def _activate_wrap_init(cls):
            cls.__orig_init__ = cls.__init__
            cls.__init__ = self._wrap_module_init(cls.__init__)

        # NOTE(review): presumably applies the callback to ``torch.nn.Module`` subclasses
        # recursively -- confirm against ``substitute_init_recursively``.
        substitute_init_recursively(self._torch_mod_cls, _activate_wrap_init)

    def _unpatch_submodule_init(self):
        """Put back the ``__init__`` saved under ``__orig_init__`` on the patched classes."""

        def _recover_orig_init(cls):
            cls.__init__ = cls.__orig_init__

        substitute_init_recursively(self._torch_mod_cls, _recover_orig_init)

    def _patch_torch_tensor_funcs(self):
        """
        Replace the configured ``torch.Tensor`` value-setting methods with caching wrappers.
        The original method is stashed on ``torch.Tensor`` under an ``_orig_``-prefixed name.
        """
        for func_name in self._torch_tensor_funcs:
            origin_func_name = self._get_tmp_origin_func_ref(func_name)
            origin_func = getattr(torch.Tensor, func_name)
            setattr(torch.Tensor, origin_func_name, origin_func)
            setattr(torch.Tensor, func_name, self._cache_func(origin_func))

    def _unpatch_torch_tensor_funcs(self):
        """Restore the ``torch.Tensor`` methods stashed by :meth:`_patch_torch_tensor_funcs`."""
        for func_name in self._torch_tensor_funcs:
            origin_func_name = self._get_tmp_origin_func_ref(func_name)
            origin_func = getattr(torch.Tensor, origin_func_name)
            setattr(torch.Tensor, func_name, origin_func)

    def __enter__(self):
        self._patch_submodule_init()
        return self

    def __exit__(self, *args, **kwargs):
        self._unpatch_submodule_init()
        # Build module_rebuild_dict in reverse order so that, for a module recorded several
        # times (each class in an inheritance chain records its own ``__init__``), the
        # earliest record -- the outermost constructor call -- is the one that is kept.
        self.module_rebuild_dict = {}
        for cache in reversed(self._intercepted_init_func_cache):
            module = cache.get('module')
            if module is None:
                # entries recorded by ``_cache_func`` carry no module and cannot be rebuilt
                continue
            self.module_rebuild_dict[module] = (cache['func'], cache['args'], cache['kwargs'])

    def lazy_init_parameters(self,
                             model: torch.nn.Module,
                             device='cpu',
                             call_back: typing.Optional[Callable] = None):
        """
        Initialize the weights of the meta-tensor model by replaying its recorded constructor
        call with real arguments.

        Args:
            model (`torch.nn.Module`): the model instantiated under the context.
            device (str): the device on which weights are initialized. It replaces the value
                of a recorded ``device`` keyword argument and is used for materialized tensors.
            call_back (Callable, optional): called with every tensor that is materialized
                from a meta tensor, e.g. to shard it.

        Returns:
            The same ``model`` object, after its recorded ``__init__`` has been re-executed.

        Raises:
            AssertionError: if ``model`` was not constructed under this context.
        """
        # map tensor identity -> qualified name for all parameters and buffers of the model
        param_id_to_name = dict()
        for name, param in model.named_parameters():
            param_id_to_name[id(param)] = name
        for name, buffer in model.named_buffers():
            param_id_to_name[id(buffer)] = name

        assert model in self.module_rebuild_dict, 'We only support rebuild modules which intercepted during initializing by us.'

        def _process_arg(arg):
            """
            Process args recursively. If arg is a torch.nn.Module instance in module_rebuild_dict,
            we need to rebuild it with real parameters. If arg is a tuple or list, we will process
            the element of arg with this function again.
            """
            if torch.is_tensor(arg):
                if id(arg) in param_id_to_name:
                    return _replace_meta_param_with_real_param(arg)
                return arg
            if isinstance(arg, torch.nn.Module):
                if arg in self.module_rebuild_dict:
                    return self.lazy_init_parameters(model=arg, device=device, call_back=call_back)
                return arg
            if isinstance(arg, (tuple, list)):
                processed = [_process_arg(element) for element in arg]
                # keep tuples as tuples: some constructors type-check their arguments
                return tuple(processed) if isinstance(arg, tuple) else processed
            return arg

        def _replace_meta_param_with_real_param(meta_param):
            # ``is_meta`` is used because comparing a ``torch.device`` with the string 'meta'
            # is never equal, which would make this function a no-op.
            if not meta_param.is_meta:
                return meta_param
            param_full_name = param_id_to_name[id(meta_param)]
            real_param = torch.empty_like(meta_param, dtype=meta_param.dtype, device=device)
            real_param = ColoParameter(real_param, requires_grad=meta_param.requires_grad)

            if '.' in param_full_name:
                submodule_name, param_name = param_full_name.rsplit('.', 1)
                submodule = model.get_submodule(submodule_name)
            else:
                submodule = model
                param_name = param_full_name
            setattr(submodule, param_name, real_param)

            # execute call_back function on the materialized tensor
            # this is where sharding can come in
            if call_back:
                call_back(real_param)
            return real_param

        func, args, kwargs = self.module_rebuild_dict[model]

        # Build fresh containers so the recorded call stays untouched and can be replayed.
        real_args = [_process_arg(arg) for arg in args]
        real_kwargs = {
            arg_name: device if arg_name == 'device' else _process_arg(arg) for arg_name, arg in kwargs.items()
        }

        # build user specified model
        with torch.no_grad():
            func(model, *real_args, **real_kwargs)

        return model