[lazy] refactor lazy init (#3891)

* [lazy] remove old lazy init
* [lazy] refactor lazy init folder structure
* [lazy] fix lazy tensor deepcopy
* [test] update lazy init test
2023-06-05 14:20:47 +08:00 · 2023-06-05 14:20:47 +08:00 · dbb32692d2
parent 70c8cdecf4
commit dbb32692d2
9 changed files with 56 additions and 324 deletions
--- a/colossalai/lazy/__init__.py
+++ b/colossalai/lazy/__init__.py
@ -0,0 +1,6 @@
+from .lazy_init import LazyInitContext, LazyTensor
+
+__all__ = [
+    'LazyInitContext',
+    'LazyTensor',
+]
--- a/colossalai/utils/model/experimental.py
+++ b/colossalai/lazy/lazy_init.py
@ -350,6 +350,13 @@ class LazyTensor(torch.Tensor):
                copied.requires_grad_()
            return copied

+        if self._materialized_data is not None:
+            # self is early materialized
+            copied = self._materialized_data.detach().clone()
+            if self.requires_grad:
+                copied.requires_grad_()
+            target = LazyTensor(lambda: None, concrete_data=copied)
+        else:
            target = LazyTensor(factory_fn, meta_data=self._meta_data)

        memo[id(self)] = target
--- a/colossalai/utils/model/lazy_init_context.py
+++ b/colossalai/utils/model/lazy_init_context.py
@ -1,242 +0,0 @@
-#!/usr/bin/env python
-# coding: utf-8
-
-import inspect
-import types
-from typing import Callable, List
-
-import torch
-import torch.nn as nn
-
-from colossalai.tensor import ColoParameter, ColoTensor
-from colossalai.utils.model.utils import substitute_init_recursively
-
-
-class LazyInitContext():
-    """
-    A context to allow for lazy weight initialization of PyTorch modules. It intercepts the tensor
-    initialization functions for lazy initialization
-
-    Note:
-        This API is only experimental and subject to future changes.
-
-    Usage:
-        with LazyInitContext() as ctx:
-            model = nn.Linear(10, 10)
-            model.weight.zero_()
-
-        # make sure the weight is a meta tensor
-        assert model.weight.is_meta
-
-        # initialize weights
-        ctx.lazy_init_parameters(model)
-
-        # make sure the weight is not a meta tensor
-        # and initialized correctly
-        assert not model.weight.is_meta and torch.all(model.weight == 0)
-
-    Args:
-        to_meta (bool): optional, whether to initialize the model with meta tensors, default is True. This
-            argument exists for now because some corner cases such as self.weight = torch.zeros(...) cannot be captured yet.
-        extra_torch_tensor_func (List[str]): extra torch tensor functions related
-            to value setting, such as `zero_` and `triu_`. `zero_` is pre-added by default.
-    """
-
-    tensor_set_value_func = ['zero_', 'fill_']
-
-    def __init__(self, to_meta: bool = True, extra_torch_tensor_func: List[str] = None):
-        # TODO: hijack the torch constructor functions as well
-        self._to_meta = to_meta
-        self._intercepted_nn_init_func_cache = {}
-        self._nn_init_methods = self._get_nn_init_methods()
-        self._torch_mod_cls = torch.nn.modules.module.Module
-
-        if extra_torch_tensor_func:
-            # use tuple to remove duplicates
-            self._torch_tensor_funcs = tuple(self.tensor_set_value_func + extra_torch_tensor_func)
-        else:
-            self._torch_tensor_funcs = self.tensor_set_value_func
-
-    @property
-    def to_meta(self):
-        return self._to_meta
-
-    def _cache_init_func(self, func):
-        """
-        This method wraps the ``torch.nn.init`` method and torch tensor value-setting functions
-        so that the function call is cached instead of being executed.
-        """
-
-        def wrapped_init_func(tensor, *args, **kwargs):
-            if tensor not in self._intercepted_nn_init_func_cache:
-                self._intercepted_nn_init_func_cache[tensor] = []
-            self._intercepted_nn_init_func_cache[tensor].append((func, args, kwargs))
-
-        return wrapped_init_func
-
-    def _get_nn_init_methods(self):
-        """
-        This method looks for all available functions in the ``torch.nn.init``
-        module.
-        """
-        nn_init_method_names = dir(torch.nn.init)
-        nn_init_methods = []
-
-        # look for all methods in ``torch.nn.init`` module
-        for name in nn_init_method_names:
-            nn_init_methods.append((name, getattr(torch.nn.init, name)))
-
-        def _is_init_method(item):
-            name, func = item
-
-            if (not isinstance(func, types.FunctionType) or name.startswith('_') or not name.endswith('_')):
-                return False
-            else:
-                return True
-
-        # remove methods which are not init functions
-        nn_init_methods = list(filter(_is_init_method, nn_init_methods))
-        return nn_init_methods
-
-    def _wrap_module_init(self, func):
-        """
-        This method wraps the calls to the `__init__` of ``torch.nn.Module`` and replaces
-        the argument device with value 'meta' so that all modules are created as meta tensors.
-        """
-        has_device = 'device' in inspect.signature(func).parameters
-
-        def layer_lazy_init(module, *args, **kwargs):
-            # if this module contains device argument
-            # we set it to meta to initialize as meta backend
-            if has_device:
-                kwargs['device'] = 'meta'
-            func(module, *args, **kwargs)
-
-            # if device is not found, we intialize it and convert to meta
-            if not has_device:
-                module.to('meta')
-
-        return layer_lazy_init
-
-    def _get_tmp_origin_func_ref(self, name):
-        """
-        Generate a function name for consistency during caching and retrieving.
-        """
-        return f'_orig_{name}'
-
-    def _patch_nn_init_funcs(self):
-        # patch nn.init functions
-        for name, func in self._nn_init_methods:
-            setattr(torch.nn.init, name, self._cache_init_func(func))
-
-    def _unpatch_nn_init_funcs(self):
-        # unpatch nn.init functions
-        for name, func in self._nn_init_methods:
-            setattr(torch.nn.init, name, func)
-
-    def _patch_submodule_init(self):
-        # patch classes __init__ methods
-        def _activate_wrap_init(cls):
-            cls.__orig_init__ = cls.__init__
-            cls.__init__ = self._wrap_module_init(cls.__init__)
-
-        substitute_init_recursively(self._torch_mod_cls, _activate_wrap_init, set())
-
-    def _unpatch_submodule_init(self):
-
-        def _recover_orig_init(cls):
-            cls.__init__ = cls.__orig_init__
-
-        substitute_init_recursively(self._torch_mod_cls, _recover_orig_init, set())
-
-    def _patch_torch_tensor_funcs(self):
-        # patch tensor value-setting functions
-        for func_name in self._torch_tensor_funcs:
-            origin_func_name = self._get_tmp_origin_func_ref(func_name)
-            origin_func = getattr(torch.Tensor, func_name)
-            setattr(torch.Tensor, origin_func_name, origin_func)
-            setattr(torch.Tensor, func_name, self._cache_init_func(origin_func))
-
-    def _unpatch_torch_tensor_funcs(self):
-        for func_name in self._torch_tensor_funcs:
-            origin_func_name = self._get_tmp_origin_func_ref(func_name)
-            origin_func = getattr(torch.Tensor, origin_func_name)
-            setattr(torch.Tensor, func_name, origin_func)
-
-    def __enter__(self):
-        self._patch_torch_tensor_funcs()
-        self._patch_nn_init_funcs()
-
-        if self._to_meta:
-            self._patch_submodule_init()
-        return self
-
-    def __exit__(self, *args, **kwargs):
-        if self._to_meta:
-            self._unpatch_submodule_init()
-        self._unpatch_nn_init_funcs()
-        self._unpatch_torch_tensor_funcs()
-
-    def lazy_init_parameters(self, model: torch.nn.Module, device='cpu'):
-        """
-        Initialize the weights of the meta-tensor model.
-
-        Args:
-            model (`torch.nn.Module`): the model instantiated under the context.
-            device (str): the device on which weights are initialized
-
-        """
-
-        def _init_recursively(module: nn.Module):
-            # recursively initialize the module
-            for mod in module.children():
-                _init_recursively(mod)
-
-            # initialize and shard tensors directly attached to the current module
-            for name, param in module.named_parameters(recurse=False):
-                _init_and_shard(module, name, param)
-
-            for name, buf in module.named_buffers(recurse=False):
-                _init_and_shard(module, name, buf)
-
-        @torch.no_grad()
-        def _init_and_shard(module, name, tensor):
-            # check whether the tensor is a buffer or parameter
-            is_param = isinstance(tensor, nn.parameter.Parameter)
-
-            # get sharding spec
-            dist_spec = getattr(tensor, 'dist_spec', None)
-            pg = getattr(tensor, 'pg', None)
-            comp_spec = getattr(tensor, 'comp_spec', None)
-
-            # convert the tensor from meta to materialized one
-            if tensor.is_meta:
-                materialized_tensor = torch.empty_like(tensor, device=device)
-                # if this tensor is a meta tensor, it must have an init function
-                assert tensor in self._intercepted_nn_init_func_cache
-            else:
-                materialized_tensor = tensor
-
-            # apply init function
-            if tensor in self._intercepted_nn_init_func_cache:
-                init_func, args, kwargs = self._intercepted_nn_init_func_cache[tensor][-1]
-                init_func(materialized_tensor, *args, **kwargs)
-
-            # convert it to ColoTensor or ColoParameter
-            if is_param:
-                tensor = ColoParameter.from_torch_tensor(materialized_tensor, requires_grad=tensor.requires_grad)
-            else:
-                tensor = ColoTensor.from_torch_tensor(materialized_tensor)
-
-            # override the original tensor
-            with torch.no_grad():
-                setattr(module, name, tensor)
-
-            # apply sharding
-            if dist_spec:
-                tensor.process_group = pg
-                tensor.set_tensor_spec(dist_spec, comp_spec)
-
-        _init_recursively(model)
-
-        return model
--- a/colossalai/zero/gemini/gemini_ddp.py
+++ b/colossalai/zero/gemini/gemini_ddp.py
@ -2,13 +2,14 @@ import itertools
 from collections import OrderedDict
 from contextlib import nullcontext
 from functools import partial
-from typing import Dict, Iterator, List, Optional, Union, Tuple, Set
+from typing import Dict, Iterator, List, Optional, Set, Tuple, Union

 import torch
 import torch.distributed as dist
 import torch.nn as nn

 from colossalai.checkpoint_io.utils import calculate_tensor_size
+from colossalai.lazy import LazyTensor
 from colossalai.logging import get_dist_logger
 from colossalai.nn.parallel.data_parallel import ColoDDP, _cast_float, free_storage
 from colossalai.tensor import ProcessGroup as ColoProcessGroup
@ -16,7 +17,6 @@ from colossalai.tensor import ReplicaSpec
 from colossalai.tensor.colo_parameter import ColoParameter, ColoTensor, ColoTensorSpec
 from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from colossalai.utils import get_current_device, is_ddp_ignored
-from colossalai.utils.model.experimental import LazyTensor

 from .chunk import Chunk, ChunkManager, TensorState, init_chunk_manager
 from .gemini_hook import GeminiZeROHook
@ -99,8 +99,11 @@ class ZeroDDP(ColoDDP):
        self._non_persistent_buffers_set = self._get_non_persistent_buffers_set(module)
        self._cast_buffers()

-    def _get_non_persistent_buffers_set(self, module, memo: Optional[Set[nn.Module]] = None, prefix: str = '', remove_duplicate: bool = True):
-
+    def _get_non_persistent_buffers_set(self,
+                                        module,
+                                        memo: Optional[Set[nn.Module]] = None,
+                                        prefix: str = '',
+                                        remove_duplicate: bool = True):
        r"""
        Args:
            memo: a memo to store the set of modules already added to the result
@ -115,16 +118,17 @@ class ZeroDDP(ColoDDP):
        if module not in memo:
            if remove_duplicate:
                memo.add(module)
-                self_non_persistent_set = set(map(lambda key: prefix + ('.' if prefix else '') + key, module._non_persistent_buffers_set))
+            self_non_persistent_set = set(
+                map(lambda key: prefix + ('.' if prefix else '') + key, module._non_persistent_buffers_set))
            for name, sub_module in module._modules.items():
                if sub_module is None:
                    continue
                submodule_prefix = prefix + ('.' if prefix else '') + name
-                    child_non_persistent_set = self._get_non_persistent_buffers_set(sub_module, memo, submodule_prefix, remove_duplicate)
+                child_non_persistent_set = self._get_non_persistent_buffers_set(sub_module, memo, submodule_prefix,
+                                                                                remove_duplicate)
                self_non_persistent_set = set.union(self_non_persistent_set, child_non_persistent_set)
        return self_non_persistent_set

-
    def _post_forward(self):
        """This function is only triggered for inference.
        """
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@ -8,10 +8,10 @@ import colossalai
 from colossalai.booster import Booster
 from colossalai.booster.plugin import GeminiPlugin
 from colossalai.fx import is_compatible_with_meta
+from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from colossalai.utils.model.experimental import LazyInitContext
 from colossalai.zero import ColoInitContext
 from tests.kit.model_zoo import model_zoo

--- a/tests/test_utils/test_lazy_init/lazy_init_utils.py
+++ b/tests/test_utils/test_lazy_init/lazy_init_utils.py
@ -1,12 +1,13 @@
 import random
+from copy import deepcopy
 from typing import Any, Callable, Optional, Tuple

 import numpy as np
 import torch
 from packaging import version

+from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor
 from colossalai.tensor.d_tensor.layout_converter import to_global
-from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor
 from tests.kit.model_zoo.registry import ModelAttribute

 SUPPORT_LAZY = version.parse(torch.__version__) >= version.parse('1.12.0')
@ -31,6 +32,9 @@ def assert_model_equal(m1: torch.nn.Module, m2: torch.nn.Module) -> None:
        assert n1 == n2
        assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}'

+    for p1, p2 in zip(m1.parameters(), m2.parameters()):
+        assert p1.requires_grad == p2.requires_grad
+

 def assert_forward_equal(m1: torch.nn.Module, m2: torch.nn.Module, data_gen_fn: Callable[[], dict],
                         output_transform_fn: Callable[[Any], dict]) -> None:
@ -65,10 +69,14 @@ def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False,
    ctx = LazyInitContext()
    with ctx:
        deferred_model = model_fn()
+        copied_deferred_model = deepcopy(deferred_model)
    deferred_model = ctx.materialize(deferred_model, verbose=verbose)
+    copied_deferred_model = ctx.materialize(copied_deferred_model, verbose=verbose)
    assert_model_equal(model, deferred_model)
+    assert_model_equal(deferred_model, copied_deferred_model)
    if check_forward:
        assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn)
+        assert_forward_equal(deferred_model, copied_deferred_model, data_gen_fn, output_transform_fn)
    if verbose:
        print(f'{model.__class__.__name__} pass')

--- a/tests/test_utils/test_lazy_init/test_distribute.py
+++ b/tests/test_utils/test_lazy_init/test_distribute.py
@ -12,7 +12,7 @@ from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 from colossalai.utils.common import print_rank_0

 try:
-    from colossalai.utils.model.experimental import LazyInitContext, LazyTensor, _MyTensor
+    from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor
 except:
    pass
 from lazy_init_utils import SUPPORT_LAZY, assert_dist_model_equal, set_seed
--- a/tests/test_utils/test_lazy_init/test_models.py
+++ b/tests/test_utils/test_lazy_init/test_models.py
--- a/tests/test_utils/test_lazy_init_ctx.py
+++ b/tests/test_utils/test_lazy_init_ctx.py
@ -1,51 +0,0 @@
-import torch
-from colossalai.utils.model.lazy_init_context import LazyInitContext
-from torchvision.models import resnet34
-import random
-import numpy as np
-
-MANUAL_SEED = 0
-random.seed(MANUAL_SEED)
-np.random.seed(MANUAL_SEED)
-torch.manual_seed(MANUAL_SEED)
-
-
-def test_lazy_init_with_meta():
-    ctx = LazyInitContext(to_meta=True)
-    with ctx:
-        model = resnet34(num_classes=10)
-
-    for param in model.parameters():
-        assert param.is_meta
-    for buffer in model.buffers():
-        assert buffer.is_meta
-
-    ctx.lazy_init_parameters(model)
-
-    for name, param in model.named_parameters():
-        assert not param.is_meta, name
-
-    for buffer in model.buffers():
-        assert not buffer.is_meta
-
-
-def test_lazy_init_without_meta():
-    ctx = LazyInitContext(to_meta=False)
-    with ctx:
-        model = resnet34(num_classes=10)
-
-    for param in model.parameters():
-        assert not param.is_meta
-    for buffer in model.buffers():
-        assert not buffer.is_meta
-
-    conv1_weight_before_init = model.conv1.weight.clone()
-    ctx.lazy_init_parameters(model)
-    conv1_weight_after_init = model.conv1.weight.clone()
-
-    assert not torch.allclose(conv1_weight_after_init, conv1_weight_before_init)
-
-
-if __name__ == '__main__':
-    test_lazy_init_with_meta()
-    test_lazy_init_without_meta()