ColossalAI/tests/test_lazy/lazy_init_utils.py

import random
from copy import deepcopy
from typing import Any, Callable, Optional, Tuple

import numpy as np
import torch
from packaging import version

from colossalai.device.device_mesh import DeviceMesh
from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor
from colossalai.tensor.d_tensor import to_global
from colossalai.tensor.d_tensor.layout import Layout
from tests.kit.model_zoo.registry import ModelAttribute

# True when the installed torch version is at least 1.12.0.
SUPPORT_LAZY = version.parse(torch.__version__) >= version.parse('1.12.0')

# model_fn, data_gen_fn, output_transform_fn, <fourth field, unused in this file>, model_attr
# NOTE(review): `check_lazy_init` unpacks five fields from an entry, so the alias describes a
# 5-tuple. The fourth field is never read here, hence `Any` - confirm its real type against the
# model zoo registry. `output_transform_fn` receives the raw model output as its only argument.
TestingEntry = Tuple[Callable[[], torch.nn.Module], Callable[[], dict], Callable[[Any], dict], Any,
                     Optional[ModelAttribute]]


def set_seed(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Value passed unchanged to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``, in that order.
    """
    for seed_generator in (random.seed, np.random.seed, torch.manual_seed):
        seed_generator(seed)


def assert_model_equal(m1: torch.nn.Module, m2: torch.nn.Module) -> None:
    """Assert that two modules carry identical state.

    The two state dicts must have the same number of entries, the same entry names in
    the same order, and exactly equal tensors (``torch.equal``). Parameters, paired
    positionally, must also agree on ``requires_grad``.

    Args:
        m1: First module to compare.
        m2: Second module to compare.

    Raises:
        AssertionError: On the first mismatch encountered.
    """
    state_a, state_b = m1.state_dict(), m2.state_dict()
    size_a, size_b = len(state_a), len(state_b)
    assert size_a == size_b, f'len {size_a} vs {size_b}'

    for entry_a, entry_b in zip(state_a.items(), state_b.items()):
        name_a, tensor_a = entry_a
        name_b, tensor_b = entry_b
        assert name_a == name_b
        assert torch.equal(tensor_a, tensor_b), f'{name_a} {tensor_a} vs {tensor_b}'

    # zip() stops at the shorter parameter sequence, so only positional pairs are compared.
    assert all(param_a.requires_grad == param_b.requires_grad
               for param_a, param_b in zip(m1.parameters(), m2.parameters()))


def assert_forward_equal(m1: torch.nn.Module, m2: torch.nn.Module, data_gen_fn: Callable[[], dict],
                         output_transform_fn: Callable[[Any], dict]) -> None:
    """Assert that two modules produce matching outputs for the same input.

    Both modules are switched to eval mode (and left that way), fed the same keyword
    arguments under ``torch.no_grad()``, and their transformed outputs are compared
    key by key with ``torch.allclose(..., atol=1e-5)``.

    Args:
        m1: First module; its class name is used in the failure message.
        m2: Second module.
        data_gen_fn: Called once; returns the keyword arguments passed to both modules.
        output_transform_fn: Maps a raw module output to a dict of tensors.

    Raises:
        AssertionError: If the transformed outputs differ in length or any pair of
            values is not close.
        KeyError: If a key of the first transformed output is missing from the second.
    """
    inputs = data_gen_fn()

    for module in (m1, m2):
        module.eval()

    # Forward passes only; gradients are not needed for the comparison.
    with torch.no_grad():
        raw_first = m1(**inputs)
        raw_second = m2(**inputs)

    first_results = output_transform_fn(raw_first)
    second_results = output_transform_fn(raw_second)
    assert len(first_results) == len(second_results)

    for key, out1 in first_results.items():
        out2 = second_results[key]
        outputs_match = torch.allclose(out1, out2, atol=1e-5)
        assert outputs_match, f'{m1.__class__.__name__} has inconsistent outputs, {out1} vs {out2}'


def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False, check_forward: bool = False) -> None:
    """Check that a lazily built model matches its reference after materialization.

    The model is built three ways from the same ``model_fn``:

    * under ``LazyInitContext(tensor_cls=_MyTensor)`` (the reference model),
    * under a default ``LazyInitContext`` and then materialized,
    * as a ``deepcopy`` of the still-deferred model, then materialized.

    The reference is compared with the materialized model, and the materialized model
    with its materialized copy.

    Args:
        entry: Five-field tuple; only ``model_fn``, ``data_gen_fn`` and
            ``output_transform_fn`` (the first three fields) are used.
        seed: Seed applied by the ``_pre_op_fn`` hooks installed on both tensor classes.
        verbose: Forwarded to ``materialize``; also prints a pass message at the end.
        check_forward: When true, forward outputs are compared as well as the state.

    Raises:
        AssertionError: If any comparison fails.
    """
    model_fn, data_gen_fn, output_transform_fn, _, _ = entry

    def reseed(*_ignored) -> None:
        set_seed(seed)

    # NOTE: the hooks are set on the classes themselves and are not removed afterwards.
    _MyTensor._pre_op_fn = reseed
    LazyTensor._pre_op_fn = reseed

    reference_ctx = LazyInitContext(tensor_cls=_MyTensor)
    with reference_ctx:
        model = model_fn()

    lazy_ctx = LazyInitContext()
    with lazy_ctx:
        deferred_model = model_fn()
        # The copy is taken while the model is still deferred, inside the context.
        copied_deferred_model = deepcopy(deferred_model)

    deferred_model = lazy_ctx.materialize(deferred_model, verbose=verbose)
    copied_deferred_model = lazy_ctx.materialize(copied_deferred_model, verbose=verbose)

    model_pairs = ((model, deferred_model), (deferred_model, copied_deferred_model))
    for expected, actual in model_pairs:
        assert_model_equal(expected, actual)

    if check_forward:
        for expected, actual in model_pairs:
            assert_forward_equal(expected, actual, data_gen_fn, output_transform_fn)

    if verbose:
        print(f'{model.__class__.__name__} pass')


def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.Module, device_mesh: DeviceMesh,
                            sharding_spec_dict: dict) -> None:
    """Assert that a distributed model's state matches a reference model's state.

    State dict entries are paired positionally and must share names. Both tensors are
    moved to CUDA. When the entry name is a key of ``sharding_spec_dict``, the
    distributed tensor is given a ``Layout`` and passed through ``to_global`` before the
    exact ``torch.equal`` comparison (presumably to rebuild the full tensor from its
    shard - confirm against ``to_global``).

    Args:
        model: Reference module.
        distributed_model: Module whose state is compared against the reference.
        device_mesh: Device mesh used to build the ``Layout`` of sharded entries.
        sharding_spec_dict: Maps state dict entry names to their sharding specs.

    Raises:
        AssertionError: If entry counts, names or tensor values differ.
    """
    reference_state = model.state_dict()
    sharded_state = distributed_model.state_dict()

    reference_size, sharded_size = len(reference_state), len(sharded_state)
    assert reference_size == sharded_size, f'len {reference_size} vs {sharded_size}'

    paired_entries = zip(reference_state.items(), sharded_state.items())
    for (ref_name, ref_tensor), (dist_name, dist_tensor) in paired_entries:
        assert ref_name == dist_name
        ref_tensor = ref_tensor.cuda()
        dist_tensor = dist_tensor.cuda()
        if dist_name in sharding_spec_dict:
            # The layout's global shape is taken from the reference tensor.
            dist_tensor.dist_layout = Layout(device_mesh=device_mesh,
                                             sharding_spec=sharding_spec_dict[dist_name],
                                             global_shape=ref_tensor.shape)
            dist_tensor = to_global(dist_tensor)
        assert torch.equal(ref_tensor, dist_tensor), f'{ref_name} {ref_tensor} vs {dist_tensor}'
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`import random`
[lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test 2023-06-05 06:20:47 +00:00			`from copy import deepcopy`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`from typing import Any, Callable, Optional, Tuple`

			`import numpy as np`
			`import torch`
[devops] update torch version of CI (#3725) * [test] fix flop tensor test * [test] fix autochunk test * [test] fix lazyinit test * [devops] update torch version of CI * [devops] enable testmon * [devops] fix ci * [devops] fix ci * [test] fix checkpoint io test * [test] fix cluster test * [test] fix timm test * [devops] fix ci * [devops] fix ci * [devops] fix ci * [devops] fix ci * [devops] force sync to test ci * [test] skip fsdp test 2023-05-15 09:20:56 +00:00			`from packaging import version`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00
[test] fixed tests failed due to dtensor change (#4082) * [test] fixed tests failed due to dtensor change * polish code 2023-06-26 07:50:07 +00:00			`from colossalai.device.device_mesh import DeviceMesh`
[lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test 2023-06-05 06:20:47 +00:00			`from colossalai.lazy.lazy_init import LazyInitContext, LazyTensor, _MyTensor`
[shardformer] support module saving and loading (#4062) * [shardformer] support module saving and loading * polish code 2023-06-22 03:42:11 +00:00			`from colossalai.tensor.d_tensor import to_global`
			`from colossalai.tensor.d_tensor.layout import Layout`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`from tests.kit.model_zoo.registry import ModelAttribute`

[devops] update torch version of CI (#3725) * [test] fix flop tensor test * [test] fix autochunk test * [test] fix lazyinit test * [devops] update torch version of CI * [devops] enable testmon * [devops] fix ci * [devops] fix ci * [test] fix checkpoint io test * [test] fix cluster test * [test] fix timm test * [devops] fix ci * [devops] fix ci * [devops] fix ci * [devops] fix ci * [devops] force sync to test ci * [test] skip fsdp test 2023-05-15 09:20:56 +00:00			`SUPPORT_LAZY = version.parse(torch.__version__) >= version.parse('1.12.0')`

[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`# model_fn, data_gen_fn, output_transform_fn, model_attr`
			`TestingEntry = Tuple[Callable[[], torch.nn.Module], Callable[[], dict], Callable[[], dict], Optional[ModelAttribute]]`


			`def set_seed(seed: int) -> None:`
			`random.seed(seed)`
			`np.random.seed(seed)`
			`torch.manual_seed(seed)`


[CI] fix some spelling errors (#3707) * fix spelling error with examples/comminity/ * fix spelling error with tests/ * fix some spelling error with tests/ colossalai/ etc. 2023-05-10 09:12:03 +00:00			`def assert_model_equal(m1: torch.nn.Module, m2: torch.nn.Module) -> None:`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`s1 = m1.state_dict()`
			`s2 = m2.state_dict()`

			`assert len(s1) == len(s2), f'len {len(s1)} vs {len(s2)}'`

			`for (n1, t1), (n2, t2) in zip(s1.items(), s2.items()):`
			`assert n1 == n2`
			`assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}'`

[lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test 2023-06-05 06:20:47 +00:00			`for p1, p2 in zip(m1.parameters(), m2.parameters()):`
			`assert p1.requires_grad == p2.requires_grad`

[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00
			`def assert_forward_equal(m1: torch.nn.Module, m2: torch.nn.Module, data_gen_fn: Callable[[], dict],`
			`output_transform_fn: Callable[[Any], dict]) -> None:`
			`data = data_gen_fn()`

			`m1.eval()`
			`m2.eval()`
			`# run forward`
			`with torch.no_grad():`
			`outputs1 = m1(**data)`
			`outputs2 = m2(**data)`

			`# compare output`
			`transformed_out1 = output_transform_fn(outputs1)`
			`transformed_out2 = output_transform_fn(outputs2)`

			`assert len(transformed_out1) == len(transformed_out2)`

			`for key, out1 in transformed_out1.items():`
			`out2 = transformed_out2[key]`
			`assert torch.allclose(out1, out2, atol=1e-5), \`
			`f'{m1.__class__.__name__} has inconsistent outputs, {out1} vs {out2}'`


			`def check_lazy_init(entry: TestingEntry, seed: int = 42, verbose: bool = False, check_forward: bool = False) -> None:`
[shardformer] adapted T5 and LLaMa test to use kit (#4049) * [shardformer] adapted T5 and LLaMa test to use kit * polish code 2023-06-21 01:32:46 +00:00			`model_fn, data_gen_fn, output_transform_fn, _, model_attr = entry`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`_MyTensor._pre_op_fn = lambda *args: set_seed(seed)`
			`LazyTensor._pre_op_fn = lambda *args: set_seed(seed)`
			`ctx = LazyInitContext(tensor_cls=_MyTensor)`
			`with ctx:`
			`model = model_fn()`
			`ctx = LazyInitContext()`
			`with ctx:`
			`deferred_model = model_fn()`
[lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test 2023-06-05 06:20:47 +00:00			`copied_deferred_model = deepcopy(deferred_model)`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`deferred_model = ctx.materialize(deferred_model, verbose=verbose)`
[lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test 2023-06-05 06:20:47 +00:00			`copied_deferred_model = ctx.materialize(copied_deferred_model, verbose=verbose)`
[CI] fix some spelling errors (#3707) * fix spelling error with examples/comminity/ * fix spelling error with tests/ * fix some spelling error with tests/ colossalai/ etc. 2023-05-10 09:12:03 +00:00			`assert_model_equal(model, deferred_model)`
[lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test 2023-06-05 06:20:47 +00:00			`assert_model_equal(deferred_model, copied_deferred_model)`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`if check_forward:`
			`assert_forward_equal(model, deferred_model, data_gen_fn, output_transform_fn)`
[lazy] refactor lazy init (#3891) * [lazy] remove old lazy init * [lazy] refactor lazy init folder structure * [lazy] fix lazy tensor deepcopy * [test] update lazy init test 2023-06-05 06:20:47 +00:00			`assert_forward_equal(deferred_model, copied_deferred_model, data_gen_fn, output_transform_fn)`
[lazyinit] add correctness verification (#3147) * [lazyinit] fix shared module * [tests] add lazy init test utils * [tests] add torchvision for lazy init * [lazyinit] fix pre op fn * [lazyinit] handle legacy constructor * [tests] refactor lazy init test models * [tests] refactor lazy init test utils * [lazyinit] fix ops don't support meta * [tests] lazy init test timm models * [lazyinit] fix set data * [lazyinit] handle apex layers * [tests] lazy init test transformers models * [tests] lazy init test torchaudio models * [lazyinit] fix import path * [tests] lazy init test torchrec models * [tests] update torch version in CI * [tests] revert torch version in CI * [tests] skip lazy init test 2023-03-17 05:49:04 +00:00			`if verbose:`
			`print(f'{model.__class__.__name__} pass')`
[lazyinit] combine lazy tensor with dtensor (#3204) * [lazyinit] lazy tensor add distribute * [lazyinit] refactor distribute * [lazyinit] add test dist lazy init * [lazyinit] add verbose info for dist lazy init * [lazyinit] fix rnn flatten weight op * [lazyinit] polish test * [lazyinit] polish test * [lazyinit] fix lazy tensor data setter * [lazyinit] polish test * [lazyinit] fix clean * [lazyinit] make materialize inplace * [lazyinit] refactor materialize * [lazyinit] refactor test distribute * [lazyinit] fix requires_grad * [lazyinit] fix tolist after materialization * [lazyinit] refactor distribute module * [lazyinit] polish docstr * [lazyinit] polish lazy init context * [lazyinit] temporarily skip test * [lazyinit] polish test * [lazyinit] add docstr 2023-03-23 02:53:06 +00:00

[test] fixed tests failed due to dtensor change (#4082) * [test] fixed tests failed due to dtensor change * polish code 2023-06-26 07:50:07 +00:00			`def assert_dist_model_equal(model: torch.nn.Module, distributed_model: torch.nn.Module, device_mesh: DeviceMesh,`
			`sharding_spec_dict: dict) -> None:`
[lazyinit] combine lazy tensor with dtensor (#3204) * [lazyinit] lazy tensor add distribute * [lazyinit] refactor distribute * [lazyinit] add test dist lazy init * [lazyinit] add verbose info for dist lazy init * [lazyinit] fix rnn flatten weight op * [lazyinit] polish test * [lazyinit] polish test * [lazyinit] fix lazy tensor data setter * [lazyinit] polish test * [lazyinit] fix clean * [lazyinit] make materialize inplace * [lazyinit] refactor materialize * [lazyinit] refactor test distribute * [lazyinit] fix requires_grad * [lazyinit] fix tolist after materialization * [lazyinit] refactor distribute module * [lazyinit] polish docstr * [lazyinit] polish lazy init context * [lazyinit] temporarily skip test * [lazyinit] polish test * [lazyinit] add docstr 2023-03-23 02:53:06 +00:00			`state = model.state_dict()`
			`distributed_state = distributed_model.state_dict()`

			`assert len(state) == len(distributed_state), f'len {len(state)} vs {len(distributed_state)}'`

			`for (n1, t1), (n2, t2) in zip(state.items(), distributed_state.items()):`
			`assert n1 == n2`
			`t1 = t1.cuda()`
			`t2 = t2.cuda()`
[shardformer] support module saving and loading (#4062) * [shardformer] support module saving and loading * polish code 2023-06-22 03:42:11 +00:00			`if n2 in sharding_spec_dict:`
			`layout = Layout(device_mesh=device_mesh, sharding_spec=sharding_spec_dict[n2], global_shape=t1.shape)`
			`t2.dist_layout = layout`
			`t2 = to_global(t2)`
[lazyinit] combine lazy tensor with dtensor (#3204) * [lazyinit] lazy tensor add distribute * [lazyinit] refactor distribute * [lazyinit] add test dist lazy init * [lazyinit] add verbose info for dist lazy init * [lazyinit] fix rnn flatten weight op * [lazyinit] polish test * [lazyinit] polish test * [lazyinit] fix lazy tensor data setter * [lazyinit] polish test * [lazyinit] fix clean * [lazyinit] make materialize inplace * [lazyinit] refactor materialize * [lazyinit] refactor test distribute * [lazyinit] fix requires_grad * [lazyinit] fix tolist after materialization * [lazyinit] refactor distribute module * [lazyinit] polish docstr * [lazyinit] polish lazy init context * [lazyinit] temporarily skip test * [lazyinit] polish test * [lazyinit] add docstr 2023-03-23 02:53:06 +00:00			`assert torch.equal(t1, t2), f'{n1} {t1} vs {t2}'`