# ColossalAI/tests/test_gemini/test_param_op.py

import copy

import torch

from colossalai.gemini.paramhooks import BaseParamHookMgr
from tests.components_to_test.registry import non_distributed_component_funcs


def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:
    """Report whether two tensors are element-wise close.

    Args:
        tensor_a: First tensor to compare.
        tensor_b: Second tensor to compare.
        loose: When truthy, compare with ``atol=1e-3`` and ``rtol=1e-3``;
            otherwise rely on the default tolerances of ``torch.allclose``.

    Returns:
        The result of ``torch.allclose`` under the selected tolerances.
    """
    tolerances = {"atol": 1e-3, "rtol": 1e-3} if loose else {}
    return torch.allclose(tensor_a, tensor_b, **tolerances)


def run_model(model, inputs, label, criterion, use_param_hook=False):
    """Run one forward/backward pass, optionally counting parameter-hook calls.

    Args:
        model: Module to run. Its gradients are reset to ``None`` before the pass.
        inputs: Batch passed to ``model``.
        label: Targets, given to ``criterion`` or, when ``criterion`` is falsy,
            to ``model`` itself as the second argument.
        criterion: Loss callable applied as ``criterion(output, label)``. When
            falsy, ``model(inputs, label)`` is expected to return the loss.
        use_param_hook: When truthy, a counting hook is registered through
            ``BaseParamHookMgr`` over all of the model's parameters for the
            duration of the pass.

    Returns:
        The number of hook invocations when ``use_param_hook`` is truthy,
        otherwise ``None``.
    """
    hook_mgr = None
    hook_triggered_times = 0

    if use_param_hook:

        # The original annotation `torch.Tensor or None` is evaluated as a
        # boolean expression and collapses to `torch.Tensor`; state that directly.
        def hook(param, grad) -> torch.Tensor:
            nonlocal hook_triggered_times
            hook_triggered_times += 1
            # Hand the gradient back untouched so results match the hook-free run.
            return grad

        hook_mgr = BaseParamHookMgr(list(model.parameters()))
        hook_mgr.register_backward_hooks(hook)

    try:
        model.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast():
            if criterion:
                y = model(inputs)
                loss = criterion(y, label)
            else:
                loss = model(inputs, label)
            loss = loss.float()
        loss.backward()
    finally:
        # Always detach the hooks, even if the forward/backward pass raised,
        # so a failure here does not leave hooks registered on the model.
        if hook_mgr is not None:
            hook_mgr.remove_hooks()

    if use_param_hook:
        return hook_triggered_times
    return None


def test_base_param_hook():
    """Check that parameter hooks fire once per parameter and leave gradients intact.

    For every listed test model, the first batch of its dataloader is run
    through the model without hooks and through a deep copy with hooks. The
    hook count must equal the number of parameters, and both runs must yield
    matching gradients. The model and batch are moved to CUDA, so a CUDA
    device is required.
    """
    test_models = ['repeated_computed_layers', 'resnet18', 'no_leaf_module', 'inline_op_model']

    for model_name in test_models:
        get_components_func = non_distributed_component_funcs.get_callable(model_name)
        model_builder, train_dataloader, _, _, criterion = get_components_func()

        torch.manual_seed(0)
        model = model_builder(checkpoint=True).cuda()
        model.train()

        # Only the first batch is needed. Taking it directly avoids fetching a
        # second batch just to break out of a loop, and fails loudly here
        # (StopIteration) instead of with a NameError below if the loader is empty.
        inputs, label = next(iter(train_dataloader))
        model_copy = copy.deepcopy(model)

        run_model(model, inputs.cuda(), label.cuda(), criterion, False)
        hook_calls = run_model(model_copy, inputs.cuda(), label.cuda(), criterion, True)

        # Make sure the param hook is fired only once per parameter, even when
        # parameters are shared.
        num_params = len(list(model.parameters()))
        assert hook_calls == num_params, f"{hook_calls} hook calls vs {num_params} parameters"

        for p, p_copy in zip(model.parameters(), model_copy.parameters()):
            assert allclose(p.grad, p_copy.grad), f"{p.grad} vs {p_copy.grad}"


# Allow running this test directly as a script, without a test runner.
if __name__ == '__main__':
    test_base_param_hook()
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00			`import copy`

[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`import torch`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`from colossalai.gemini.paramhooks import BaseParamHookMgr`
			`from tests.components_to_test.registry import non_distributed_component_funcs`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00

			`def allclose(tensor_a: torch.Tensor, tensor_b: torch.Tensor, loose=False) -> bool:`
			`if loose:`
			`return torch.allclose(tensor_a, tensor_b, atol=1e-3, rtol=1e-3)`
			`return torch.allclose(tensor_a, tensor_b)`


[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`def run_model(model, inputs, label, criterion, use_param_hook=False):`
			`if use_param_hook:`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`class HooKWrapper:`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`def __init__(self) -> None:`
			`self.hook_triggered_times = 0`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`def wrapper_func(self):`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`def hook(param, grad) -> torch.Tensor or None:`
			`self.hook_triggered_times += 1`
			`return grad`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`return hook`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`hookwrapper = HooKWrapper()`
			`param_list = [p for p in model.parameters()]`
			`hook_mgr = BaseParamHookMgr(param_list)`
			`hook_mgr.register_backward_hooks(hookwrapper.wrapper_func())`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`model.zero_grad(set_to_none=True)`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`with torch.cuda.amp.autocast():`
			`if criterion:`
			`y = model(inputs)`
			`loss = criterion(y, label)`
			`else:`
			`loss = model(inputs, label)`
			`loss = loss.float()`
			`loss.backward()`

			`if use_param_hook:`
			`hook_mgr.remove_hooks()`
			`return hookwrapper.hook_triggered_times`


			`def test_base_param_hook():`
[Gemini] add unitests to check gemini correctness (#2015) 2022-11-24 08:51:45 +00:00			`test_models = ['repeated_computed_layers', 'resnet18', 'no_leaf_module', 'inline_op_model']`
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`# test_models = ['bert']`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`for model_name in test_models:`
			`get_components_func = non_distributed_component_funcs.get_callable(model_name)`
			`model_builder, train_dataloader, _, _, criterion = get_components_func()`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`torch.manual_seed(0)`
			`model = model_builder(checkpoint=True).cuda()`
			`model.train()`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`for i, (inputs, label) in enumerate(train_dataloader):`
			`if i > 0:`
			`break`
			`model_copy = copy.deepcopy(model)`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`run_model(model, inputs.cuda(), label.cuda(), criterion, False)`
			`ret2 = run_model(model_copy, inputs.cuda(), label.cuda(), criterion, True)`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`# Make sure param hook has only be fired once in case of parameter sharing`
			`assert ret2 == len(list(model.parameters()))`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00
[Gemini] add an inline_op_module to common test models and polish unitests. (#2004) 2022-11-23 08:55:54 +00:00			`for p, p_copy in zip(model.parameters(), model_copy.parameters()):`
			`assert allclose(p.grad, p_copy.grad), f"{p.grad} vs {p_copy.grad}"`
[hotfix] add missing file (#1308) 2022-07-14 06:43:15 +00:00

			`if __name__ == '__main__':`
			`test_base_param_hook()`