From 537e181705802a1f52248308d5dd6e087543ff77 Mon Sep 17 00:00:00 2001
From: HELSON
Date: Tue, 29 Nov 2022 13:42:06 +0800
Subject: [PATCH] [testing] fix testing models (#2036)

* [testing] fix testing models

* roll back
---
 tests/components_to_test/bert.py              | 11 ++-
 tests/components_to_test/inline_op_model.py   |  2 +-
 tests/components_to_test/nested_model.py      |  6 +-
 tests/components_to_test/no_leaf_module.py    | 94 ++++++++++---------
 .../repeated_computed_layer.py                |  6 +-
 tests/components_to_test/simple_net.py        | 10 +-
 tests/test_gemini/update/test_fwd_bwd.py      |  9 +-
 7 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/tests/components_to_test/bert.py b/tests/components_to_test/bert.py
index e8d202b69..3293de7de 100644
--- a/tests/components_to_test/bert.py
+++ b/tests/components_to_test/bert.py
@@ -8,6 +8,7 @@ from .registry import non_distributed_component_funcs
 
 
 def get_bert_data_loader(
+    n_class,
     batch_size,
     total_samples,
     sequence_length,
@@ -16,7 +17,7 @@
 ):
     train_data = torch.randint(
         low=0,
-        high=1000,
+        high=n_class,
         size=(total_samples, sequence_length),
         device=device,
         dtype=torch.long,
@@ -37,7 +38,7 @@ def get_training_components():
     num_head = 4
     sequence_length = 12
     num_layer = 2
-    vocab_size = 30524
+    vocab_size = 32
 
     def bert_model_builder(checkpoint):
         config = BertConfig(vocab_size=vocab_size,
@@ -67,11 +68,13 @@
 
         return model
 
-    trainloader = get_bert_data_loader(batch_size=2,
+    trainloader = get_bert_data_loader(n_class=vocab_size,
+                                       batch_size=2,
                                        total_samples=10000,
                                        sequence_length=sequence_length,
                                        is_distrbuted=True)
-    testloader = get_bert_data_loader(batch_size=2,
+    testloader = get_bert_data_loader(n_class=vocab_size,
+                                      batch_size=2,
                                       total_samples=10000,
                                       sequence_length=sequence_length,
                                       is_distrbuted=True)
diff --git a/tests/components_to_test/inline_op_model.py b/tests/components_to_test/inline_op_model.py
index 92ccb73a7..f061d48f9 100644
--- a/tests/components_to_test/inline_op_model.py
+++ b/tests/components_to_test/inline_op_model.py
@@ -41,7 +41,7 @@ class DummyDataLoader(DummyDataGenerator):
 @non_distributed_component_funcs.register(name='inline_op_model')
 def get_training_components():
 
-    def model_builder(checkpoint=True):
+    def model_builder(checkpoint=False):
         return InlineOpModule(checkpoint)
 
     trainloader = DummyDataLoader()
diff --git a/tests/components_to_test/nested_model.py b/tests/components_to_test/nested_model.py
index 26bfb8ecc..339084639 100644
--- a/tests/components_to_test/nested_model.py
+++ b/tests/components_to_test/nested_model.py
@@ -1,9 +1,11 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+
 from colossalai.nn import CheckpointModule
-from .utils import DummyDataGenerator
+
 from .registry import non_distributed_component_funcs
+from .utils import DummyDataGenerator
 
 
 class SubNet(nn.Module):
@@ -43,7 +45,7 @@ class DummyDataLoader(DummyDataGenerator):
 @non_distributed_component_funcs.register(name='nested_model')
 def get_training_components():
 
-    def model_builder(checkpoint=True):
+    def model_builder(checkpoint=False):
         return NestedNet(checkpoint)
 
     trainloader = DummyDataLoader()
diff --git a/tests/components_to_test/no_leaf_module.py b/tests/components_to_test/no_leaf_module.py
index 28a212f96..47dcecd36 100644
--- a/tests/components_to_test/no_leaf_module.py
+++ b/tests/components_to_test/no_leaf_module.py
@@ -1,46 +1,48 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from colossalai.nn import CheckpointModule
-from .utils.dummy_data_generator import DummyDataGenerator
-from .registry import non_distributed_component_funcs
-
-
-class NoLeafModule(CheckpointModule):
-    """
-    In this no-leaf module, it has subordinate nn.modules and a nn.Parameter.
-    """
-
-    def __init__(self, checkpoint=False) -> None:
-        super().__init__(checkpoint=checkpoint)
-        self.proj1 = nn.Linear(4, 8)
-        self.weight = nn.Parameter(torch.randn(8, 8))
-        self.proj2 = nn.Linear(8, 4)
-
-    def forward(self, x):
-        x = self.proj1(x)
-        x = F.linear(x, self.weight)
-        x = self.proj2(x)
-        return x
-
-
-class DummyDataLoader(DummyDataGenerator):
-
-    def generate(self):
-        data = torch.rand(16, 4)
-        label = torch.randint(low=0, high=2, size=(16,))
-        return data, label
-
-
-@non_distributed_component_funcs.register(name='no_leaf_module')
-def get_training_components():
-
-    def model_builder(checkpoint=True):
-        return NoLeafModule(checkpoint)
-
-    trainloader = DummyDataLoader()
-    testloader = DummyDataLoader()
-
-    criterion = torch.nn.CrossEntropyLoss()
-    from colossalai.nn.optimizer import HybridAdam
-    return model_builder, trainloader, testloader, HybridAdam, criterion
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from colossalai.nn import CheckpointModule
+
+from .registry import non_distributed_component_funcs
+from .utils.dummy_data_generator import DummyDataGenerator
+
+
+class NoLeafModule(CheckpointModule):
+    """
+    In this no-leaf module, it has subordinate nn.modules and a nn.Parameter.
+    """
+
+    def __init__(self, checkpoint=False) -> None:
+        super().__init__(checkpoint=checkpoint)
+        self.proj1 = nn.Linear(4, 8)
+        self.weight = nn.Parameter(torch.randn(8, 8))
+        self.proj2 = nn.Linear(8, 4)
+
+    def forward(self, x):
+        x = self.proj1(x)
+        x = F.linear(x, self.weight)
+        x = self.proj2(x)
+        return x
+
+
+class DummyDataLoader(DummyDataGenerator):
+
+    def generate(self):
+        data = torch.rand(16, 4)
+        label = torch.randint(low=0, high=2, size=(16,))
+        return data, label
+
+
+@non_distributed_component_funcs.register(name='no_leaf_module')
+def get_training_components():
+
+    def model_builder(checkpoint=False):
+        return NoLeafModule(checkpoint)
+
+    trainloader = DummyDataLoader()
+    testloader = DummyDataLoader()
+
+    criterion = torch.nn.CrossEntropyLoss()
+    from colossalai.nn.optimizer import HybridAdam
+    return model_builder, trainloader, testloader, HybridAdam, criterion
diff --git a/tests/components_to_test/repeated_computed_layer.py b/tests/components_to_test/repeated_computed_layer.py
index f70910191..b3f84bd0e 100644
--- a/tests/components_to_test/repeated_computed_layer.py
+++ b/tests/components_to_test/repeated_computed_layer.py
@@ -2,9 +2,11 @@
 
 import torch
 import torch.nn as nn
+
 from colossalai.nn import CheckpointModule
-from .utils.dummy_data_generator import DummyDataGenerator
+
 from .registry import non_distributed_component_funcs
+from .utils.dummy_data_generator import DummyDataGenerator
 
 
 class NetWithRepeatedlyComputedLayers(CheckpointModule):
@@ -37,7 +39,7 @@ class DummyDataLoader(DummyDataGenerator):
 @non_distributed_component_funcs.register(name='repeated_computed_layers')
 def get_training_components():
 
-    def model_builder(checkpoint=True):
+    def model_builder(checkpoint=False):
         return NetWithRepeatedlyComputedLayers(checkpoint)
 
     trainloader = DummyDataLoader()
diff --git a/tests/components_to_test/simple_net.py b/tests/components_to_test/simple_net.py
index fd4988d9e..cd9d7ebc0 100644
--- a/tests/components_to_test/simple_net.py
+++ b/tests/components_to_test/simple_net.py
@@ -1,10 +1,13 @@
 import torch
 import torch.nn as nn
+
 from colossalai.nn import CheckpointModule
-from .utils.dummy_data_generator import DummyDataGenerator
-from .registry import non_distributed_component_funcs
 from colossalai.utils.cuda import get_current_device
 
+from .registry import non_distributed_component_funcs
+from .utils.dummy_data_generator import DummyDataGenerator
+
+
 class SimpleNet(CheckpointModule):
     """
     In this no-leaf module, it has subordinate nn.modules and a nn.Parameter.
@@ -29,7 +32,6 @@ class SimpleNet(CheckpointModule):
         return x
 
 
-
 class DummyDataLoader(DummyDataGenerator):
 
     def generate(self):
@@ -41,7 +43,7 @@ class DummyDataLoader(DummyDataGenerator):
 @non_distributed_component_funcs.register(name='simple_net')
 def get_training_components():
 
-    def model_builder(checkpoint=True):
+    def model_builder(checkpoint=False):
         return SimpleNet(checkpoint)
 
     trainloader = DummyDataLoader()
diff --git a/tests/test_gemini/update/test_fwd_bwd.py b/tests/test_gemini/update/test_fwd_bwd.py
index b1a71502b..ef2e59e43 100644
--- a/tests/test_gemini/update/test_fwd_bwd.py
+++ b/tests/test_gemini/update/test_fwd_bwd.py
@@ -4,6 +4,7 @@ import pytest
 import torch
 import torch.multiprocessing as mp
 from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.testing import assert_close
 
 import colossalai
 from colossalai.amp import convert_to_apex_amp
@@ -28,7 +29,7 @@ def check_grad(model: ZeroDDP, torch_model: torch.nn.Module):
         chunk_manager.access_chunk(chunk)
 
     for (p0, p1) in zip(model.parameters(), torch_model.parameters()):
-        assert torch.allclose(p0, p1.grad, atol=1e-3, rtol=1e-5), "{}".format(torch.max(torch.abs(p0 - p1.grad)).item())
+        assert_close(p0, p1.grad, rtol=1e-3, atol=5e-5)
 
 
 @parameterize('placement_policy', ['cuda', 'cpu', 'auto', 'const'])
@@ -74,10 +75,8 @@ def exam_gpt_fwd_bwd(placement_policy, keep_gather, model_name: str, use_grad_ch
     torch_loss = run_fwd_bwd(torch_model, input_ids.cuda(), label.cuda(), criterion, use_init_ctx=False)
     loss = run_fwd_bwd(model, input_ids.cuda(), label.cuda(), criterion, use_init_ctx=True)
 
-    assert torch.allclose(loss, torch_loss, rtol=1e-2), "{} {} {}".format(
-        torch.max(torch.abs(loss - torch_loss)).item(), loss, torch_loss)
+    assert torch.equal(torch_loss, loss)
 
-    # FIXME(1SAA) bert and resnet18 can not pass the check_grad
     check_grad(model, torch_model)
 
 
@@ -96,4 +95,4 @@
 
 
 if __name__ == '__main__':
-    test_gpt(1)
+    test_gpt(4)
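
Note on the bert.py change: the new n_class argument ties the randomly sampled
token ids to the model's vocabulary. With the patched vocab_size of 32, the old
hard-coded high=1000 could sample ids far outside the embedding table. A minimal
sketch of the failure mode, not part of the patch; the values 32 and the (2, 12)
shape mirror the test config in the diff, everything else is made up here:

    import torch
    import torch.nn as nn

    vocab_size = 32    # matches the patched test config
    embed = nn.Embedding(vocab_size, 8)

    # Sampling with high=vocab_size keeps every id in [0, vocab_size): always safe.
    good_ids = torch.randint(low=0, high=vocab_size, size=(2, 12), dtype=torch.long)
    embed(good_ids)

    # Sampling with high=1000 (the old bound) can produce ids >= 32;
    # embed(bad_ids) would then raise an "index out of range" IndexError.
    bad_ids = torch.randint(low=0, high=1000, size=(2, 12), dtype=torch.long)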
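
Note on the test_fwd_bwd.py change: the gradient check now uses
torch.testing.assert_close instead of a hand-rolled torch.allclose assertion. A
minimal sketch of the difference, assuming a PyTorch version that ships
torch.testing.assert_close (1.9 or later); the tensors are made up here:

    import torch
    from torch.testing import assert_close

    a = torch.tensor([1.0000, 2.0000])
    b = torch.tensor([1.0002, 2.0001])

    # torch.allclose only returns a bool, so a useful failure message must be
    # formatted by hand, as the deleted assertion did.
    assert torch.allclose(a, b, rtol=1e-3, atol=5e-5)

    # assert_close raises an AssertionError that already reports the mismatched
    # elements and the greatest absolute and relative differences.
    assert_close(a, b, rtol=1e-3, atol=5e-5)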