diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bcece5ab3..e7056383d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:1.10.1-11.3.0 - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 + options: --shm-size=2gb --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 timeout-minutes: 40 steps: - uses: actions/checkout@v2 @@ -25,10 +25,10 @@ jobs: run: | [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/ pip install -r requirements/requirements.txt - pip install -r requirements/requirements-test.txt pip install -v -e . cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/ cp /__w/ColossalAI/ColossalAI/*.so /github/home/cuda_ext_cache/ + pip install -r requirements/requirements-test.txt - name: Unit Testing run: | PYTHONPATH=$PWD pytest tests diff --git a/tests/components_to_test/gpt.py b/tests/components_to_test/gpt.py index 4d72180d8..a0d70a2fd 100644 --- a/tests/components_to_test/gpt.py +++ b/tests/components_to_test/gpt.py @@ -7,9 +7,9 @@ from colossalai.utils.cuda import get_current_device class DummyDataLoader(DummyDataGenerator): - vocab_size = 50304 + vocab_size = 128 batch_size = 4 - seq_len = 1024 + seq_len = 64 def generate(self): input_ids = torch.randint(0, @@ -47,6 +47,8 @@ class GPTLMModel(nn.Module): # Only return lm_logits return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0] +def gpt2_micro(checkpoint=True): + return GPTLMModel(checkpoint=checkpoint, hidden_size=32, num_layers=2, num_attention_heads=4, max_seq_len=64, vocab_size=128) def gpt2_s(checkpoint=True): return GPTLMModel(checkpoint=checkpoint) @@ -76,4 +78,4 @@ def get_training_components(): testloader = DummyDataLoader() criterion = GPTLMLoss() - return gpt2_s, trainloader, testloader, torch.optim.Adam, criterion + return gpt2_micro, trainloader, testloader, torch.optim.Adam, criterion diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py index 69fd1ee2d..dad9f03ba 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py +++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py @@ -83,6 +83,7 @@ def run_trainer(rank, world_size, port): @pytest.mark.dist +@pytest.mark.skip("This test requires 8 GPUs to execute") @rerun_if_address_is_in_use() def test_hybrid_parallel(): world_size = 8 diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_layers/test_3d/test_3d.py index 7962d7dca..063d69873 100644 --- a/tests/test_layers/test_3d/test_3d.py +++ b/tests/test_layers/test_3d/test_3d.py @@ -51,6 +51,7 @@ def check_layer_and_operation(rank, world_size, port): @pytest.mark.dist +@pytest.mark.skip("This test requires 8 GPUs to execute") @rerun_if_address_is_in_use() def test_3d(): world_size = 8 diff --git a/tests/test_tensor/test_model.py b/tests/test_tensor/test_model.py index 65f633a53..becd08501 100644 --- a/tests/test_tensor/test_model.py +++ b/tests/test_tensor/test_model.py @@ -328,7 +328,6 @@ def run_model_dist(rank, world_size, port): @pytest.mark.dist @pytest.mark.parametrize('world_size', [1, 4]) -# @parameterize('world_size', [1, 4]) @rerun_if_address_is_in_use() def test_model(world_size): run_func = partial(run_model_dist, world_size=world_size, port=free_port()) diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py index 0717c118b..f8d1942de 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py @@ -67,6 +67,7 @@ def check_checkpoint_1d(rank, world_size, port): @pytest.mark.dist +@pytest.mark.skip("This test should be invoked with 8 GPUs") @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*") def test_checkpoint_1d(): world_size = 8 diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py index 42b39b91e..a106a82b7 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py @@ -67,6 +67,7 @@ def check_checkpoint_2d(rank, world_size, port): @pytest.mark.dist +@pytest.mark.skip("This test should be invoked with 8 GPUs") @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*") def test_checkpoint_2d(): world_size = 8 diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py index 7634a9706..50495a582 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py @@ -67,6 +67,7 @@ def check_checkpoint_2p5d(rank, world_size, port): @pytest.mark.dist +@pytest.mark.skip("This test should be invoked with 8 GPUs") @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*") def test_checkpoint_2p5d(): world_size = 8 diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py index 740f3cfbd..4ea6e6d39 100644 --- a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py +++ b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py @@ -67,6 +67,7 @@ def check_checkpoint_3d(rank, world_size, port): @pytest.mark.dist +@pytest.mark.skip("This test requires 8 GPUs to execute") @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*") def test_checkpoint_3d(): world_size = 8 diff --git a/tests/test_utils/test_memory.py b/tests/test_utils/test_memory.py index a6ff7c9a1..46a5aeba5 100644 --- a/tests/test_utils/test_memory.py +++ b/tests/test_utils/test_memory.py @@ -22,7 +22,7 @@ def run_dist(rank, world_size, port): @pytest.mark.dist -@pytest.mark.parametrize("world_size", [4, 5]) +@pytest.mark.parametrize("world_size", [3, 4]) def test_memory_utils(world_size): run_func = partial(run_dist, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_zero/test_tensor_utils.py b/tests/test_zero/test_tensor_utils.py index 93f6c9878..81855ff5e 100644 --- a/tests/test_zero/test_tensor_utils.py +++ b/tests/test_zero/test_tensor_utils.py @@ -85,7 +85,7 @@ def run_dist(rank, world_size, port): @pytest.mark.dist -@pytest.mark.parametrize("world_size", [4, 5]) +@pytest.mark.parametrize("world_size", [2, 4]) @rerun_if_address_is_in_use() def test_zero_tensor_utils(world_size): run_func = partial(run_dist, world_size=world_size, port=free_port())