From 50ec3a7e06f7079fd7a8bd045be4f295c8f551a9 Mon Sep 17 00:00:00 2001
From: Frank Lee
Date: Thu, 9 Jun 2022 17:19:13 +0800
Subject: [PATCH] [test] skip tests when not enough GPUs are detected (#1090)

* [test] skip tests when not enough GPUs are detected

* polish code

* polish code
---
 colossalai/testing/__init__.py                |  4 +--
 colossalai/testing/utils.py                   | 29 +++++++++++++++++++
 .../test_cifar_with_data_pipeline_tensor.py   |  4 +--
 tests/test_layers/test_3d/test_3d.py          |  5 ++--
 .../test_checkpoint/test_checkpoint_1d.py     |  4 +--
 .../test_checkpoint/test_checkpoint_2d.py     |  4 +--
 .../test_checkpoint/test_checkpoint_2p5d.py   |  5 ++--
 .../test_checkpoint/test_checkpoint_3d.py     |  5 ++--
 8 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/colossalai/testing/__init__.py b/colossalai/testing/__init__.py
index d8eecbb09..e3dd500de 100644
--- a/colossalai/testing/__init__.py
+++ b/colossalai/testing/__init__.py
@@ -1,7 +1,7 @@
 from .comparison import assert_equal, assert_not_equal, assert_close, assert_close_loose, assert_equal_in_group
-from .utils import parameterize, rerun_on_exception, rerun_if_address_is_in_use
+from .utils import parameterize, rerun_on_exception, rerun_if_address_is_in_use, skip_if_not_enough_gpus
 
 __all__ = [
     'assert_equal', 'assert_not_equal', 'assert_close', 'assert_close_loose', 'assert_equal_in_group', 'parameterize',
-    'rerun_on_exception', 'rerun_if_address_is_in_use'
+    'rerun_on_exception', 'rerun_if_address_is_in_use', 'skip_if_not_enough_gpus'
 ]
diff --git a/colossalai/testing/utils.py b/colossalai/testing/utils.py
index 50566d2a8..4f0c2beee 100644
--- a/colossalai/testing/utils.py
+++ b/colossalai/testing/utils.py
@@ -172,3 +172,32 @@ def rerun_if_address_is_in_use():
 
     func_wrapper = rerun_on_exception(exception_type=exception, pattern=".*Address already in use.*")
     return func_wrapper
+
+
+def skip_if_not_enough_gpus(min_gpus: int):
+    """
+    This decorator checks the number of available GPUs on the system and automatically
+    skips test cases that require more GPUs than are available.
+
+    Note:
+        If fewer than `min_gpus` GPUs are detected, the wrapped test function is not executed.
+
+    Usage:
+        @skip_if_not_enough_gpus(min_gpus=8)
+        def test_something():
+            # will be skipped if there are fewer than 8 GPUs available
+            do_something()
+
+    Args:
+        min_gpus (int): the minimum number of GPUs required to run this test.
+    """
+
+    def _wrap_func(f):
+        def _execute_by_gpu_num(*args, **kwargs):
+            num_avail_gpu = torch.cuda.device_count()
+            if num_avail_gpu >= min_gpus:
+                f(*args, **kwargs)
+        return _execute_by_gpu_num
+
+    return _wrap_func
+
diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
index dad9f03ba..415b2ddc7 100644
--- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
+++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py
@@ -10,7 +10,7 @@ import torch.multiprocessing as mp
 from colossalai.amp import AMP_TYPE
 from colossalai.trainer import Trainer, hooks
 from colossalai.context import ParallelMode
-from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus
 from colossalai.utils import free_port
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
@@ -83,7 +83,7 @@ def run_trainer(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test requires 8 GPUs to execute")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_if_address_is_in_use()
 def test_hybrid_parallel():
     world_size = 8
diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_layers/test_3d/test_3d.py
index 063d69873..9d74c8a57 100644
--- a/tests/test_layers/test_3d/test_3d.py
+++ b/tests/test_layers/test_3d/test_3d.py
@@ -9,7 +9,7 @@ from colossalai.core import global_context as gpc
 from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port
-from colossalai.testing import rerun_if_address_is_in_use
+from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus
 from checks_3d.check_layer_3d import (check_classifier_given_embed_weight, check_classifier_no_given_weight,
                                       check_embed, check_layernorm, check_linear, check_loss, check_patch_embed,
                                       check_vocab_parallel_classifier_given_embed_weight,
@@ -38,7 +38,6 @@ def check_layer():
     check_loss()
     check_vocab_parallel_loss()
 
-
 def check_layer_and_operation(rank, world_size, port):
     disable_existing_loggers()
     launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
@@ -51,7 +50,7 @@ def check_layer_and_operation(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test requires 8 GPUs to execute")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_if_address_is_in_use()
 def test_3d():
     world_size = 8
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
index f8d1942de..8a5ec409b 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_1d.py
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -67,7 +67,7 @@ def check_checkpoint_1d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked with 8 GPUs")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_1d():
     world_size = 8
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
index a106a82b7..bd5c46237 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_2d.py
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, get_current_device, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -67,7 +67,7 @@ def check_checkpoint_2d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked with 8 GPUs")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2d():
     world_size = 8
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
index 50495a582..79dae487b 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, get_current_device, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -37,7 +37,6 @@ def build_pipeline(model):
 def check_equal(A, B):
     assert torch.allclose(A, B, rtol=1e-3, atol=1e-2)
 
-
 def check_checkpoint_2p5d(rank, world_size, port):
     config = dict(parallel=dict(pipeline=dict(size=2), tensor=dict(size=4, depth=1, mode="2.5d")),)
 
@@ -67,7 +66,7 @@ def check_checkpoint_2p5d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test should be invoked with 8 GPUs")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2p5d():
     world_size = 8
diff --git a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
index 4ea6e6d39..d2d938c04 100644
--- a/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
+++ b/tests/test_utils/test_checkpoint/test_checkpoint_3d.py
@@ -15,7 +15,7 @@ from colossalai.initialize import launch
 from colossalai.logging import disable_existing_loggers
 from colossalai.utils import free_port, get_current_device, is_using_pp
 from colossalai.utils.checkpointing import gather_pipeline_parallel_state_dict, load_checkpoint, save_checkpoint
-from colossalai.testing import rerun_on_exception
+from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
 
 
 def build_pipeline(model):
@@ -37,7 +37,6 @@ def build_pipeline(model):
 def check_equal(A, B):
     assert torch.allclose(A, B, rtol=1e-3, atol=1e-2)
 
-
 def check_checkpoint_3d(rank, world_size, port):
     config = dict(parallel=dict(pipeline=dict(size=1), tensor=dict(size=8, mode="3d")),)
 
@@ -67,7 +66,7 @@ def check_checkpoint_3d(rank, world_size, port):
 
 
 @pytest.mark.dist
-@pytest.mark.skip("This test requires 8 GPUs to execute")
+@skip_if_not_enough_gpus(min_gpus=8)
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_3d():
     world_size = 8
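
For reference, a minimal usage sketch (not part of the patch itself) showing how the new decorator composes with the existing testing utilities, mirroring the pattern the patch applies to the 8-GPU tests above. The test name and body are hypothetical; `pytest.mark.dist` is the project's existing marker for distributed tests.

import pytest
import torch

from colossalai.testing import rerun_if_address_is_in_use, skip_if_not_enough_gpus


@pytest.mark.dist
@skip_if_not_enough_gpus(min_gpus=8)
@rerun_if_address_is_in_use()
def test_needs_eight_gpus():
    # Hypothetical test body. The decorator calls this function only when
    # torch.cuda.device_count() >= 8; on machines with fewer GPUs the call
    # is a no-op, so the multi-GPU work below never starts.
    assert torch.cuda.device_count() >= 8

Note that the wrapper simply returns without calling the test function (it does not invoke pytest.skip), so pytest reports such a test as passed rather than skipped; the decorator's purpose is only to avoid launching multi-GPU tests on machines that cannot run them.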