From 26e29d58f0525ff573d6a2eeae328e0a4d7f9a68 Mon Sep 17 00:00:00 2001
From: Hongxin Liu <lhx0217@gmail.com>
Date: Wed, 16 Aug 2023 18:56:52 +0800
Subject: [PATCH] [devops] add large-scale distributed test marker (#4452)

* [test] remove cpu marker

* [test] remove gpu marker

* [test] update pytest markers (drop cpu/gpu/experiment, add largedist, reword dist)

* [ci] update unit test ci (skip largedist tests on PR builds, run compatibility tests on 8-gpu runners)
---
 .github/workflows/build_on_pr.yml             |   2 +-
 .../compatiblity_test_on_dispatch.yml         |   2 +-
 .github/workflows/compatiblity_test_on_pr.yml |   2 +-
 .../compatiblity_test_on_schedule.yml         |   2 +-
 applications/Chat/tests/test_dataset.py       |  79 ++++++-------
 applications/Chat/tests/test_models.py        | 105 +++++++-----------
 pytest.ini                                    |   6 +-
 tests/test_config/test_load_config.py         |   1 -
 tests/test_context/test_hybrid_parallel.py    |   1 -
 tests/test_data/test_cifar10_dataset.py       |   3 +-
 tests/test_data/test_data_parallel_sampler.py |   1 -
 .../test_deterministic_dataloader.py          |   1 -
 .../test_activation_checkpointing.py          |   1 -
 13 files changed, 81 insertions(+), 125 deletions(-)

diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 8a1bc8e11..4c7e08e57 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -208,7 +208,7 @@ jobs:
 
       - name: Execute Unit Testing
         run: |
-          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest --testmon --testmon-cov=. --durations=10 tests/
+          CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-cov=. --durations=10 tests/
         env:
           DATA: /data/scratch/cifar-10
           NCCL_SHM_DISABLE: 1
diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml
index 1778d64ee..63c0fbbb9 100644
--- a/.github/workflows/compatiblity_test_on_dispatch.yml
+++ b/.github/workflows/compatiblity_test_on_dispatch.yml
@@ -44,7 +44,7 @@ jobs:
     name: Test for PyTorch Compatibility
     needs: matrix_preparation
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
+    runs-on: [self-hosted, 8-gpu]
     strategy:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml
index c0f45c65a..c9f84806b 100644
--- a/.github/workflows/compatiblity_test_on_pr.yml
+++ b/.github/workflows/compatiblity_test_on_pr.yml
@@ -35,7 +35,7 @@ jobs:
     name: Test for PyTorch Compatibility
     needs: matrix_preparation
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
+    runs-on: [self-hosted, 8-gpu]
     strategy:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml
index 15ac4f1a9..3f8fc9639 100644
--- a/.github/workflows/compatiblity_test_on_schedule.yml
+++ b/.github/workflows/compatiblity_test_on_schedule.yml
@@ -32,7 +32,7 @@ jobs:
     name: Test for PyTorch Compatibility
     needs: matrix_preparation
     if: github.repository == 'hpcaitech/ColossalAI'
-    runs-on: [self-hosted, gpu]
+    runs-on: [self-hosted, 8-gpu]
     strategy:
       fail-fast: false
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
diff --git a/applications/Chat/tests/test_dataset.py b/applications/Chat/tests/test_dataset.py
index 64ea1178c..1d9aa50e2 100644
--- a/applications/Chat/tests/test_dataset.py
+++ b/applications/Chat/tests/test_dataset.py
@@ -14,29 +14,43 @@ from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
 
 SFT_DATASET = [
     {
-        "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
-        "input": "",
-        "output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
-        "id": 0
+        "instruction":
+            "Provide a list of the top 10 most popular mobile games in Asia",
+        "input":
+            "",
+        "output":
+            "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
+        "id":
+            0
     },
     {
-        "instruction": "Please provide an action plan for reducing carbon footprint on a corporate level",
-        "input": "",
-        "output": "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
-        "id": 1
+        "instruction":
+            "Please provide an action plan for reducing carbon footprint on a corporate level",
+        "input":
+            "",
+        "output":
+            "An action plan for reducing carbon footprint on a corporate level could include switching to renewable energy sources (e.g., solar or wind powered), investing in energy efficient infrastructure, implementing green production practices (e.g., reduce water and paper usage, dispose of materials responsibly, recycle/reuse materials), reducing or eliminating travel, encouraging employees to be conscious of and reduce their own energy usage, and incentivizing employees to participate in sustainable work practices.",
+        "id":
+            1
     },
     {
-        "instruction": "Write a persuasive email to your boss explaining why you should have a pay raise",
-        "input": "",
-        "output": "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
-        "id": 2
+        "instruction":
+            "Write a persuasive email to your boss explaining why you should have a pay raise",
+        "input":
+            "",
+        "output":
+            "Dear [Boss], \n\nI hope this message finds you well. I am writing to request a pay raise.\n\nAs you know, I have been a dedicated and hardworking member of the team since I started working here [insert number] of months/years ago. My enthusiasm and passion for my job has remained consistent over the years, and I have always given 100% to my role. \n\nI understand that the current financial situation is challenging, however, I would sincerely appreciate you taking the time to consider my request. I believe that my dedication to the job and the value that I bring to the organization warrants a raise. I work diligently and am confident that I can continue to be an asset to the company. \n\nI hope my request is taken into account and I thank you in advance for your understanding. I look forward to our conversation. \n\nSincerely,\n[Your Name]",
+        "id":
+            2
     },
 ]
 
 PROMPT_DATASET = [
     {
-        "instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
-        "id": 0
+        "instruction":
+            "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
+        "id":
+            0
     },
     {
         "instruction": "Write a descriptive paragraph about a memorable vacation you went on",
@@ -71,9 +85,7 @@ def make_tokenizer(model: str):
     return tokenizer
 
 
-def check_content(input_ids_stripped: torch.Tensor,
-                  tokenizer: PreTrainedTokenizer,
-                  model: str):
+def check_content(input_ids_stripped: torch.Tensor, tokenizer: PreTrainedTokenizer, model: str):
     if model == "opt":
         # NOTE:  Contrary to GPT2, OPT adds the EOS token </s> to the beginning of every prompt.
         assert input_ids_stripped[0] == tokenizer.eos_token_id
@@ -90,13 +102,10 @@ def check_content(input_ids_stripped: torch.Tensor,
     assert input_ids_stripped != tokenizer.mask_token_id
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
 @pytest.mark.parametrize("max_length", [32, 1024])
 @pytest.mark.parametrize("max_datasets_size", [2])
-def test_prompt_dataset(model: str,
-                        max_datasets_size: int,
-                        max_length: int):
+def test_prompt_dataset(model: str, max_datasets_size: int, max_length: int):
     with tempfile.TemporaryDirectory() as tmp_dir:
         dataset_name = "prompt_dataset.json"
         with open(os.path.join(tmp_dir, dataset_name), "w") as f:
@@ -119,19 +128,12 @@ def test_prompt_dataset(model: str,
             check_content(input_ids.masked_select(attention_mask), tokenizer, model)
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
-@pytest.mark.parametrize(["dataset_path", "subset"], [
-    ("Anthropic/hh-rlhf", "harmless-base"),
-    ("Dahoas/rm-static", None)
-])
+@pytest.mark.parametrize(["dataset_path", "subset"], [("Anthropic/hh-rlhf", "harmless-base"),
+                                                      ("Dahoas/rm-static", None)])
 @pytest.mark.parametrize("max_datasets_size", [32])
 @pytest.mark.parametrize("max_length", [32, 1024])
-def test_reward_dataset(model: str,
-                        dataset_path: str,
-                        subset: Optional[str],
-                        max_datasets_size: int,
-                        max_length: int):
+def test_reward_dataset(model: str, dataset_path: str, subset: Optional[str], max_datasets_size: int, max_length: int):
     data = load_dataset(dataset_path, data_dir=subset)
     assert max_datasets_size <= len(data["train"]) \
         and max_datasets_size <= len(data["test"])
@@ -188,15 +190,11 @@ def test_reward_dataset(model: str,
             assert torch.all(r_mask)
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("model", ["gpt2", "bloom", "opt", "llama"])
 @pytest.mark.parametrize("dataset_path", ["yizhongw/self_instruct", None])
 @pytest.mark.parametrize("max_dataset_size", [2])
 @pytest.mark.parametrize("max_length", [32, 1024])
-def test_sft_dataset(model: str,
-                     dataset_path: Optional[str],
-                     max_dataset_size: int,
-                     max_length: int):
+def test_sft_dataset(model: str, dataset_path: Optional[str], max_dataset_size: int, max_length: int):
     tokenizer = make_tokenizer(model)
     if dataset_path == "yizhongw/self_instruct":
         data = load_dataset(dataset_path, "super_natural_instructions")
@@ -232,10 +230,7 @@ def test_sft_dataset(model: str,
 
 
 if __name__ == "__main__":
-    test_sft_dataset(model="bloom",
-                     dataset_path="yizhongw/self_instruct",
-                     max_dataset_size=2,
-                     max_length=256)
+    test_sft_dataset(model="bloom", dataset_path="yizhongw/self_instruct", max_dataset_size=2, max_length=256)
 
     test_reward_dataset(model="gpt2",
                         dataset_path="Anthropic/hh-rlhf",
@@ -243,6 +238,4 @@ if __name__ == "__main__":
                         max_datasets_size=8,
                         max_length=256)
 
-    test_prompt_dataset(model="opt",
-                        max_datasets_size=2,
-                        max_length=128)
+    test_prompt_dataset(model="opt", max_datasets_size=2, max_length=128)
diff --git a/applications/Chat/tests/test_models.py b/applications/Chat/tests/test_models.py
index bd6b3e8a5..e96ff8bd7 100644
--- a/applications/Chat/tests/test_models.py
+++ b/applications/Chat/tests/test_models.py
@@ -15,16 +15,17 @@ from coati.models.opt import OPTRM, OPTActor, OPTCritic
 from coati.models.utils import calc_action_log_probs, compute_reward, masked_mean
 
 
-@pytest.mark.gpu
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seq_len", [32])
-@pytest.mark.parametrize("actor_maker", [
-    lambda: BLOOMActor(),
-    lambda: GPTActor(),
+@pytest.mark.parametrize(
+    "actor_maker",
+    [
+        lambda: BLOOMActor(),
+        lambda: GPTActor(),
     # HACK: skip llama due to long execution time
     # lambda: LlamaActor(),
-    lambda: OPTActor()
-])
+        lambda: OPTActor()
+    ])
 @pytest.mark.parametrize("generate_kwargs", [{
     "max_length": 64,
     "use_cache": True,
@@ -32,23 +33,15 @@ from coati.models.utils import calc_action_log_probs, compute_reward, masked_mea
     "temperature": 1.0,
     "top_k": 50,
 }])
-def test_generation(actor_maker: Callable[[], Actor],
-                    batch_size: int,
-                    seq_len: int,
-                    generate_kwargs: Dict[str, Any]
-                    ):
+def test_generation(actor_maker: Callable[[], Actor], batch_size: int, seq_len: int, generate_kwargs: Dict[str, Any]):
     actor = actor_maker()
     input_ids = torch.randint(0, 100, (batch_size, seq_len)).cuda()
     sequences = generate(actor.cuda(), input_ids, **generate_kwargs)
     assert sequences.shape == (batch_size, generate_kwargs["max_length"])
 
 
-@pytest.mark.cpu
 def test_utils():
-    fn_input = {
-        "tensor": torch.ones((10, )),
-        "mask": torch.randint(0, 2, (10, ))
-    }
+    fn_input = {"tensor": torch.ones((10,)), "mask": torch.randint(0, 2, (10,))}
     fn_output = masked_mean(dim=0, **fn_input)
     assert fn_output.dim() == 0
     assert torch.allclose(fn_output, torch.tensor(1.0))
@@ -56,14 +49,14 @@ def test_utils():
     batch_size = 4
     num_labels = 10
     fn_input = {
-        "r": torch.ones((batch_size, )),
+        "r": torch.ones((batch_size,)),
         "kl_coef": 1.0,
         "log_probs": torch.randn((batch_size, num_labels)),
         "log_probs_base": torch.randn((batch_size, num_labels)),
         "action_mask": torch.randint(0, 2, (batch_size, num_labels))
     }
     fn_output = compute_reward(**fn_input)
-    assert fn_output.shape == (batch_size, )
+    assert fn_output.shape == (batch_size,)
 
     batch_size = 4
     seq_len = 32
@@ -80,17 +73,11 @@ def test_utils():
     assert fn_output.shape == (batch_size, num_actions)
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("lora_rank", [4])
 @pytest.mark.parametrize("num_dim", [32])
 @pytest.mark.parametrize("num_layers", [4])
-def test_lora(lora_rank: int,
-              num_dim: int,
-              num_layers: int):
-    model = nn.ModuleList(
-        [nn.Linear(num_dim, num_dim)
-         for _ in range(num_layers)]
-    )
+def test_lora(lora_rank: int, num_dim: int, num_layers: int):
+    model = nn.ModuleList([nn.Linear(num_dim, num_dim) for _ in range(num_layers)])
     lora_model = convert_to_lora_module(model, lora_rank)
     assert isinstance(lora_model, nn.ModuleList)
     for i in range(num_layers):
@@ -103,8 +90,7 @@ def test_lora(lora_rank: int,
         assert isinstance(lora_model[i], LoraLinear)
         assert torch.allclose(old_model[i].weight, lora_model[i].weight)
         assert torch.allclose(old_model[i].bias, lora_model[i].bias)
-        assert torch.allclose(old_model[i].lora_B @ old_model[i].lora_A,
-                              lora_model[i].lora_B @ lora_model[i].lora_A)
+        assert torch.allclose(old_model[i].lora_B @ old_model[i].lora_A, lora_model[i].lora_B @ lora_model[i].lora_A)
     optimizer = torch.optim.Adam(lora_model.parameters())
     x = torch.randn(8, num_dim)
     for i in range(num_layers):
@@ -120,20 +106,19 @@ def test_lora(lora_rank: int,
                                   lora_model[i].lora_B @ lora_model[i].lora_A)
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("batch_size", [8])
 @pytest.mark.parametrize("seq_len", [128])
-@pytest.mark.parametrize("models_maker", [
-    lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
-    lambda: (GPTActor(), GPTCritic(), GPTRM()),
+@pytest.mark.parametrize(
+    "models_maker",
+    [
+        lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()),
+        lambda: (GPTActor(), GPTCritic(), GPTRM()),
     # HACK: skip llama due to long execution time
     # lambda: (LlamaActor(), LlamaCritic(), LlamaRM()),
-    lambda: (OPTActor(), OPTCritic(), OPTRM()),
-])
+        lambda: (OPTActor(), OPTCritic(), OPTRM()),
+    ])
 @torch.no_grad()
-def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
-                batch_size: int,
-                seq_len: int):
+def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]], batch_size: int, seq_len: int):
 
     actor_input = {
         "input_ids": torch.randint(0, 100, (batch_size, seq_len)),
@@ -162,17 +147,14 @@ def test_models(models_maker: Callable[[], Tuple[Actor, Critic, RewardModel]],
     rm_output = rm(**rm_input)
 
     assert actor_output.logits.shape[:2] == (batch_size, seq_len)
-    assert critic_output.shape == (batch_size, )
-    assert rm_output.shape == (batch_size, )
+    assert critic_output.shape == (batch_size,)
+    assert rm_output.shape == (batch_size,)
 
 
-@pytest.mark.cpu
 @pytest.mark.parametrize("batch_size", [16])
 @pytest.mark.parametrize("seq_len", [128])
 @pytest.mark.parametrize("num_labels", [100])
-def test_loss(batch_size: int,
-              seq_len: int,
-              num_labels: int):
+def test_loss(batch_size: int, seq_len: int, num_labels: int):
     loss = GPTLMLoss()
     loss_input = {
         "logits": torch.randn(batch_size, seq_len, num_labels),
@@ -182,54 +164,43 @@ def test_loss(batch_size: int,
 
     loss = PolicyLoss()
     loss_input = {
-        "log_probs": torch.randn(batch_size, ),
-        "old_log_probs": torch.randn(batch_size, ),
-        "advantages": torch.randn(batch_size, )
+        "log_probs": torch.randn(batch_size,),
+        "old_log_probs": torch.randn(batch_size,),
+        "advantages": torch.randn(batch_size,)
     }
     loss_output = loss(**loss_input)
 
     loss = ValueLoss()
     loss_input = {
-        "values": torch.randn(batch_size, ),
-        "old_values": torch.randn(batch_size, ),
-        "reward": torch.randn(batch_size, )
+        "values": torch.randn(batch_size,),
+        "old_values": torch.randn(batch_size,),
+        "reward": torch.randn(batch_size,)
     }
     loss_output = loss(**loss_input)
 
     loss = LogSigLoss()
     loss_input = {
-        "chosen_reward": torch.randn(batch_size, ),
-        "reject_reward": torch.randn(batch_size, ),
+        "chosen_reward": torch.randn(batch_size,),
+        "reject_reward": torch.randn(batch_size,),
     }
     loss_output = loss(**loss_input)
 
     loss = LogExpLoss()
     loss_input = {
-        "chosen_reward": torch.randn(batch_size, ),
-        "reject_reward": torch.randn(batch_size, ),
+        "chosen_reward": torch.randn(batch_size,),
+        "reject_reward": torch.randn(batch_size,),
     }
     loss_output = loss(**loss_input)
 
 
 if __name__ == "__main__":
-    generate_kwargs = dict(max_length=40,
-                           use_cache=True,
-                           do_sample=True,
-                           temperature=1.0,
-                           top_k=50)
-    test_generation(lambda: LlamaActor(),
-                    batch_size=4,
-                    seq_len=32,
-                    generate_kwargs=generate_kwargs)
+    generate_kwargs = dict(max_length=40, use_cache=True, do_sample=True, temperature=1.0, top_k=50)
+    test_generation(lambda: LlamaActor(), batch_size=4, seq_len=32, generate_kwargs=generate_kwargs)
 
     test_utils()
 
     test_lora(lora_rank=2, num_dim=8, num_layers=2)
 
-    test_models(models_maker=lambda: (BLOOMActor(),
-                                      BLOOMCritic(),
-                                      BLOOMRM()),
-                batch_size=8,
-                seq_len=128)
+    test_models(models_maker=lambda: (BLOOMActor(), BLOOMCritic(), BLOOMRM()), batch_size=8, seq_len=128)
 
     test_loss(batch_size=8, seq_len=128, num_labels=100)
diff --git a/pytest.ini b/pytest.ini
index e8a60c853..7912dbffc 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,7 +1,5 @@
 [pytest]
 markers =
-    cpu: tests which can run on CPU
-    gpu: tests which requires a single GPU
-    dist: tests which are run in a multi-GPU or multi-machine environment
-    experiment: tests for experimental features
+    dist: tests which are run in a multi-GPU or multi-machine environment (at least 4 GPUs)
+    largedist: tests which are run in a multi-GPU or multi-machine environment (at least 8 GPUs)
 addopts = --ignore=tests/test_analyzer --ignore=tests/test_auto_parallel --ignore=tests/test_autochunk --ignore=tests/test_moe
diff --git a/tests/test_config/test_load_config.py b/tests/test_config/test_load_config.py
index 550af2a4a..38b5e3f5f 100644
--- a/tests/test_config/test_load_config.py
+++ b/tests/test_config/test_load_config.py
@@ -8,7 +8,6 @@ import pytest
 from colossalai.context.config import Config
 
 
-@pytest.mark.cpu
 def test_load_config():
     filename = Path(__file__).parent.joinpath('sample_config.py')
     config = Config.from_file(filename)
diff --git a/tests/test_context/test_hybrid_parallel.py b/tests/test_context/test_hybrid_parallel.py
index 9f26a5af5..d25668afd 100644
--- a/tests/test_context/test_hybrid_parallel.py
+++ b/tests/test_context/test_hybrid_parallel.py
@@ -143,7 +143,6 @@ def run_dist(rank, world_size, port, backend, port_list, host):
         reset_seeds()
 
 
-@pytest.mark.cpu
 @rerun_if_address_is_in_use()
 def test_context():
     """
diff --git a/tests/test_data/test_cifar10_dataset.py b/tests/test_data/test_cifar10_dataset.py
index 4b9ca61d9..dfa9fa211 100644
--- a/tests/test_data/test_cifar10_dataset.py
+++ b/tests/test_data/test_cifar10_dataset.py
@@ -5,11 +5,10 @@ import os
 from pathlib import Path
 
 import pytest
-from torchvision import transforms, datasets
 from torch.utils.data import DataLoader
+from torchvision import datasets, transforms
 
 
-@pytest.mark.cpu
 def test_cifar10_dataset():
     # build transform
     transform_pipeline = [transforms.ToTensor()]
diff --git a/tests/test_data/test_data_parallel_sampler.py b/tests/test_data/test_data_parallel_sampler.py
index 2ad3fd696..7beef707c 100644
--- a/tests/test_data/test_data_parallel_sampler.py
+++ b/tests/test_data/test_data_parallel_sampler.py
@@ -53,7 +53,6 @@ def run_data_sampler(rank, world_size, port):
     torch.cuda.empty_cache()
 
 
-@pytest.mark.cpu
 @rerun_if_address_is_in_use()
 def test_data_sampler():
     spawn(run_data_sampler, 4)
diff --git a/tests/test_data/test_deterministic_dataloader.py b/tests/test_data/test_deterministic_dataloader.py
index 239e79dff..283b5cc35 100644
--- a/tests/test_data/test_deterministic_dataloader.py
+++ b/tests/test_data/test_deterministic_dataloader.py
@@ -64,7 +64,6 @@ def run_data_sampler(rank, world_size, port):
     torch.cuda.empty_cache()
 
 
-@pytest.mark.cpu
 @rerun_if_address_is_in_use()
 def test_data_sampler():
     spawn(run_data_sampler, 4)
diff --git a/tests/test_utils/test_activation_checkpointing.py b/tests/test_utils/test_activation_checkpointing.py
index 2930552cc..b7764c2f4 100644
--- a/tests/test_utils/test_activation_checkpointing.py
+++ b/tests/test_utils/test_activation_checkpointing.py
@@ -40,7 +40,6 @@ def forward_inplace(x, weight):
     return out
 
 
-@pytest.mark.gpu
 @clear_cache_before_run()
 @parameterize("use_reentrant", [True, False])
 @parameterize("cpu_offload", [True, False])