mirror of https://github.com/hpcaitech/ColossalAI
[test] ignore 8 gpu test (#1080)
* [test] ignore 8 gpu test
* polish code
* polish workflow
* polish workflow
parent 0653c63eaa
commit 65ee6dcc20
@@ -15,7 +15,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:1.10.1-11.3.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
+      options: --shm-size=2gb --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 40
     steps:
       - uses: actions/checkout@v2

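Note on the --shm-size=2gb addition above: PyTorch DataLoader worker processes exchange batches through the container's shared-memory segment (/dev/shm), and Docker's small default (64 MB) is easily exhausted during multi-worker data loading. The snippet below only illustrates that mechanism; it is not code from this repository.

# Illustration only (not from this repo): DataLoader workers place tensors in /dev/shm,
# which is why the CI container needs a larger shared-memory segment.
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":
    dataset = TensorDataset(torch.randn(256, 3, 32, 32), torch.randint(0, 10, (256,)))
    # num_workers > 0 spawns worker processes that pass batches through shared memory
    loader = DataLoader(dataset, batch_size=32, num_workers=2)
    for images, labels in loader:
        pass  # each batch travels through /dev/shm before reaching the main process
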
@@ -25,10 +25,10 @@ jobs:
         run: |
           [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
           pip install -r requirements/requirements.txt
-          pip install -r requirements/requirements-test.txt
           pip install -v -e .
           cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
           cp /__w/ColossalAI/ColossalAI/*.so /github/home/cuda_ext_cache/
+          pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |
           PYTHONPATH=$PWD pytest tests

@@ -7,9 +7,9 @@ from colossalai.utils.cuda import get_current_device


 class DummyDataLoader(DummyDataGenerator):
-    vocab_size = 50304
+    vocab_size = 128
     batch_size = 4
-    seq_len = 1024
+    seq_len = 64

     def generate(self):
         input_ids = torch.randint(0,

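For scale, the shrunken DummyDataLoader settings produce batches along the following lines (a hedged sketch; the repo's actual generate() may differ in detail):

# Hedged sketch of a dummy GPT batch under the new settings; not the repo's exact code.
import torch

vocab_size, batch_size, seq_len = 128, 4, 64
input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
attention_mask = torch.ones_like(input_ids)
# vocab_size 50304 -> 128 and seq_len 1024 -> 64 shrink the embedding table and the
# quadratic attention cost dramatically, keeping the CI unit test cheap.
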
@@ -47,6 +47,8 @@ class GPTLMModel(nn.Module):
         # Only return lm_logits
         return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]

+def gpt2_micro(checkpoint=True):
+    return GPTLMModel(checkpoint=checkpoint, hidden_size=32, num_layers=2, num_attention_heads=4, max_seq_len=64, vocab_size=128)

 def gpt2_s(checkpoint=True):
     return GPTLMModel(checkpoint=checkpoint)

@@ -76,4 +78,4 @@ def get_training_components():
     testloader = DummyDataLoader()

     criterion = GPTLMLoss()
-    return gpt2_s, trainloader, testloader, torch.optim.Adam, criterion
+    return gpt2_micro, trainloader, testloader, torch.optim.Adam, criterion

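With gpt2_micro wired into get_training_components(), the components tuple is consumed roughly as below. This is a sketch only: the availability of get_training_components in scope and the loader's output format are assumptions, not taken from this diff.

# Hedged usage sketch; `get_training_components` is assumed to be importable/in scope.
import torch

model_builder, trainloader, testloader, optim_class, criterion = get_training_components()
model = model_builder(checkpoint=True)                 # now builds the tiny gpt2_micro config
optimizer = optim_class(model.parameters(), lr=1e-3)

input_ids, attention_mask = next(iter(trainloader))    # assumed (4, 64) tensors, see DummyDataLoader
loss = criterion(model(input_ids, attention_mask), input_ids)
loss.backward()
optimizer.step()
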
@@ -83,6 +83,7 @@ def run_trainer(rank, world_size, port):


 @pytest.mark.dist
+@pytest.mark.skip("This test requires 8 GPUs to execute")
 @rerun_if_address_is_in_use()
 def test_hybrid_parallel():
     world_size = 8

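The decorator added here skips the test unconditionally because the CI runner does not have 8 GPUs. For reference, pytest.mark.skip always skips, while pytest.mark.skipif could gate on the detected device count; the snippet below only illustrates those two mechanisms and is not part of this commit.

# Illustration of the skip mechanics only; not code from this commit.
import pytest
import torch

@pytest.mark.skip("This test requires 8 GPUs to execute")
def test_always_skipped():
    ...

@pytest.mark.skipif(torch.cuda.device_count() < 8, reason="needs 8 GPUs")
def test_runs_only_with_8_gpus():
    ...
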
@@ -51,6 +51,7 @@ def check_layer_and_operation(rank, world_size, port):


 @pytest.mark.dist
+@pytest.mark.skip("This test requires 8 GPUs to execute")
 @rerun_if_address_is_in_use()
 def test_3d():
     world_size = 8

@@ -328,7 +328,6 @@ def run_model_dist(rank, world_size, port):

 @pytest.mark.dist
 @pytest.mark.parametrize('world_size', [1, 4])
-# @parameterize('world_size', [1, 4])
 @rerun_if_address_is_in_use()
 def test_model(world_size):
     run_func = partial(run_model_dist, world_size=world_size, port=free_port())

@@ -67,6 +67,7 @@ def check_checkpoint_1d(rank, world_size, port):


 @pytest.mark.dist
+@pytest.mark.skip("This test should be invoked with 8 GPUs")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_1d():
     world_size = 8

@@ -67,6 +67,7 @@ def check_checkpoint_2d(rank, world_size, port):


 @pytest.mark.dist
+@pytest.mark.skip("This test should be invoked with 8 GPUs")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2d():
     world_size = 8

@@ -67,6 +67,7 @@ def check_checkpoint_2p5d(rank, world_size, port):


 @pytest.mark.dist
+@pytest.mark.skip("This test should be invoked with 8 GPUs")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_2p5d():
     world_size = 8

@@ -67,6 +67,7 @@ def check_checkpoint_3d(rank, world_size, port):


 @pytest.mark.dist
+@pytest.mark.skip("This test requires 8 GPUs to execute")
 @rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
 def test_checkpoint_3d():
     world_size = 8

@@ -22,7 +22,7 @@ def run_dist(rank, world_size, port):


 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4, 5])
+@pytest.mark.parametrize("world_size", [3, 4])
 def test_memory_utils(world_size):
     run_func = partial(run_dist, world_size=world_size, port=free_port())
     mp.spawn(run_func, nprocs=world_size)

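Lowering the parametrized world sizes matters because each world size maps one-to-one to processes spawned via mp.spawn, and therefore to the GPUs a run needs. A minimal sketch of that pattern follows, with a placeholder worker body:

# Minimal sketch of the spawn pattern these tests use; the worker body is a placeholder.
from functools import partial
import torch.multiprocessing as mp

def run_dist(rank, world_size, port):
    # the real tests initialize the distributed context here (e.g. via colossalai.launch)
    print(f"rank {rank} of {world_size}, port {port}")

def launch(world_size, port=29500):
    run_func = partial(run_dist, world_size=world_size, port=port)
    mp.spawn(run_func, nprocs=world_size)  # one process per rank: world_size == processes needed

if __name__ == "__main__":
    launch(world_size=2)
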
@@ -85,7 +85,7 @@ def run_dist(rank, world_size, port):


 @pytest.mark.dist
-@pytest.mark.parametrize("world_size", [4, 5])
+@pytest.mark.parametrize("world_size", [2, 4])
 @rerun_if_address_is_in_use()
 def test_zero_tensor_utils(world_size):
     run_func = partial(run_dist, world_size=world_size, port=free_port())