mirror of https://github.com/hpcaitech/ColossalAI
[devops] fix compatibility (#5444)
* [devops] fix compatibility
* [hotfix] update compatibility test on pr
* [devops] fix compatibility
* [devops] record duration during comp test
* [test] decrease test duration
* fix falcon

pull/5471/head
parent 385e85afd4
commit f2e8b9ef9f
@@ -1,2 +1 @@
-2.0.0-11.7.0
-2.1.0-11.8.0
+2.1.0-12.1.0
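
Each entry above pairs a PyTorch version with a CUDA version joined by a hyphen; the update replaces the torch 2.0.0 / CUDA 11.7 and torch 2.1.0 / CUDA 11.8 builds with a single torch 2.1.0 / CUDA 12.1 build. A minimal sketch of splitting such a tag, assuming only that naming convention (the helper name is made up):

# Hypothetical helper, assuming the "<torch>-<cuda>" tag convention shown above.
def parse_compat_tag(tag: str) -> tuple[str, str]:
    torch_version, cuda_version = tag.split("-", 1)
    return torch_version, cuda_version

assert parse_compat_tag("2.1.0-12.1.0") == ("2.1.0", "12.1.0")
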
@@ -67,7 +67,6 @@ jobs:
             --durations=0 \
             tests/
         env:
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
 
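
NCCL_SHM_DISABLE=1 tells NCCL to skip its shared-memory transport, a common workaround when a container's /dev/shm is too small for NCCL's buffers; it is dropped here because the container options in the hunks below now mount a usable /dev/shm. Purely as illustration, the same workaround expressed from Python rather than workflow YAML (a sketch, not part of the change):

import os

# NCCL reads this variable at initialization; "1" forces it to avoid the SHM transport.
os.environ["NCCL_SHM_DISABLE"] = "1"
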
@@ -50,7 +50,7 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: ${{ matrix.container }}
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 120
     steps:
       - name: Install dependencies
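
Adding -v /dev/shm to the container options gives the job container a volume-backed /dev/shm instead of Docker's default 64 MiB shared-memory allocation, which NCCL and PyTorch DataLoader workers depend on. A minimal sketch, assuming it runs inside such a container, to confirm the mount is reasonably large (the 1 GiB threshold is arbitrary):

import shutil

def check_shm(min_bytes: int = 1 << 30) -> None:
    # Docker's default 64 MiB /dev/shm is easy to exhaust with NCCL buffers.
    total = shutil.disk_usage("/dev/shm").total
    if total < min_bytes:
        raise RuntimeError(f"/dev/shm is only {total / 2**20:.0f} MiB")

if __name__ == "__main__":
    check_shm()
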
@@ -87,9 +87,8 @@ jobs:
           pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
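
pytest's --durations flag prints the slowest test timings at the end of a run, and --durations=0 asks for all of them, which is what the "record duration during comp test" item in the commit message refers to. The equivalent call through pytest's Python API, as a sketch:

import pytest

if __name__ == "__main__":
    # Same effect as the "pytest --durations=0 tests" command in the step above.
    raise SystemExit(pytest.main(["--durations=0", "tests"]))
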
@@ -41,7 +41,7 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: ${{ matrix.container }}
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 120
     concurrency:
       group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }}
@@ -82,9 +82,8 @@ jobs:
           pip install -r requirements/requirements-test.txt
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
@@ -38,7 +38,7 @@ jobs:
       matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
     container:
       image: ${{ matrix.container }}
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 120
     steps:
       - name: Install dependencies
@@ -80,10 +80,9 @@ jobs:
 
       - name: Unit Testing
         run: |
-          PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest --durations=0 tests
         env:
           DATA: /data/scratch/cifar-10
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
 
@@ -182,7 +182,7 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
         overlap_communication: bool = True,
         use_ep_inside: bool = True,
         custom_policy: Policy = None,
-        checkpoint_io: Optional[MoECheckpintIO] = None,
+        checkpoint_io: Optional[MoECheckpointIO] = None,
     ) -> None:
         assert (
             dist.get_world_size() % (tp_size * pp_size) == 0
@@ -341,7 +341,6 @@ class MoeHybridParallelPlugin(HybridParallelPlugin):
             **_kwargs,
         )
 
-
     def get_checkpoint_io(self) -> MoECheckpointIO:
         if self.checkpoint_io is None:
             self.checkpoint_io = MoECheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage)
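
Besides correcting the annotation from MoECheckpintIO to MoECheckpointIO, get_checkpoint_io keeps its lazy-construction behaviour: a checkpoint IO object passed to the constructor is used as-is, otherwise one is built from the plugin's process groups on first request. A generic sketch of that pattern with made-up names, assuming nothing about the real classes beyond what the hunks show:

from typing import Optional

class CheckpointIOStub:
    # Stand-in for the real MoECheckpointIO; only here to keep the sketch runnable.
    pass

class PluginSketch:
    def __init__(self, checkpoint_io: Optional[CheckpointIOStub] = None) -> None:
        self.checkpoint_io = checkpoint_io

    def get_checkpoint_io(self) -> CheckpointIOStub:
        if self.checkpoint_io is None:
            self.checkpoint_io = CheckpointIOStub()  # built lazily on first use
        return self.checkpoint_io
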
@@ -1,7 +1,6 @@
 from contextlib import nullcontext
 from typing import Optional
 
-import pytest
 import torch
 import torch.distributed as dist
 
@@ -12,13 +11,7 @@ from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import (
-    clear_cache_before_run,
-    parameterize,
-    rerun_if_address_is_in_use,
-    skip_if_not_enough_gpus,
-    spawn,
-)
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
 
 
@@ -177,12 +170,5 @@ def test_gemini_plugin(early_stop: bool = True):
     spawn(run_dist, 4, early_stop=early_stop)
 
 
-@pytest.mark.largedist
-@skip_if_not_enough_gpus(8)
-@rerun_if_address_is_in_use()
-def test_gemini_plugin_3d(early_stop: bool = True):
-    spawn(run_dist, 8, early_stop=early_stop)
-
-
 if __name__ == "__main__":
     test_gemini_plugin(early_stop=False)
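
The 8-GPU test_gemini_plugin_3d variant is removed here (the "[test] decrease test duration" item), which is also why the skip_if_not_enough_gpus import above is no longer needed. For reference, a guard of that kind is typically a thin pytest wrapper; the following is a sketch of the idea, not the colossalai.testing implementation:

import functools

import pytest
import torch

def skip_without_gpus(min_gpus: int):
    # Skip the decorated test when fewer CUDA devices are visible than required.
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            if torch.cuda.device_count() < min_gpus:
                pytest.skip(f"requires at least {min_gpus} GPUs")
            return fn(*args, **kwargs)
        return wrapper
    return decorator
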
@@ -16,7 +16,6 @@ from colossalai.testing import (
     clear_cache_before_run,
     parameterize,
     rerun_if_address_is_in_use,
-    skip_if_not_enough_gpus,
     spawn,
 )
 from tests.kit.model_zoo import model_zoo
@@ -178,12 +177,5 @@ def test_gemini_ckpIO():
     spawn(run_dist, 4)
 
 
-@pytest.mark.largedist
-@skip_if_not_enough_gpus(min_gpus=8)
-@rerun_if_address_is_in_use()
-def test_gemini_ckpIO_3d():
-    spawn(run_dist, 8)
-
-
 if __name__ == "__main__":
     test_gemini_ckpIO()
@@ -1,5 +1,6 @@
 import pytest
 import torch
+import torch.distributed as dist
 
 import colossalai
 from colossalai.logging import disable_existing_loggers
@@ -72,6 +73,8 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
     if stage_manager is None or stage_manager.is_first_stage():
         if test_config["precision"] == "fp32":
             atol, rtol = 2e-4, 1e-3
+            if dist.get_world_size() > 4:
+                atol, rtol = 4e-4, 3e-2
         else:
             atol, rtol = 5e-3, 5e-3
         check_weight(falcon, sharded_falcon, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False)
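
The added branch loosens the fp32 tolerances once more than 4 ranks participate, since the weights being compared are sharded and reduced across more partitions and pick up slightly more numerical error. A minimal sketch of the kind of comparison those tolerances feed into, assuming check_weight ultimately reduces to an element-wise closeness test:

import torch

def weights_close(a: torch.Tensor, b: torch.Tensor, world_size: int) -> bool:
    # Mirrors the fp32 tolerance selection in the hunk above.
    atol, rtol = (4e-4, 3e-2) if world_size > 4 else (2e-4, 1e-3)
    return torch.allclose(a, b, atol=atol, rtol=rtol)
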