diff --git a/.compatibility b/.compatibility index a918cb162..d90a74b58 100644 --- a/.compatibility +++ b/.compatibility @@ -1,2 +1 @@ -2.0.0-11.7.0 -2.1.0-11.8.0 +2.1.0-12.1.0 diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 510665b46..3ff19b37b 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -67,7 +67,6 @@ jobs: --durations=0 \ tests/ env: - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny @@ -83,4 +82,4 @@ jobs: SERVER_URL: ${{github.server_url }} REPO: ${{ github.repository }} RUN_ID: ${{ github.run_id }} - WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }} \ No newline at end of file + WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }} diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index a6f9582ac..764938806 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -50,7 +50,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: ${{ matrix.container }} - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 120 steps: - name: Install dependencies @@ -87,9 +87,8 @@ jobs: pip install -r requirements/requirements-test.txt - name: Unit Testing run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index ede6c380a..f582b3090 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -41,7 +41,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: ${{ matrix.container }} - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 120 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }} @@ -82,9 +82,8 @@ jobs: pip install -r requirements/requirements-test.txt - name: Unit Testing run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 1cf456ff6..3348b51ec 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -38,7 +38,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: ${{ matrix.container }} - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 120 steps: - name: Install dependencies @@ -80,10 +80,9 @@ jobs: - name: Unit Testing run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py index 454710fcc..ae372dd03 100644 --- a/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py +++ b/colossalai/booster/plugin/moe_hybrid_parallel_plugin.py @@ -182,7 +182,7 @@ class MoeHybridParallelPlugin(HybridParallelPlugin): overlap_communication: bool = True, use_ep_inside: bool = True, custom_policy: Policy = None, - checkpoint_io: Optional[MoECheckpintIO] = None, + checkpoint_io: Optional[MoECheckpointIO] = None, ) -> None: assert ( dist.get_world_size() % (tp_size * pp_size) == 0 @@ -341,7 +341,6 @@ class MoeHybridParallelPlugin(HybridParallelPlugin): **_kwargs, ) - def get_checkpoint_io(self) -> MoECheckpointIO: if self.checkpoint_io is None: self.checkpoint_io = MoECheckpointIO(self.dp_group, self.pp_group, self.tp_group, self.zero_stage) diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py index 0f72d2bcd..892144772 100644 --- a/tests/test_booster/test_plugin/test_gemini_plugin.py +++ b/tests/test_booster/test_plugin/test_gemini_plugin.py @@ -1,7 +1,6 @@ from contextlib import nullcontext from typing import Optional -import pytest import torch import torch.distributed as dist @@ -12,13 +11,7 @@ from colossalai.fx import is_compatible_with_meta from colossalai.lazy.lazy_init import LazyInitContext from colossalai.nn.optimizer import HybridAdam from colossalai.tensor.colo_parameter import ColoParameter -from colossalai.testing import ( - clear_cache_before_run, - parameterize, - rerun_if_address_is_in_use, - skip_if_not_enough_gpus, - spawn, -) +from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo @@ -177,12 +170,5 @@ def test_gemini_plugin(early_stop: bool = True): spawn(run_dist, 4, early_stop=early_stop) -@pytest.mark.largedist -@skip_if_not_enough_gpus(8) -@rerun_if_address_is_in_use() -def test_gemini_plugin_3d(early_stop: bool = True): - spawn(run_dist, 8, early_stop=early_stop) - - if __name__ == "__main__": - test_gemini_plugin(early_stop=False) \ No newline at end of file + test_gemini_plugin(early_stop=False) diff --git a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py index daddf6dc7..ece3b4036 100644 --- a/tests/test_checkpoint_io/test_gemini_checkpoint_io.py +++ b/tests/test_checkpoint_io/test_gemini_checkpoint_io.py @@ -16,7 +16,6 @@ from colossalai.testing import ( clear_cache_before_run, parameterize, rerun_if_address_is_in_use, - skip_if_not_enough_gpus, spawn, ) from tests.kit.model_zoo import model_zoo @@ -178,12 +177,5 @@ def test_gemini_ckpIO(): spawn(run_dist, 4) -@pytest.mark.largedist -@skip_if_not_enough_gpus(min_gpus=8) -@rerun_if_address_is_in_use() -def test_gemini_ckpIO_3d(): - spawn(run_dist, 8) - - if __name__ == "__main__": - test_gemini_ckpIO() \ No newline at end of file + test_gemini_ckpIO() diff --git a/tests/test_shardformer/test_model/test_shard_falcon.py b/tests/test_shardformer/test_model/test_shard_falcon.py index 963045179..5e2efcd80 100644 --- a/tests/test_shardformer/test_model/test_shard_falcon.py +++ b/tests/test_shardformer/test_model/test_shard_falcon.py @@ -1,5 +1,6 @@ import pytest import torch +import torch.distributed as dist import colossalai from colossalai.logging import disable_existing_loggers @@ -72,6 +73,8 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, if stage_manager is None or stage_manager.is_first_stage(): if test_config["precision"] == "fp32": atol, rtol = 2e-4, 1e-3 + if dist.get_world_size() > 4: + atol, rtol = 4e-4, 3e-2 else: atol, rtol = 5e-3, 5e-3 check_weight(falcon, sharded_falcon, col_layer_for_check, tp_group, atol=atol, rtol=rtol, dim=1, verbose=False)