diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml
index 50417ac8a..54e8a6d93 100644
--- a/.github/workflows/build_on_pr.yml
+++ b/.github/workflows/build_on_pr.yml
@@ -90,7 +90,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
       run:
@@ -165,7 +165,6 @@ jobs:
             --ignore tests/test_checkpoint_io \
             tests/
         env:
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny

@@ -205,4 +204,3 @@ jobs:
         with:
           name: report
           path: report/
-
diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index 3bee3b4f9..5b0103eb7 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -13,15 +13,17 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 3);
-          do
+          ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+          endIndex=$(($ngpu-1))
+          for i in $(seq 0 $endIndex);
+          do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
             [ "$gpu_used" -gt "2000" ] && avai=false
           done
@@ -74,7 +75,7 @@ jobs:
         if: ${{ failure() }}
         run: |
           url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
-          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          msg="Scheduled Build and Test failed, please visit $url for details"
           echo $msg
           python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
         env:
diff --git a/tests/kit/model_zoo/transformers/chatglm2.py b/tests/kit/model_zoo/transformers/chatglm2.py
index 0b178d58c..e27fdb4e2 100644
--- a/tests/kit/model_zoo/transformers/chatglm2.py
+++ b/tests/kit/model_zoo/transformers/chatglm2.py
@@ -2,7 +2,6 @@ import torch

 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
-
 from ..registry import ModelAttribute, model_zoo

 # ================================
diff --git a/tests/test_booster/test_plugin/test_3d_plugin.py b/tests/test_booster/test_plugin/test_3d_plugin.py
index ad878fb0c..e724d7359 100644
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
@@ -10,10 +10,11 @@ from colossalai.booster.plugin import HybridParallelPlugin
 from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -69,7 +70,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
         "transformers_llama_for_casual_lm"
     ).items():
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
-        torch.cuda.empty_cache()

         if err is None:
             passed_models.append(name)
diff --git a/tests/test_booster/test_plugin/test_gemini_plugin.py b/tests/test_booster/test_plugin/test_gemini_plugin.py
index 3462d5dde..9952e41e5 100644
--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
@@ -12,10 +12,11 @@ from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -116,7 +117,7 @@ def check_gemini_plugin(
             "transformers_falcon_for_sequence_classification",
             "transformers_falcon_for_token_classification",
             "transformers_falcon_for_question_answering",
-            "transformers_gptj_lm", # lead to OOM when running in ci
+            "transformers_gptj_lm",  # lead to OOM when running in ci
             "transformers_gptj_for_question_answering",
             "transformers_gptj_for_sequence_classification",
         ]:
@@ -145,7 +146,6 @@ def check_gemini_plugin(
             tp_size = 1

         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
         else:
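
Note on the test changes: the diff swaps the per-model `torch.cuda.empty_cache()` calls for a `@clear_cache_before_run()` decorator imported from `colossalai.testing`, so cache cleanup happens once per `run_fn` invocation instead of being repeated at each call site. The snippet below is only a minimal sketch of that pattern under the assumption that the decorator just releases cached CUDA memory before the wrapped function runs; it is not the actual `colossalai.testing` implementation, and the names `clear_cache_sketch` and `example_run_fn` are hypothetical.

# Sketch of a cache-clearing test decorator in the spirit of clear_cache_before_run.
# Assumption: the goal is the same as the removed manual calls, i.e. freeing cached
# CUDA memory before each parameterized test case; this is not the real library code.
import functools
import gc

import torch


def clear_cache_sketch():
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            gc.collect()  # drop lingering Python references from the previous case
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # return cached blocks to the allocator
            return fn(*args, **kwargs)

        return wrapper

    return decorator


@clear_cache_sketch()
def example_run_fn(model_fn, data_gen_fn):
    # Build the model and data here; any cache left by the previously tested
    # model has already been emptied before this body executes.
    model = model_fn()
    data = data_gen_fn()
    return model, data

Applying the cleanup at the decorator level keeps the loop bodies in check_3d_plugin and check_gemini_plugin free of ad-hoc cache management.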