mirror of https://github.com/hpcaitech/ColossalAI
[ci] fixed booster test (#5251)
* [ci] fixed booster test
* [ci] fixed booster test
* [ci] fixed booster test

pull/5272/head
parent edf94a35c3
commit d5eeeb1416
@@ -90,7 +90,7 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 60
     defaults:
       run:
@@ -165,7 +165,6 @@ jobs:
             --ignore tests/test_checkpoint_io \
             tests/
         env:
-          NCCL_SHM_DISABLE: 1
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
           LLAMA_PATH: /data/scratch/llama-tiny
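Note on the two workflow hunks above: the container now mounts the host's /dev/shm instead of the cifar-10 volume, which gives NCCL enough shared memory to use its SHM transport inside the container, so the NCCL_SHM_DISABLE=1 workaround is dropped from the test environment. A minimal sketch of how one could verify the shared-memory mount before running the tests (illustration only, not part of this commit; the helper name shm_free_gb is made up):

# Hypothetical helper (not in this commit): check that the container's
# /dev/shm mount is large enough for NCCL's shared-memory transport.
import os

def shm_free_gb(path: str = "/dev/shm") -> float:
    # statvfs reports free blocks and block size for the filesystem backing `path`
    st = os.statvfs(path)
    return st.f_bavail * st.f_frsize / 1024**3

if __name__ == "__main__":
    free = shm_free_gb()
    print(f"/dev/shm free: {free:.2f} GiB")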
@@ -205,4 +204,3 @@ jobs:
         with:
           name: report
           path: report/
-
@@ -13,15 +13,16 @@ jobs:
     runs-on: [self-hosted, gpu]
     container:
       image: hpcaitech/pytorch-cuda:2.0.0-11.7.0
-      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
+      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
     timeout-minutes: 90
     steps:
       - name: Check GPU Availability # ensure all GPUs have enough memory
         id: check-avai
         run: |
           avai=true
-          for i in $(seq 0 3);
+          ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+          endIndex=$(($ngpu-1))
+          for i in $(seq 0 $endIndex);
           do
             gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
             [ "$gpu_used" -gt "2000" ] && avai=false
           done
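The availability check above no longer hardcodes `seq 0 3`: it queries nvidia-smi for the number of visible GPUs and iterates over all of them, so the same scheduled workflow works on nodes with any GPU count. A rough Python equivalent of that shell logic, for illustration only (the function name gpus_available is made up; the nvidia-smi query flags are the ones used in the workflow):

# Rough Python equivalent of the updated shell check (illustration only).
# It counts GPUs dynamically instead of assuming exactly four, and flags the
# node as busy if any GPU already has more than 2000 MiB of memory in use.
import subprocess

def gpus_available(threshold_mib: int = 2000) -> bool:
    out = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
        text=True,
    )
    # one value per visible GPU, so no hardcoded "seq 0 3" bound is needed
    used = [int(v) for v in out.split() if v.strip()]
    return all(mib <= threshold_mib for mib in used)

if __name__ == "__main__":
    print("avai=true" if gpus_available() else "avai=false")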
@@ -74,7 +75,7 @@ jobs:
         if: ${{ failure() }}
         run: |
           url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
-          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          msg="Scheduled Build and Test failed, please visit $url for details"
           echo $msg
           python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
         env:
@@ -2,7 +2,6 @@ import torch
 
 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
 
 from ..registry import ModelAttribute, model_zoo
 
 # ================================
@@ -10,10 +10,11 @@ from colossalai.booster.plugin import HybridParallelPlugin
 from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
 from tests.kit.model_zoo import model_zoo
 
 
+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -69,7 +70,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
         "transformers_llama_for_casual_lm"
     ).items():
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
-        torch.cuda.empty_cache()
 
         if err is None:
             passed_models.append(name)
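The two test hunks above replace the manual torch.cuda.empty_cache() call after run_fn(...) with a clear_cache_before_run() decorator imported from colossalai.testing; the gemini plugin test below gets the same treatment. A minimal sketch of what such a decorator can look like, assuming it only needs to free cached CUDA memory before the wrapped test body runs (illustration only, not the actual colossalai.testing implementation; the names ending in _sketch and _example are made up):

# Illustrative sketch only -- not the actual colossalai.testing implementation.
# A decorator in this style frees cached CUDA memory right before the wrapped
# test body runs, which is why the explicit torch.cuda.empty_cache() calls
# after run_fn(...) are dropped in the hunks above.
import functools
import gc

import torch

def clear_cache_before_run_sketch():
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            return fn(*args, **kwargs)
        return wrapper
    return decorator

@clear_cache_before_run_sketch()
def run_fn_example():
    # stand-in for a booster plugin test body
    return None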
@@ -12,10 +12,11 @@ from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo, COMMON_MODELS, IS_FAST_TEST
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo
 
 
+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
     try:
         if init_method == "lazy":
@@ -116,7 +117,7 @@ def check_gemini_plugin(
             "transformers_falcon_for_sequence_classification",
             "transformers_falcon_for_token_classification",
             "transformers_falcon_for_question_answering",
             "transformers_gptj_lm",  # lead to OOM when running in ci
             "transformers_gptj_for_question_answering",
             "transformers_gptj_for_sequence_classification",
         ]:
@@ -145,7 +146,6 @@ def check_gemini_plugin(
             tp_size = 1
 
         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
         else: