diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml
index ea1f4879c..32b518ac5 100644
--- a/.github/workflows/build_on_schedule.yml
+++ b/.github/workflows/build_on_schedule.yml
@@ -16,30 +16,65 @@ jobs:
       options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
     timeout-minutes: 40
     steps:
+      - name: Check GPU Availability # ensure all GPUs have enough memory
+        id: check-avai
+        run: |
+          avai=true
+          for i in $(seq 0 7);
+          do
+            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
+            [ "$gpu_used" -gt "10000" ] && avai=false
+          done
+
+          echo "GPU is available: $avai"
+          echo "avai=$avai" >> $GITHUB_OUTPUT
+
       - uses: actions/checkout@v2
+        if: steps.check-avai.outputs.avai == 'true'
         with:
           repository: hpcaitech/TensorNVMe
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
           path: TensorNVMe
+
       - name: Install tensornvme
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
           cd TensorNVMe
           conda install cmake
           pip install -r requirements.txt
           pip install -v .
+
       - uses: actions/checkout@v2
+        if: steps.check-avai.outputs.avai == 'true'
         with:
           ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
+
       - name: Install Colossal-AI
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
           [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
           CUDA_EXT=1 pip install -v -e .
           cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
           pip install -r requirements/requirements-test.txt
+
       - name: Unit Testing
+        if: steps.check-avai.outputs.avai == 'true'
         run: |
-          gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
-          [ "$gpu_used" -le "10000" ] && PYTHONPATH=$PWD pytest tests
+          PYTHONPATH=$PWD pytest tests
         env:
           DATA: /data/scratch/cifar-10
           LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
+
+      - name: Notify Lark
+        id: message-preparation
+        if: ${{ failure() }}
+        run: |
+          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
+          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
+          echo $msg
+          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
+        env:
+          SERVER_URL: ${{ github.server_url }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
diff --git a/.github/workflows/scripts/send_message_to_lark.py b/.github/workflows/scripts/send_message_to_lark.py
new file mode 100644
index 000000000..a113327a7
--- /dev/null
+++ b/.github/workflows/scripts/send_message_to_lark.py
@@ -0,0 +1,20 @@
+import argparse
+
+import requests
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-m', '--message', type=str)
+    parser.add_argument('-u', '--url', type=str)
+    return parser.parse_args()
+
+
+def send_message_to_lark(message, webhook_url):
+    data = {"msg_type": "text", "content": {"text": message}}
+    requests.post(webhook_url, json=data)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    send_message_to_lark(args.message, args.url)
diff --git a/test.sh b/test.sh
new file mode 100644
index 000000000..8dcecc6dd
--- /dev/null
+++ b/test.sh
@@ -0,0 +1,6 @@
+avai=true
+for i in $(seq 0 7);
+do
+    gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
+    [ "$gpu_used" -gt "10000" ] && avai=false
+done
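
The new notification script can also be exercised by hand before relying on the failure path in CI; the snippet below is only an illustrative sketch, and LARK_WEBHOOK_URL is a placeholder name for a real Lark custom-bot webhook, not something defined by this patch:

    # hypothetical manual invocation for local testing only; requires `pip install requests`
    # and a real Lark bot webhook exported as LARK_WEBHOOK_URL (placeholder)
    python .github/workflows/scripts/send_message_to_lark.py \
        -m "manual test message" \
        -u "$LARK_WEBHOOK_URL"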