
[workflow] added notification if scheduled build fails (#2574)

* [workflow] added notification if scheduled build fails

* polish code

* polish code
Frank Lee committed 788e138960
Changed files:
  1. .github/workflows/build_on_schedule.yml (39 lines changed)
  2. .github/workflows/scripts/send_message_to_lark.py (20 lines changed)
  3. test.sh (6 lines changed)

.github/workflows/build_on_schedule.yml
@@ -16,30 +16,65 @@ jobs:
      options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
    timeout-minutes: 40
    steps:
      - name: Check GPU Availability # ensure all GPUs have enough memory
        id: check-avai
        run: |
          avai=true
          for i in $(seq 0 7);
          do
            gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
            [ "$gpu_used" -le "10000" ] && avai=false
          done
          echo "GPU is available: $avai"
          echo "avai=$avai" >> $GITHUB_OUTPUT
      - uses: actions/checkout@v2
        if: steps.check-avai.outputs.avai == 'true'
        with:
          repository: hpcaitech/TensorNVMe
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
          path: TensorNVMe
      - name: Install tensornvme
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          cd TensorNVMe
          conda install cmake
          pip install -r requirements.txt
          pip install -v .
      - uses: actions/checkout@v2
        if: steps.check-avai.outputs.avai == 'true'
        with:
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
      - name: Install Colossal-AI
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
          CUDA_EXT=1 pip install -v -e .
          cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
          pip install -r requirements/requirements-test.txt
      - name: Unit Testing
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
          [ "$gpu_used" -le "10000" ] && PYTHONPATH=$PWD pytest tests
          PYTHONPATH=$PWD pytest tests
        env:
          DATA: /data/scratch/cifar-10
          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
      - name: Notify Lark
        id: message-preparation
        if: ${{ failure() }}
        run: |
          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
          msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
          echo $msg
          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
        env:
          SERVER_URL: ${{ github.server_url }}
          REPO: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
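The Notify Lark step runs only on failure(), builds the run URL from GitHub-provided context values passed in via env, and hands the text to the new helper script. A minimal sketch of the same message assembly in Python, assuming SERVER_URL, REPO, RUN_ID and WEBHOOK_URL are exported in the local shell (a hypothetical local test, not part of this commit):

    import os

    import requests

    # Rebuild the workflow-run URL the same way the Notify Lark step does.
    url = f"{os.environ['SERVER_URL']}/{os.environ['REPO']}/actions/runs/{os.environ['RUN_ID']}"
    msg = f"Scheduled Build and Test failed on 8 GPUs, please visit {url} for details"

    # Same text payload shape the helper script posts to the Lark webhook.
    requests.post(os.environ['WEBHOOK_URL'],
                  json={"msg_type": "text", "content": {"text": msg}})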

.github/workflows/scripts/send_message_to_lark.py
@@ -0,0 +1,20 @@
import argparse

import requests


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--message', type=str)
    parser.add_argument('-u', '--url', type=str)
    return parser.parse_args()


def send_message_to_lark(message, webhook_url):
    data = {"msg_type": "text", "content": {"text": message}}
    requests.post(webhook_url, json=data)


if __name__ == '__main__':
    args = parse_args()
    send_message_to_lark(args.message, args.url)
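The helper fires the webhook and exits without inspecting the HTTP response, so a misconfigured LARK_NOTIFICATION_WEBHOOK_URL fails silently. A small sketch of the same call with a status check added (raise_for_status is standard requests API; everything else is unchanged, and this is an illustration rather than part of the commit):

    def send_message_to_lark(message, webhook_url):
        data = {"msg_type": "text", "content": {"text": message}}
        resp = requests.post(webhook_url, json=data)
        # Raise on 4xx/5xx so a rejected webhook call shows up in the CI log.
        resp.raise_for_status()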

test.sh
@@ -0,0 +1,6 @@
avai=true
for i in $(seq 0 7);
do
  gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
  [ "$gpu_used" -le "10000" ] && avai=false
done
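test.sh repeats the availability scan from the workflow: it queries memory.used (in MiB) for GPUs 0-7 and sets avai to false whenever a device reports 10000 MiB or less in use. The same scan sketched in Python for interactive debugging, assuming nvidia-smi is on PATH and eight devices are present (not part of this commit):

    import subprocess

    avai = True
    for i in range(8):
        # memory.used is reported in MiB with --format=csv,noheader,nounits
        out = subprocess.check_output(
            ["nvidia-smi", "-i", str(i),
             "--query-gpu=memory.used", "--format=csv,noheader,nounits"],
            text=True)
        if int(out.strip()) <= 10000:
            avai = False
    print(f"GPU is available: {avai}")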