mirror of https://github.com/hpcaitech/ColossalAI
[workflow] added notification if scheduled build fails (#2574)
* [workflow] added notification if scheduled build fails * polish code * polish code
pull/2585/head
parent
fba08743a8
commit
788e138960
|
@ -16,30 +16,65 @@ jobs:
|
||||||
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
|
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10
|
||||||
timeout-minutes: 40
|
timeout-minutes: 40
|
||||||
steps:
|
steps:
|
||||||
|
- name: Check GPU Availability  # ensure all GPUs have enough memory
  id: check-avai
  run: |
    # Probe GPUs 0-7 and publish `avai` as a step output. Later steps are
    # gated on `steps.check-avai.outputs.avai == 'true'`, so the build is
    # skipped when the machine is already busy.
    avai=true
    for i in $(seq 0 7);
    do
      # memory.used as reported by nvidia-smi (numeric only, no header/unit).
      gpu_used=$(nvidia-smi -i "$i" --query-gpu=memory.used --format=csv,noheader,nounits)
      # A GPU counts as occupied when its used memory EXCEEDS the threshold;
      # one occupied GPU is enough to mark the whole runner unavailable.
      if [ "$gpu_used" -gt "10000" ]; then
        avai=false
      fi
    done

    echo "GPU is available: $avai"
    echo "avai=$avai" >> "$GITHUB_OUTPUT"
|
||||||
|
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
if: steps.check-avai.outputs.avai == 'true'
|
||||||
with:
|
with:
|
||||||
repository: hpcaitech/TensorNVMe
|
repository: hpcaitech/TensorNVMe
|
||||||
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
|
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
|
||||||
path: TensorNVMe
|
path: TensorNVMe
|
||||||
|
|
||||||
- name: Install tensornvme
|
- name: Install tensornvme
|
||||||
|
if: steps.check-avai.outputs.avai == 'true'
|
||||||
run: |
|
run: |
|
||||||
cd TensorNVMe
|
cd TensorNVMe
|
||||||
conda install cmake
|
conda install cmake
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
pip install -v .
|
pip install -v .
|
||||||
|
|
||||||
- uses: actions/checkout@v2
|
- uses: actions/checkout@v2
|
||||||
|
if: steps.check-avai.outputs.avai == 'true'
|
||||||
with:
|
with:
|
||||||
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
|
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
|
||||||
|
|
||||||
- name: Install Colossal-AI
|
- name: Install Colossal-AI
|
||||||
|
if: steps.check-avai.outputs.avai == 'true'
|
||||||
run: |
|
run: |
|
||||||
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
|
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
|
||||||
CUDA_EXT=1 pip install -v -e .
|
CUDA_EXT=1 pip install -v -e .
|
||||||
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
|
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
|
||||||
pip install -r requirements/requirements-test.txt
|
pip install -r requirements/requirements-test.txt
|
||||||
|
|
||||||
- name: Unit Testing
|
- name: Unit Testing
|
||||||
|
if: steps.check-avai.outputs.avai == 'true'
|
||||||
run: |
|
run: |
|
||||||
gpu_used=$(nvidia-smi -i 0 --query-gpu=memory.used --format=csv,noheader,nounits)
|
PYTHONPATH=$PWD pytest tests
|
||||||
[ "$gpu_used" -le "10000" ] && PYTHONPATH=$PWD pytest tests
|
|
||||||
env:
|
env:
|
||||||
DATA: /data/scratch/cifar-10
|
DATA: /data/scratch/cifar-10
|
||||||
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
|
||||||
|
|
||||||
|
- name: Notify Lark
  id: message-preparation
  # Only runs when an earlier step of this job has failed.
  if: ${{ failure() }}
  run: |
    # Build a link to this workflow run and push it to the Lark webhook.
    url="$SERVER_URL/$REPO/actions/runs/$RUN_ID"
    msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
    # Quote every expansion so the message and URL are passed through
    # verbatim (no word splitting or glob expansion).
    echo "$msg"
    python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u "$WEBHOOK_URL"
  env:
    SERVER_URL: ${{ github.server_url }}
    REPO: ${{ github.repository }}
    RUN_ID: ${{ github.run_id }}
    WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
|
||||||
|
|
|
@ -0,0 +1,20 @@
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: has two string attributes, ``message``
        (``-m/--message``) and ``url`` (``-u/--url``). Each is ``None``
        when the corresponding option is not given.
    """
    cli = argparse.ArgumentParser()
    # Both options are optional strings; register them in one pass.
    for short_flag, long_flag in (('-m', '--message'), ('-u', '--url')):
        cli.add_argument(short_flag, long_flag, type=str)
    namespace = cli.parse_args()
    return namespace
|
||||||
|
|
||||||
|
|
||||||
|
def send_message_to_lark(message, webhook_url, timeout=10):
    """Post a plain-text message to a Lark webhook.

    Args:
        message: text placed in the ``content.text`` field of the payload.
        webhook_url: URL the JSON payload is POSTed to.
        timeout: seconds to wait for the server before giving up. Without
            one, ``requests`` may block indefinitely and stall the CI step.

    Raises:
        requests.HTTPError: if the webhook answers with a 4xx/5xx status,
            so that a broken notification channel does not go unnoticed.
    """
    data = {"msg_type": "text", "content": {"text": message}}
    response = requests.post(webhook_url, json=data, timeout=timeout)
    # Surface HTTP-level failures instead of silently dropping the alert.
    response.raise_for_status()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: forward the parsed CLI options to the sender.
    cli_args = parse_args()
    send_message_to_lark(message=cli_args.message, webhook_url=cli_args.url)
|
Loading…
Reference in New Issue