mirror of https://github.com/hpcaitech/ColossalAI
[workflow] refactored the example check workflow (#2411)
* [workflow] refactored the example check workflow * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code
pull/2413/head
parent
8de8de9fa3
commit
8327932d2c
|
@ -1,7 +1,7 @@
|
|||
name: Test Example
|
||||
on:
|
||||
pull_request:
|
||||
# So only the changes in examples folder will trigger jobs below.
|
||||
# any change in the examples folder will trigger check for the corresponding example.
|
||||
paths:
|
||||
- 'examples/**'
|
||||
# run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
|
||||
|
@ -17,12 +17,14 @@ jobs:
|
|||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
name: Check out all files
|
||||
matrix: ${{ steps.setup-matrix.outputs.matrix }}
|
||||
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
|
||||
name: Detect changed example files
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
- name: Get all changed example files
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@v35
|
||||
|
@ -30,46 +32,53 @@ jobs:
|
|||
with:
|
||||
since_last_remote_commit: true
|
||||
- name: setup matrix
|
||||
id: set-matrix
|
||||
id: setup-matrix
|
||||
run: |
|
||||
changedFileName=""
|
||||
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
|
||||
changedFileName="${file}:${changedFileName}"
|
||||
done
|
||||
echo "$changedFileName was changed"
|
||||
res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName`
|
||||
echo "All changed files are $res"
|
||||
loc=$( IFS=',' ; echo "${res[*]}" )
|
||||
echo "$loc"
|
||||
echo "::set-output name=matrix::{\"loc\":$(echo "$loc")}"
|
||||
res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
|
||||
echo "All changed examples are $res"
|
||||
|
||||
if [ "$res" = "[]" ]; then
|
||||
echo "anyChanged=false" >> $GITHUB_OUTPUT
|
||||
echo "matrix=null" >> $GITHUB_OUTPUT
|
||||
else
|
||||
dirs=$( IFS=',' ; echo "${res[*]}" )
|
||||
echo "anyChanged=true" >> $GITHUB_OUTPUT
|
||||
echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
# If no file is changed, it will prompt an error and shows the matrix do not have value.
|
||||
check-all-changed-files:
|
||||
check-changed-example:
|
||||
# Add this condition to avoid executing this job if the trigger event is workflow_dispatch.
|
||||
if: |
|
||||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
|
||||
name: Test each changed example files
|
||||
name: Test the changed example
|
||||
needs: detect-changed-example
|
||||
runs-on: [self-hosted, gpu]
|
||||
strategy:
|
||||
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/examples-data:/data/
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: Install dependancies
|
||||
- name: Install Colossal-AI
|
||||
run: |
|
||||
pip install -r ./requirements/requirements.txt
|
||||
pip install colossalai
|
||||
- name: List all changed example files
|
||||
pip install -v .
|
||||
- name: Test the example
|
||||
run: |
|
||||
res=${{ matrix.loc }}
|
||||
cd "${PWD}/examples/${res}"
|
||||
example_dir=${{ matrix.directory }}
|
||||
cd "${PWD}/examples/${example_dir}"
|
||||
bash test_ci.sh
|
||||
env:
|
||||
NCCL_SHM_DISABLE: 1
|
||||
|
||||
# This is for all files' weekly check. Specifically, this job is to find all the directories.
|
||||
matrix_preparation:
|
||||
|
@ -77,20 +86,20 @@ jobs:
|
|||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule'
|
||||
name: Prepare Directory List for All files
|
||||
name: Prepare matrix for weekly check
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
matrix: ${{ steps.setup-matrix.outputs.matrix }}
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: setup matrix
|
||||
id: set-matrix
|
||||
id: setup-matrix
|
||||
run: |
|
||||
res=`python .github/workflows/scripts/weekly_check_example.py`
|
||||
res=`python .github/workflows/scripts/example_checks/check_example_weekly.py`
|
||||
all_loc=$( IFS=',' ; echo "${res[*]}" )
|
||||
echo "$all_loc"
|
||||
echo "::set-output name=matrix::{\"all_loc\":$(echo "$all_loc")}"
|
||||
echo "Found the examples: $all_loc"
|
||||
echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT
|
||||
|
||||
weekly_check:
|
||||
if: |
|
||||
|
@ -104,16 +113,18 @@ jobs:
|
|||
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Install the requirements
|
||||
- name: Install Colossal-AI
|
||||
run: |
|
||||
pip install -r ./requirements/requirements.txt
|
||||
pip install colossalai
|
||||
pip install -v .
|
||||
- name: Traverse all files
|
||||
run: |
|
||||
dir=${{ matrix.all_loc }}
|
||||
echo "${dir} is current directory"
|
||||
cd "${PWD}/examples/${dir}"
|
||||
example_dir=${{ matrix.directory }}
|
||||
echo "Testing ${example_dir} now"
|
||||
cd "${PWD}/examples/${example_dir}"
|
||||
bash test_ci.sh
|
||||
env:
|
||||
NCCL_SHM_DISABLE: 1
|
|
@ -8,7 +8,7 @@ on:
|
|||
required: true
|
||||
|
||||
jobs:
|
||||
manual_check_matrix_preparation:
|
||||
matrix_preparation:
|
||||
if: |
|
||||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
|
@ -16,31 +16,24 @@ jobs:
|
|||
name: Check the examples user want
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix-1.outputs.matrix }}
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Get manual directories
|
||||
id: set-matrix-1
|
||||
- name: Set up matrix
|
||||
id: set-matrix
|
||||
env:
|
||||
check_dir: ${{ inputs.example_directory }}
|
||||
run: |
|
||||
all_mannual_check_dir=()
|
||||
for cdi in $check_dir
|
||||
do
|
||||
all_mannual_check_dir+=("\"${cdi}\"")
|
||||
done
|
||||
man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" )
|
||||
res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc`
|
||||
echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist."
|
||||
if [ res == -1 ];then
|
||||
exit(1)
|
||||
res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir`
|
||||
if [ "$res" == "failure" ]; then
|
||||
exit -1
|
||||
fi
|
||||
man_loc="[${man_loc}]"
|
||||
echo "$man_loc"
|
||||
echo "::set-output name=matrix::{\"man_loc\":$(echo "$man_loc")}"
|
||||
dirs="[${check_dir}]"
|
||||
echo "Testing examples in $dirs"
|
||||
echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
|
||||
|
||||
manual_check:
|
||||
test_example:
|
||||
if: |
|
||||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
|
@ -52,16 +45,19 @@ jobs:
|
|||
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/examples-data:/data/
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Install the requirements
|
||||
- name: Install Colossal-AI
|
||||
run: |
|
||||
pip install -r ./requirements/requirements.txt
|
||||
pip install colossalai
|
||||
- name: Traverse all files
|
||||
pip install -v .
|
||||
- name: Test the example
|
||||
run: |
|
||||
dir=${{ matrix.man_loc }}
|
||||
echo "${dir} is current directory"
|
||||
dir=${{ matrix.directory }}
|
||||
echo "Testing ${dir} now"
|
||||
cd "${PWD}/examples/${dir}"
|
||||
bash test_ci.sh
|
||||
env:
|
||||
NCCL_SHM_DISABLE: 1
|
|
@ -0,0 +1,27 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
|
||||
def check_inputs(input_list):
|
||||
for path in input_list:
|
||||
real_path = os.path.join('examples', path)
|
||||
if not os.path.exists(real_path):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-f', '--fileNameList', type=str, help="List of file names")
|
||||
args = parser.parse_args()
|
||||
name_list = args.fileNameList.split(",")
|
||||
is_correct = check_inputs(name_list)
|
||||
|
||||
if is_correct:
|
||||
print('success')
|
||||
else:
|
||||
print('failure')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -5,9 +5,9 @@ def show_files(path, all_files):
|
|||
# Traverse all the folder/file in current directory
|
||||
file_list = os.listdir(path)
|
||||
# Determine the element is folder or file. If file, pass it into list, if folder, recurse.
|
||||
for file in file_list:
|
||||
for file_name in file_list:
|
||||
# Get the abs directory using os.path.join() and store into cur_path.
|
||||
cur_path = os.path.join(path, file)
|
||||
cur_path = os.path.join(path, file_name)
|
||||
# Determine whether folder
|
||||
if os.path.isdir(cur_path):
|
||||
show_files(cur_path, all_files)
|
||||
|
@ -26,9 +26,8 @@ def main():
|
|||
for file_loc in contents:
|
||||
split_loc = file_loc.split('/')
|
||||
# must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not.
|
||||
if len(split_loc) - split_loc.index('examples') >= 3:
|
||||
tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)]
|
||||
re_loc = join(tmp_loc, '/')
|
||||
if len(split_loc) >= 4:
|
||||
re_loc = '/'.join(split_loc[1:3])
|
||||
if re_loc not in all_loc:
|
||||
all_loc.append(re_loc)
|
||||
print(all_loc)
|
|
@ -3,14 +3,19 @@ import argparse
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--fileNameList', type=str)
|
||||
parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files")
|
||||
args = parser.parse_args()
|
||||
name_list = args.fileNameList.split(":")
|
||||
folder_need_check = set()
|
||||
for loc in name_list:
|
||||
# Find only the sub-folder of 'example' folder
|
||||
# Find only the sub-sub-folder of 'example' folder
|
||||
# the examples folder structure is like
|
||||
# - examples
|
||||
# - area
|
||||
# - application
|
||||
# - file
|
||||
if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4:
|
||||
folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2])
|
||||
folder_need_check.add('/'.join(loc.split("/")[1:3]))
|
||||
# Output the result using print. Then the shell can get the values.
|
||||
print(list(folder_need_check))
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
|
||||
def detect_correct(loc_li):
|
||||
for loc in loc_li:
|
||||
real_loc = 'examples/' + eval(loc)
|
||||
if not os.path.exists(real_loc):
|
||||
return -1
|
||||
return 1
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--fileNameList', type=str)
|
||||
args = parser.parse_args()
|
||||
name_list = args.fileNameList.split(",")
|
||||
result = detect_correct(name_list)
|
||||
print(result)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -6,8 +6,8 @@ from colossalai.amp import AMP_TYPE
|
|||
BATCH_SIZE = 256
|
||||
LEARNING_RATE = 3e-3
|
||||
WEIGHT_DECAY = 0.3
|
||||
NUM_EPOCHS = 10
|
||||
WARMUP_EPOCHS = 3
|
||||
NUM_EPOCHS = 2
|
||||
WARMUP_EPOCHS = 1
|
||||
|
||||
# model config
|
||||
IMG_SIZE = 224
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
colossalai >= 0.1.12
|
||||
torch >= 1.8.1
|
||||
titans
|
|
@ -0,0 +1,5 @@
|
|||
#!/bin/bash
|
||||
set -euxo pipefail
|
||||
|
||||
pip install -r requirements.txt
|
||||
torchrun --standalone --nproc_per_node 4 train.py --config config.py -s
|
|
@ -98,9 +98,9 @@ def main():
|
|||
root = os.environ.get('DATA', '../data')
|
||||
if args.synthetic:
|
||||
# if we use synthetic dataset
|
||||
# we train for 30 steps and eval for 10 steps per epoch
|
||||
train_dataloader = DummyDataloader(length=30, batch_size=gpc.config.BATCH_SIZE)
|
||||
test_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
|
||||
# we train for 10 steps and eval for 5 steps per epoch
|
||||
train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
|
||||
test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
|
||||
else:
|
||||
train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)
|
||||
|
||||
|
|
Loading…
Reference in New Issue