[workflow]New version: Create workflow files for examples' auto check (#2298)

* [workflows]bug_repair

* [workflow]new_pr_fixing_bugs

Co-authored-by: binmakeswell <binmakeswell@gmail.com>
pull/2353/head
ziyuhuang123 2023-01-06 09:26:49 +08:00 committed by GitHub
parent d7352bef2c
commit 7080a8edb0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 320 additions and 6 deletions

View File

@ -0,0 +1,119 @@
name: Test Example

on:
  pull_request:
    # Only changes inside the examples folder trigger the jobs below.
    paths:
      - 'examples/**'
  # Runs at 00:00 every Sunday, Singapore time, which is Saturday 16:00 UTC.
  schedule:
    - cron: '0 16 * * 6'

jobs:
  # Detects which example directories a pull request touched and publishes
  # them as a JSON matrix ({"loc": [...]}) for the test job below.
  detect-changed-example:
    if: |
      github.event_name == 'pull_request' &&
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    name: Check out all files
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: Get all changed example files
        id: changed-files
        uses: tj-actions/changed-files@v35
        # Using this can trigger action each time a PR is submitted.
        with:
          since_last_remote_commit: true
      - name: setup matrix
        id: set-matrix
        env:
          # Passed through the environment instead of being interpolated into
          # the script, so file names cannot inject shell code.
          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
        run: |
          changedFileName=""
          for file in $ALL_CHANGED_FILES; do
            changedFileName="${file}:${changedFileName}"
          done
          echo "$changedFileName was changed"
          # Quoted so that an empty list is still passed as one (empty) argument.
          res=$(python .github/workflows/scripts/changed_example.py --fileNameList "$changedFileName")
          echo "All changed files are $res"
          # The ::set-output workflow command is deprecated; write to GITHUB_OUTPUT.
          echo "matrix={\"loc\":${res}}" >> "$GITHUB_OUTPUT"

  # Runs test_ci.sh in every changed example directory.
  check-all-changed-files:
    # Only for pull requests (not schedule / workflow_dispatch), and skipped
    # when no example directory changed: an empty matrix would otherwise make
    # the job fail instead of being skipped.
    if: |
      github.event_name == 'pull_request' &&
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' &&
      needs.detect-changed-example.outputs.matrix != '{"loc":[]}'
    name: Test each changed example files
    needs: detect-changed-example
    runs-on: [self-hosted, gpu]
    strategy:
      matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
    container:
      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: Install dependancies
        run: |
          pip install -r ./requirements/requirements.txt
          pip install colossalai
      - name: List all changed example files
        env:
          EXAMPLE_DIR: ${{ matrix.loc }}
        run: |
          cd "${PWD}/examples/${EXAMPLE_DIR}"
          bash test_ci.sh

  # Weekly check of all examples: this job collects every example directory.
  matrix_preparation:
    # Scheduled runs carry no pull_request payload and an empty base_ref, so
    # the repository is identified through github.repository instead.
    if: |
      github.repository == 'hpcaitech/ColossalAI' &&
      github.event_name == 'schedule'
    name: Prepare Directory List for All files
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: setup matrix
        id: set-matrix
        run: |
          res=$(python .github/workflows/scripts/weekly_check_example.py)
          echo "$res"
          echo "matrix={\"all_loc\":${res}}" >> "$GITHUB_OUTPUT"

  # Weekly check of all examples: runs test_ci.sh in every collected directory.
  weekly_check:
    if: |
      github.repository == 'hpcaitech/ColossalAI' &&
      github.event_name == 'schedule'
    name: Weekly check all examples
    needs: matrix_preparation
    runs-on: [self-hosted, gpu]
    strategy:
      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
    container:
      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: Install the requirements
        run: |
          pip install -r ./requirements/requirements.txt
          pip install colossalai
      - name: Traverse all files
        env:
          EXAMPLE_DIR: ${{ matrix.all_loc }}
        run: |
          echo "${EXAMPLE_DIR} is current directory"
          cd "${PWD}/examples/${EXAMPLE_DIR}"
          bash test_ci.sh

View File

@ -0,0 +1,19 @@
import argparse
def main():
    """Print the example sub-folders touched by a list of changed files.

    Reads ``--fileNameList``, a colon-separated list of repository-relative
    file paths (empty entries, e.g. from a trailing colon, are ignored), and
    prints a JSON array of the distinct ``<group>/<example>`` folders that
    contain at least one of those files, e.g. ``["images/vit"]``.
    """
    # Local import: only this function needs it.
    import json

    parser = argparse.ArgumentParser()
    # Default to an empty list so a missing argument yields "[]" instead of
    # crashing on None.split().
    parser.add_argument('--fileNameList', type=str, default='')
    args = parser.parse_args()

    folder_need_check = set()
    for loc in args.fileNameList.split(":"):
        parts = loc.split("/")
        # Keep only files that live inside a second-level sub-folder of
        # 'examples', i.e. examples/<group>/<example>/<file...>.
        if len(parts) >= 4 and parts[0] == "examples":
            folder_need_check.add(parts[1] + "/" + parts[2])

    # Print strict JSON in sorted order so the shell receives a deterministic
    # value that is safe to embed in a JSON matrix definition.
    print(json.dumps(sorted(folder_need_check)))


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,23 @@
import argparse
import os
def detect_correct(loc_li):
    """Check that every requested example directory exists under ``examples/``.

    Args:
        loc_li: iterable of paths relative to ``examples/``. Each entry may be
            wrapped in literal single or double quotes (the calling workflow
            adds them); surrounding whitespace and quotes are stripped.

    Returns:
        1 if every entry names an existing path, -1 as soon as one entry is
        empty or does not exist.
    """
    for loc in loc_li:
        # The entries come from free-form workflow input, so the quotes are
        # removed textually rather than with eval(), which would execute
        # whatever expression the user typed.
        name = loc.strip().strip('"\'')
        if not name or not os.path.exists('examples/' + name):
            return -1
    return 1
def main():
    """Validate the directories given on the command line and print the verdict.

    ``--fileNameList`` is a comma-separated list of example directories; the
    value returned by ``detect_correct`` for that list is printed to stdout.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--fileNameList', type=str)
    cli_args = arg_parser.parse_args()
    requested = cli_args.fileNameList.split(",")
    print(detect_correct(requested))


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,38 @@
import os
def show_files(path, all_files):
    """Recursively collect the paths of all files below ``path``.

    Args:
        path: directory to traverse.
        all_files: list that the discovered file paths are appended to.

    Returns:
        The same ``all_files`` list, extended in place.
    """
    for entry in os.listdir(path):
        # Build the entry's path relative to the starting directory.
        entry_path = os.path.join(path, entry)
        if not os.path.isdir(entry_path):
            # Anything that is not a directory is recorded as a file.
            all_files.append(entry_path)
            continue
        # Directories are descended into, sharing the same accumulator.
        show_files(entry_path, all_files)
    return all_files
def join(input_list, sep=None):
    """Concatenate ``input_list`` with ``sep``; a falsy ``sep`` means one space."""
    separator = sep if sep else ' '
    return separator.join(input_list)
def main():
    """Print a JSON array of every ``<group>/<example>`` folder under ``examples/``.

    Every file found below ``examples/`` is mapped to the two path components
    that follow ``examples``; each distinct pair is reported once, in
    discovery order.
    """
    # Local import: only this function needs it.
    import json

    contents = show_files('examples/', [])
    all_loc = []
    seen = set()
    for file_loc in contents:
        split_loc = file_loc.split('/')
        root_idx = split_loc.index('examples')
        # A file must sit inside two sub-folder levels below 'examples', i.e.
        # examples/<group>/<example>/<file> has at least 4 components.
        # examples/images/vit/x.py is accepted; examples/images/README.md and
        # examples/requirements.txt are not. (The earlier ">= 3" test let
        # examples/images/README.md through as the "folder" images/README.md.)
        if len(split_loc) - root_idx >= 4:
            re_loc = join(split_loc[root_idx + 1:root_idx + 3], '/')
            if re_loc not in seen:
                seen.add(re_loc)
                all_loc.append(re_loc)
    # Strict JSON so the value can be embedded in a JSON matrix definition.
    print(json.dumps(all_loc))


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,67 @@
name: Manual Test Example

on:
  workflow_dispatch:
    inputs:
      example_directory:
        type: string
        description: example directory, separated by space. For example, language/gpt, images/vit. Simply input language or simply gpt does not work.
        required: true

jobs:
  # Validates the requested directories and turns them into a JSON matrix
  # ({"man_loc": [...]}) for the test job below.
  manual_check_matrix_preparation:
    # workflow_dispatch runs carry no pull_request payload and an empty
    # base_ref, so the repository is identified through github.repository.
    if: github.repository == 'hpcaitech/ColossalAI'
    name: Check the examples user want
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix-1.outputs.matrix }}
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: Get manual directories
        id: set-matrix-1
        env:
          check_dir: ${{ inputs.example_directory }}
        run: |
          all_mannual_check_dir=()
          # Accept commas as well as spaces between directories, because the
          # input description's own example uses ", ".
          for cdi in ${check_dir//,/ }
          do
            all_mannual_check_dir+=("\"${cdi}\"")
          done
          man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" )
          res=$(python .github/workflows/scripts/input_check_example.py --fileNameList "$man_loc")
          echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist."
          # Compare the variable's value (not the literal word "res") and use
          # the shell's "exit 1" form.
          if [ "$res" == "-1" ]; then
            exit 1
          fi
          man_loc="[${man_loc}]"
          echo "$man_loc"
          # The ::set-output workflow command is deprecated; write to GITHUB_OUTPUT.
          echo "matrix={\"man_loc\":${man_loc}}" >> "$GITHUB_OUTPUT"

  # Runs test_ci.sh in every requested example directory.
  manual_check:
    if: github.repository == 'hpcaitech/ColossalAI'
    name: Manually check example files
    needs: manual_check_matrix_preparation
    runs-on: [self-hosted, gpu]
    strategy:
      matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
    container:
      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: Install the requirements
        run: |
          pip install -r ./requirements/requirements.txt
          pip install colossalai
      - name: Traverse all files
        env:
          # Passed through the environment instead of being interpolated into
          # the script, so the user-supplied value cannot inject shell code.
          EXAMPLE_DIR: ${{ matrix.man_loc }}
        run: |
          echo "${EXAMPLE_DIR} is current directory"
          cd "${PWD}/examples/${EXAMPLE_DIR}"
          bash test_ci.sh

View File

@ -5,10 +5,11 @@ from functools import reduce
from typing import Dict, List, Optional, Union
import torch
from torch.fx.node import Node
from colossalai.device.device_mesh import DeviceMesh
from colossalai.tensor.shape_consistency import ShapeConsistencyManager
from colossalai.tensor.sharding_spec import ShardingSpec
from torch.fx.node import Node
from .constants import INFINITY_COST
@ -17,7 +18,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
"""
Generate the sharding spec of the tensor based on the given dim_partition_dict.
Args:
input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
@ -58,7 +59,7 @@ def generate_resharding_costs(nodes: List[Node],
nodes (List[Node]): a list of nodes
sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes.
count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
'''
# The resharding_cost of weight is counted due to sharing weight cases.
resharding_costs = {}

28
examples/README.md Normal file
View File

@ -0,0 +1,28 @@
## Examples folder document
## Table of Contents
<ul>
<li><a href="#Example-folder-description">Example folder description</a> </li>
<li><a href="#Integrate-Your-Example-With-System-Testing">Integrate Your Example With System Testing</a> </li>
</ul>
## Example folder description
This folder provides several examples of using Colossal-AI. The images folder includes models such as diffusion, dreambooth and vit. The language folder includes gpt, opt, palm and roberta. The tutorial folder illustrates concepts such as auto-parallel and hybrid-parallel.
## Integrate Your Example With System Testing
If you contribute example code, follow these steps so that the GitHub workflows can test it automatically:
- (must) Have a test_ci.sh file in the folder like shown below in 'File Structure Chart'
- The dataset should be located on the company's machine and its location should be provided through an environment variable, so that no separate terminal command is needed.
- The model parameters should be small to allow fast testing.
- File Structure Chart
└─examples
└─images
└─vit
└─requirements.txt
└─test_ci.sh

View File

@ -1,6 +1,8 @@
# ColoDiffusion: Stable Diffusion with Colossal-AI
Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) and [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion).
<p id="diffusion_train" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20v2.png" width=800/>
</p>
@ -11,14 +13,17 @@ Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1]
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/DreamBooth.png" width=800/>
</p>
- [DreamBooth Fine-tuning](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): Personalize your model using just 3-5 images of the desired subject.
<p id="inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
</p>
- [Inference](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): Reduce inference GPU memory consumption by 2.5x.
More details can be found in our [blog of Stable Diffusion v1](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper) and [blog of Stable Diffusion v2](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0).
## Installation

View File

@ -1,3 +1 @@
colossalai >= 0.1.12
torch >= 1.8.1
transformers >= 4.231
transformers >= 4.23

View File

@ -0,0 +1,16 @@
# CI entry point for this example: installs the requirements and runs a short
# training job with the settings below.

# Stop on the first failing command, on unset variables, and when any stage of
# a pipeline fails. Without pipefail the exit status of "torchrun ... | tee"
# is tee's, so a crashed training run would still be reported as success.
set -euo pipefail

pip install -r requirements.txt

# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
export DISTPAN="colossalai"

# The following options only valid when DISTPAN="colossalai"
export TPDEGREE=2
export GPUNUM=4
export PLACEMENT='cpu'
export USE_SHARD_INIT=False
export BATCH_SIZE=8
export MODEL_TYPE="gpt2_medium"

mkdir -p logs
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log