diff --git a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml b/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml new file mode 100644 index 000000000..2b7ec3125 --- /dev/null +++ b/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml @@ -0,0 +1,119 @@ +name: Test Example +on: + pull_request: + # So only the changes in examples folder will trigger jobs below. + paths: + - 'examples/**' + # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00 + schedule: + - cron: '0 16 * * 6' + +jobs: + # This is for changed example files detect and output a matrix containing all the corresponding directory name. + detect-changed-example: + if: | + github.event.pull_request.draft == false && + github.base_ref == 'main' && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + name: Check out all files + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 2 + - name: Get all changed example files + id: changed-files + uses: tj-actions/changed-files@v35 + # Using this can trigger action each time a PR is submitted. + with: + since_last_remote_commit: true + - name: setup matrix + id: set-matrix + run: | + changedFileName="" + for file in ${{ steps.changed-files.outputs.all_changed_files }}; do + changedFileName="${file}:${changedFileName}" + done + echo "$changedFileName was changed" + res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName` + echo "All changed files are $res" + loc=$( IFS=',' ; echo "${res[*]}" ) + echo "$loc" + echo "matrix={\"loc\":$(echo "$loc")}" >> "$GITHUB_OUTPUT" + + # If no file is changed, it will prompt an error and shows the matrix do not have value. + check-all-changed-files: + # Add this condition to avoid executing this job if the trigger event is workflow_dispatch. 
+ if: | + github.event.pull_request.draft == false && + github.base_ref == 'main' && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' + name: Test each changed example files + needs: detect-changed-example + runs-on: [self-hosted, gpu] + strategy: + matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 2 + - name: Install dependancies + run: | + pip install -r ./requirements/requirements.txt + pip install colossalai + - name: List all changed example files + run: | + res=${{ matrix.loc }} + cd "${PWD}/examples/${res}" + bash test_ci.sh + + # This is for all files' weekly check. Specifically, this job is to find all the directories. + matrix_preparation: + # Scheduled runs carry no pull_request context (draft and base_ref are empty), so gate on repository and event only. + if: | + github.repository == 'hpcaitech/ColossalAI' && + github.event_name == 'schedule' + name: Prepare Directory List for All files + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - name: 📚 Checkout + uses: actions/checkout@v3 + - name: setup matrix + id: set-matrix + run: | + res=`python .github/workflows/scripts/weekly_check_example.py` + all_loc=$( IFS=',' ; echo "${res[*]}" ) + echo "$all_loc" + echo "matrix={\"all_loc\":$(echo "$all_loc")}" >> "$GITHUB_OUTPUT" + + weekly_check: + # Scheduled runs carry no pull_request context (draft and base_ref are empty), so gate on repository and event only. + if: | + github.repository == 'hpcaitech/ColossalAI' && + github.event_name == 'schedule' + name: Weekly check all examples + needs: matrix_preparation + runs-on: [self-hosted, gpu] + strategy: + matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + steps: + - name: 📚 Checkout + uses: actions/checkout@v3 + - name: Install 
the requirements + run: | + pip install -r ./requirements/requirements.txt + pip install colossalai + - name: Traverse all files + run: | + dir=${{ matrix.all_loc }} + echo "${dir} is current directory" + cd "${PWD}/examples/${dir}" + bash test_ci.sh diff --git a/.github/workflows/scripts/changed_example.py b/.github/workflows/scripts/changed_example.py new file mode 100644 index 000000000..ac2f0864e --- /dev/null +++ b/.github/workflows/scripts/changed_example.py @@ -0,0 +1,19 @@ +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--fileNameList', type=str) + args = parser.parse_args() + name_list = args.fileNameList.split(":") + folder_need_check = set() + for loc in name_list: + # Find only the sub-folder of 'example' folder + if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4: + folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2]) + # Output the result using print. Then the shell can get the values. + print(list(folder_need_check)) + + +if __name__ == '__main__': + main() diff --git a/.github/workflows/scripts/input_check_example.py b/.github/workflows/scripts/input_check_example.py new file mode 100644 index 000000000..5602d8f09 --- /dev/null +++ b/.github/workflows/scripts/input_check_example.py @@ -0,0 +1,24 @@ +import argparse +import os + + +def detect_correct(loc_li): + for loc in loc_li: + # Each entry arrives wrapped in double quotes added by the workflow shell step; strip them instead of eval()-ing user-supplied input. + real_loc = 'examples/' + loc.strip('"') + if not os.path.exists(real_loc): + return -1 + return 1 + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--fileNameList', type=str) + args = parser.parse_args() + name_list = args.fileNameList.split(",") + result = detect_correct(name_list) + print(result) + + +if __name__ == '__main__': + main() diff --git a/.github/workflows/scripts/weekly_check_example.py b/.github/workflows/scripts/weekly_check_example.py new file mode 100644 index 000000000..dfedc4628 --- /dev/null +++ b/.github/workflows/scripts/weekly_check_example.py @@ -0,0 
+1,38 @@ +import os + + +def show_files(path, all_files): + # Traverse all the folder/file in current directory + file_list = os.listdir(path) + # Determine the element is folder or file. If file, pass it into list, if folder, recurse. + for file in file_list: + # Get the abs directory using os.path.join() and store into cur_path. + cur_path = os.path.join(path, file) + # Determine whether folder + if os.path.isdir(cur_path): + show_files(cur_path, all_files) + else: + all_files.append(cur_path) + return all_files + + +def join(input_list, sep=None): + return (sep or ' ').join(input_list) + + +def main(): + contents = show_files('examples/', []) + all_loc = [] + for file_loc in contents: + split_loc = file_loc.split('/') + # A file must sit at least two folder levels below examples (four path components), such as examples/images/vit/test_ci.sh; examples/images/README.md and examples/requirements.txt are skipped. + if len(split_loc) - split_loc.index('examples') >= 4: + tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)] + re_loc = join(tmp_loc, '/') + if re_loc not in all_loc: + all_loc.append(re_loc) + print(all_loc) + + +if __name__ == '__main__': + main() diff --git a/.github/workflows/workflow_dispatch_example.yml b/.github/workflows/workflow_dispatch_example.yml new file mode 100644 index 000000000..d9d576910 --- /dev/null +++ b/.github/workflows/workflow_dispatch_example.yml @@ -0,0 +1,67 @@ +name: Manual Test Example +on: + workflow_dispatch: + inputs: + example_directory: + type: string + description: example directory, separated by space. For example, language/gpt images/vit. Simply input language or simply gpt does not work. 
+ required: true + +jobs: + manual_check_matrix_preparation: + # workflow_dispatch runs carry no pull_request context (draft and base_ref are empty), so gate on repository and event only. + if: | + github.repository == 'hpcaitech/ColossalAI' && + github.event_name == 'workflow_dispatch' + name: Check the examples user want + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix-1.outputs.matrix }} + steps: + - name: 📚 Checkout + uses: actions/checkout@v3 + - name: Get manual directories + id: set-matrix-1 + env: + check_dir: ${{ inputs.example_directory }} + run: | + all_mannual_check_dir=() + for cdi in $check_dir + do + all_mannual_check_dir+=("\"${cdi}\"") + done + man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" ) + res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc` + echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist." + if [ "$res" == "-1" ];then + exit 1 + fi + man_loc="[${man_loc}]" + echo "$man_loc" + echo "matrix={\"man_loc\":$(echo "$man_loc")}" >> "$GITHUB_OUTPUT" + + manual_check: + # workflow_dispatch runs carry no pull_request context (draft and base_ref are empty), so gate on repository and event only. + if: | + github.repository == 'hpcaitech/ColossalAI' && + github.event_name == 'workflow_dispatch' + name: Manually check example files + needs: manual_check_matrix_preparation + runs-on: [self-hosted, gpu] + strategy: + matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + steps: + - name: 📚 Checkout + uses: actions/checkout@v3 + - name: Install the requirements + run: | + pip install -r ./requirements/requirements.txt + pip install colossalai + - name: Traverse all files + run: | + dir=${{ matrix.man_loc }} + echo "${dir} is current directory" + cd "${PWD}/examples/${dir}" + bash test_ci.sh diff --git a/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py b/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py index a72d97554..d6af7ad57 100644 --- a/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py 
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py @@ -5,10 +5,11 @@ from functools import reduce from typing import Dict, List, Optional, Union import torch +from torch.fx.node import Node + from colossalai.device.device_mesh import DeviceMesh from colossalai.tensor.shape_consistency import ShapeConsistencyManager from colossalai.tensor.sharding_spec import ShardingSpec -from torch.fx.node import Node from .constants import INFINITY_COST @@ -17,7 +18,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec: """ Generate the sharding spec of the tensor based on the given dim_partition_dict. - + Args: input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node. @@ -58,7 +59,7 @@ def generate_resharding_costs(nodes: List[Node], nodes (List[Node]): a list of nodes sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes. count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference. - dtype (Optional[torch.dtype]): the data type for cost calculation, default is None. + dtype (Optional[torch.dtype]): the data type for cost calculation, default is None. ''' # The resharding_cost of weight is counted due to sharing weight cases. resharding_costs = {} diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..53ab0896d --- /dev/null +++ b/examples/README.md @@ -0,0 +1,28 @@ +## Examples folder document + +## Table of Contents + + +## Example folder description + +This folder provides several examples using colossalai. The images folder includes model like diffusion, dreambooth and vit. The language folder includes gpt, opt, palm and roberta. The tutorial folder is for concept illustration, such as auto-parallel, hybrid-parallel and so on. 
+ + +## Integrate Your Example With System Testing + +If you contribute example code, follow these steps so that it meets expectations and is tested automatically by the GitHub workflows: + + +- (must) Have a test_ci.sh file in the folder, as shown below in 'File Structure Chart' +- The dataset should be located on the company's machine and exposed through an environment variable, so that no separate terminal command is needed. +- The model parameters should be small to allow fast testing. +- File Structure Chart + + └─examples + └─images + └─vit + └─requirements.txt + └─test_ci.sh diff --git a/examples/images/diffusion/README.md b/examples/images/diffusion/README.md index 2a522cd66..abb1d24c0 100644 --- a/examples/images/diffusion/README.md +++ b/examples/images/diffusion/README.md @@ -1,6 +1,8 @@ # ColoDiffusion: Stable Diffusion with Colossal-AI + Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) and [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion). +

@@ -11,14 +13,17 @@ Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1]

+ - [DreamBooth Fine-tuning](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): Personalize your model using just 3-5 images of the desired subject.

+ - [Inference](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): Reduce inference GPU memory consumption by 2.5x. + More details can be found in our [blog of Stable Diffusion v1](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper) and [blog of Stable Diffusion v2](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0). ## Installation diff --git a/examples/language/gpt/requirements.txt b/examples/language/gpt/requirements.txt index 86caf0dbc..e1f131468 100644 --- a/examples/language/gpt/requirements.txt +++ b/examples/language/gpt/requirements.txt @@ -1,3 +1 @@ -colossalai >= 0.1.12 -torch >= 1.8.1 -transformers >= 4.231 +transformers >= 4.23 diff --git a/examples/language/gpt/test_ci.sh b/examples/language/gpt/test_ci.sh new file mode 100644 index 000000000..ad0cfa325 --- /dev/null +++ b/examples/language/gpt/test_ci.sh @@ -0,0 +1,17 @@ +pip install -r requirements.txt + +# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"] +export DISTPAN="colossalai" + +# The following options only valid when DISTPAN="colossalai" +export TPDEGREE=2 +export GPUNUM=4 +export PLACEMENT='cpu' +export USE_SHARD_INIT=False +export BATCH_SIZE=8 +export MODEL_TYPE="gpt2_medium" + +# Without pipefail the pipeline below returns tee's exit status, so CI would pass even when torchrun fails. +set -o pipefail +mkdir -p logs +torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log