mirror of https://github.com/hpcaitech/ColossalAI
[workflow]New version: Create workflow files for examples' auto check (#2298)
* [workflows]bug_repair
* [workflow]new_pr_fixing_bugs

Co-authored-by: binmakeswell <binmakeswell@gmail.com>

pull/2353/head
parent d7352bef2c
commit 7080a8edb0
@ -0,0 +1,119 @@
name: Test Example
on:
  pull_request:
    # Only changes in the examples folder will trigger the jobs below.
    paths:
      - 'examples/**'
  # Runs at 00:00 every Sunday (Singapore time), i.e. 16:00 UTC on Saturday.
  schedule:
    - cron: '0 16 * * 6'

jobs:
  # Detect changed example files and output a matrix containing all the corresponding directory names.
  detect-changed-example:
    if: |
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    name: Check out all files
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: Get all changed example files
        id: changed-files
        uses: tj-actions/changed-files@v35
        # With this option, the action is triggered each time the PR is updated.
        with:
          since_last_remote_commit: true
      - name: setup matrix
        id: set-matrix
        run: |
          changedFileName=""
          for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
            changedFileName="${file}:${changedFileName}"
          done
          echo "$changedFileName was changed"
          res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName`
          echo "All changed files are $res"
          loc=$( IFS=',' ; echo "${res[*]}" )
          echo "$loc"
          echo "::set-output name=matrix::{\"loc\":$(echo "$loc")}"

  # If no file is changed, it will report an error showing that the matrix has no value.
  check-all-changed-files:
    # This condition avoids executing the job when the trigger event is workflow_dispatch.
    if: |
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
    name: Test each changed example file
    needs: detect-changed-example
    runs-on: [self-hosted, gpu]
    strategy:
      matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
    container:
      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 2
      - name: Install dependencies
        run: |
          pip install -r ./requirements/requirements.txt
          pip install colossalai
      - name: List all changed example files
        run: |
          res=${{ matrix.loc }}
          cd "${PWD}/examples/${res}"
          bash test_ci.sh

  # Weekly check of all example files. Specifically, this job finds all the example directories.
  matrix_preparation:
    if: |
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule'
    name: Prepare Directory List for All Files
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: setup matrix
        id: set-matrix
        run: |
          res=`python .github/workflows/scripts/weekly_check_example.py`
          all_loc=$( IFS=',' ; echo "${res[*]}" )
          echo "$all_loc"
          echo "::set-output name=matrix::{\"all_loc\":$(echo "$all_loc")}"

  weekly_check:
    if: |
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule'
    name: Weekly check all examples
    needs: matrix_preparation
    runs-on: [self-hosted, gpu]
    strategy:
      matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
    container:
      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: Install the requirements
        run: |
          pip install -r ./requirements/requirements.txt
          pip install colossalai
      - name: Traverse all files
        run: |
          dir=${{ matrix.all_loc }}
          echo "${dir} is current directory"
          cd "${PWD}/examples/${dir}"
          bash test_ci.sh
@ -0,0 +1,19 @@
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fileNameList', type=str)
    args = parser.parse_args()
    name_list = args.fileNameList.split(":")
    folder_need_check = set()
    for loc in name_list:
        # Keep only paths inside a sub-folder of the 'examples' folder,
        # e.g. examples/images/vit/train.py -> images/vit.
        if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4:
            folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2])
    # Output the result using print so that the shell can capture the values.
    print(list(folder_need_check))


if __name__ == '__main__':
    main()
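To see concretely what the set-matrix step in the workflow above receives, the helper script can be run locally with a hand-built, colon-separated file list. The paths below are hypothetical; the printed list is simply what the script's print(list(...)) yields, and the entry order may vary.

# Hypothetical colon-separated list, mimicking what the workflow builds from the changed files.
python .github/workflows/scripts/changed_example.py \
    --fileNameList "examples/images/vit/train.py:examples/language/gpt/test_ci.sh:"
# Expected output: one entry per changed example directory, e.g.
# ['images/vit', 'language/gpt']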
@ -0,0 +1,23 @@
import argparse
import os


def detect_correct(loc_li):
    # Every entry must be an existing sub-folder of the examples folder.
    for loc in loc_li:
        # Strip the surrounding double quotes added by the workflow, e.g. "images/vit" -> images/vit.
        real_loc = 'examples/' + loc.strip('"')
        if not os.path.exists(real_loc):
            return -1
    return 1


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--fileNameList', type=str)
    args = parser.parse_args()
    name_list = args.fileNameList.split(",")
    result = detect_correct(name_list)
    print(result)


if __name__ == '__main__':
    main()
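The Manual Test Example workflow later in this commit passes this script a comma-separated list of double-quoted directories. A rough local check, with hypothetical directories, looks like this; the script prints 1 when every directory exists under examples/ and -1 when at least one does not.

# Hypothetical input in the same quoted, comma-separated form the workflow builds.
python .github/workflows/scripts/input_check_example.py --fileNameList '"images/vit","language/gpt"'
# Prints 1 if both examples/images/vit and examples/language/gpt exist, otherwise -1.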
@ -0,0 +1,38 @@
import os


def show_files(path, all_files):
    # Traverse every folder/file in the current directory.
    file_list = os.listdir(path)
    # For each element, recurse into folders and collect files into the list.
    for file in file_list:
        # Get the joined path using os.path.join() and store it in cur_path.
        cur_path = os.path.join(path, file)
        # Recurse if it is a folder, otherwise record the file.
        if os.path.isdir(cur_path):
            show_files(cur_path, all_files)
        else:
            all_files.append(cur_path)
    return all_files


def join(input_list, sep=None):
    return (sep or ' ').join(input_list)


def main():
    contents = show_files('examples/', [])
    all_loc = []
    for file_loc in contents:
        split_loc = file_loc.split('/')
        # A file must sit at least two sub-folder levels below the examples folder:
        # examples/images/vit/test_ci.sh is acceptable, examples/images/README.md is not,
        # examples/requirements.txt is not.
        if len(split_loc) - split_loc.index('examples') >= 4:
            tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)]
            re_loc = join(tmp_loc, '/')
            if re_loc not in all_loc:
                all_loc.append(re_loc)
    print(all_loc)


if __name__ == '__main__':
    main()
@ -0,0 +1,67 @@
name: Manual Test Example
on:
  workflow_dispatch:
    inputs:
      example_directory:
        type: string
        description: Example directories, separated by spaces, e.g. language/gpt images/vit. Entering only language or only gpt does not work.
        required: true

jobs:
  manual_check_matrix_preparation:
    if: |
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
    name: Check the examples the user wants
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix-1.outputs.matrix }}
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: Get manual directories
        id: set-matrix-1
        env:
          check_dir: ${{ inputs.example_directory }}
        run: |
          all_manual_check_dir=()
          for cdi in $check_dir
          do
            all_manual_check_dir+=("\"${cdi}\"")
          done
          man_loc=$( IFS=',' ; echo "${all_manual_check_dir[*]}" )
          res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc`
          echo "File existence check result: ${res} (1 means all directories exist, -1 means at least one does not)."
          if [ "$res" == "-1" ]; then
            exit 1
          fi
          man_loc="[${man_loc}]"
          echo "$man_loc"
          echo "::set-output name=matrix::{\"man_loc\":$(echo "$man_loc")}"

  manual_check:
    if: |
      github.event.pull_request.draft == false &&
      github.base_ref == 'main' &&
      github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
    name: Manually check example files
    needs: manual_check_matrix_preparation
    runs-on: [self-hosted, gpu]
    strategy:
      matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
    container:
      image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
    steps:
      - name: 📚 Checkout
        uses: actions/checkout@v3
      - name: Install the requirements
        run: |
          pip install -r ./requirements/requirements.txt
          pip install colossalai
      - name: Traverse all files
        run: |
          dir=${{ matrix.man_loc }}
          echo "${dir} is current directory"
          cd "${PWD}/examples/${dir}"
          bash test_ci.sh
@ -5,10 +5,11 @@ from functools import reduce
from typing import Dict, List, Optional, Union

import torch
from torch.fx.node import Node

from colossalai.device.device_mesh import DeviceMesh
from colossalai.tensor.shape_consistency import ShapeConsistencyManager
from colossalai.tensor.sharding_spec import ShardingSpec
from torch.fx.node import Node

from .constants import INFINITY_COST
@ -17,7 +18,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
                            dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
    """
    Generate the sharding spec of the tensor based on the given dim_partition_dict.


    Args:
        input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
@ -58,7 +59,7 @@ def generate_resharding_costs(nodes: List[Node],
        nodes (List[Node]): a list of nodes
        sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes.
        count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
    '''
    # The resharding_cost of weight is counted due to sharing weight cases.
    resharding_costs = {}
@ -0,0 +1,28 @@
## Examples folder document

## Table of Contents
<ul>
<li><a href="#Example-folder-description">Example folder description</a> </li>
<li><a href="#Integrate-Your-Example-With-System-Testing">Integrate Your Example With System Testing</a> </li>
</ul>

## Example folder description

This folder provides several examples that use Colossal-AI. The images folder includes models such as diffusion, dreambooth and vit. The language folder includes gpt, opt, palm and roberta. The tutorial folder illustrates concepts such as auto-parallel, hybrid-parallel and so on.

## Integrate Your Example With System Testing

If you contribute example code and want it tested automatically by the GitHub workflow, follow these steps:

- (must) Include a test_ci.sh file in the folder, as shown in the 'File Structure Chart' below (a minimal sketch of such a script follows the chart).
- The dataset should be located on the company's machine and exposed through an environment variable, so no separate terminal command is needed to fetch it.
- The model parameters should be small to allow fast testing.
- File Structure Chart

  └─examples
    └─images
      └─vit
        └─requirements.txt
        └─test_ci.sh
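For reference, a minimal test_ci.sh might look like the sketch below. It assumes the example ships its own requirements.txt and a training entry point named train.py with flags that allow a small, fast run; the file name and flags are placeholders rather than a required interface.

#!/bin/bash
set -euxo pipefail

# Install the example's own dependencies.
pip install -r requirements.txt

# Run a short training job with deliberately small settings so CI finishes quickly.
# train.py, --epochs and --batch_size are hypothetical; use your example's real entry point and flags.
torchrun --standalone --nproc_per_node=4 train.py --epochs 1 --batch_size 8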
@ -1,6 +1,8 @@
# ColoDiffusion: Stable Diffusion with Colossal-AI

Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion) and [Stable Diffusion v2](https://github.com/Stability-AI/stablediffusion).

<p id="diffusion_train" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20v2.png" width=800/>
</p>
@ -11,14 +13,17 @@ Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1]
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/DreamBooth.png" width=800/>
</p>

- [DreamBooth Fine-tuning](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): Personalize your model using just 3-5 images of the desired subject.

<p id="inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
</p>

- [Inference](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/diffusion): Reduce inference GPU memory consumption by 2.5x.

More details can be found in our [blog of Stable Diffusion v1](https://www.hpc-ai.tech/blog/diffusion-pretraining-and-hardware-fine-tuning-can-be-almost-7x-cheaper) and [blog of Stable Diffusion v2](https://www.hpc-ai.tech/blog/colossal-ai-0-2-0).

## Installation
@ -1,3 +1 @@
colossalai >= 0.1.12
torch >= 1.8.1
transformers >= 4.231
transformers >= 4.23
@ -0,0 +1,16 @@
pip install -r requirements.txt

# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
export DISTPAN="colossalai"

# The following options are only valid when DISTPAN="colossalai"
export TPDEGREE=2
export GPUNUM=4
export PLACEMENT='cpu'
export USE_SHARD_INIT=False
export BATCH_SIZE=8
export MODEL_TYPE="gpt2_medium"

mkdir -p logs
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log