mirror of https://github.com/hpcaitech/ColossalAI
[workflow] refactored the example check workflow (#2411)
* [workflow] refactored the example check workflow * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code * polish code
pull/2413/head
parent
8de8de9fa3
commit
8327932d2c
|
@ -1,7 +1,7 @@
|
|||
name: Test Example
|
||||
on:
|
||||
pull_request:
|
||||
# So only the changes in examples folder will trigger jobs below.
|
||||
# any change in the examples folder will trigger check for the corresponding example.
|
||||
paths:
|
||||
- 'examples/**'
|
||||
# run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00
|
||||
|
@ -17,12 +17,14 @@ jobs:
|
|||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
name: Check out all files
|
||||
matrix: ${{ steps.setup-matrix.outputs.matrix }}
|
||||
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
|
||||
name: Detect changed example files
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
fetch-depth: 0
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
- name: Get all changed example files
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@v35
|
||||
|
@ -30,46 +32,53 @@ jobs:
|
|||
with:
|
||||
since_last_remote_commit: true
|
||||
- name: setup matrix
|
||||
id: set-matrix
|
||||
id: setup-matrix
|
||||
run: |
|
||||
changedFileName=""
|
||||
for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
|
||||
changedFileName="${file}:${changedFileName}"
|
||||
done
|
||||
echo "$changedFileName was changed"
|
||||
res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName`
|
||||
echo "All changed files are $res"
|
||||
loc=$( IFS=',' ; echo "${res[*]}" )
|
||||
echo "$loc"
|
||||
echo "::set-output name=matrix::{\"loc\":$(echo "$loc")}"
|
||||
res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName`
|
||||
echo "All changed examples are $res"
|
||||
|
||||
if [ "$res" = "[]" ]; then
|
||||
echo "anyChanged=false" >> $GITHUB_OUTPUT
|
||||
echo "matrix=null" >> $GITHUB_OUTPUT
|
||||
else
|
||||
dirs=$( IFS=',' ; echo "${res[*]}" )
|
||||
echo "anyChanged=true" >> $GITHUB_OUTPUT
|
||||
echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
# If no file is changed, it will prompt an error and shows the matrix do not have value.
|
||||
check-all-changed-files:
|
||||
check-changed-example:
|
||||
# Add this condition to avoid executing this job if the trigger event is workflow_dispatch.
|
||||
if: |
|
||||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request'
|
||||
name: Test each changed example files
|
||||
name: Test the changed example
|
||||
needs: detect-changed-example
|
||||
runs-on: [self-hosted, gpu]
|
||||
strategy:
|
||||
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/examples-data:/data/
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
with:
|
||||
fetch-depth: 2
|
||||
- name: Install dependancies
|
||||
- name: Install Colossal-AI
|
||||
run: |
|
||||
pip install -r ./requirements/requirements.txt
|
||||
pip install colossalai
|
||||
- name: List all changed example files
|
||||
pip install -v .
|
||||
- name: Test the example
|
||||
run: |
|
||||
res=${{ matrix.loc }}
|
||||
cd "${PWD}/examples/${res}"
|
||||
example_dir=${{ matrix.directory }}
|
||||
cd "${PWD}/examples/${example_dir}"
|
||||
bash test_ci.sh
|
||||
env:
|
||||
NCCL_SHM_DISABLE: 1
|
||||
|
||||
# This is for all files' weekly check. Specifically, this job is to find all the directories.
|
||||
matrix_preparation:
|
||||
|
@ -77,20 +86,20 @@ jobs:
|
|||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule'
|
||||
name: Prepare Directory List for All files
|
||||
name: Prepare matrix for weekly check
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
matrix: ${{ steps.setup-matrix.outputs.matrix }}
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: setup matrix
|
||||
id: set-matrix
|
||||
id: setup-matrix
|
||||
run: |
|
||||
res=`python .github/workflows/scripts/weekly_check_example.py`
|
||||
res=`python .github/workflows/scripts/example_checks/check_example_weekly.py`
|
||||
all_loc=$( IFS=',' ; echo "${res[*]}" )
|
||||
echo "$all_loc"
|
||||
echo "::set-output name=matrix::{\"all_loc\":$(echo "$all_loc")}"
|
||||
echo "Found the examples: $all_loc"
|
||||
echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT
|
||||
|
||||
weekly_check:
|
||||
if: |
|
||||
|
@ -104,16 +113,18 @@ jobs:
|
|||
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Install the requirements
|
||||
- name: Install Colossal-AI
|
||||
run: |
|
||||
pip install -r ./requirements/requirements.txt
|
||||
pip install colossalai
|
||||
pip install -v .
|
||||
- name: Traverse all files
|
||||
run: |
|
||||
dir=${{ matrix.all_loc }}
|
||||
echo "${dir} is current directory"
|
||||
cd "${PWD}/examples/${dir}"
|
||||
example_dir=${{ matrix.directory }}
|
||||
echo "Testing ${example_dir} now"
|
||||
cd "${PWD}/examples/${example_dir}"
|
||||
bash test_ci.sh
|
||||
env:
|
||||
NCCL_SHM_DISABLE: 1
|
|
@ -8,7 +8,7 @@ on:
|
|||
required: true
|
||||
|
||||
jobs:
|
||||
manual_check_matrix_preparation:
|
||||
matrix_preparation:
|
||||
if: |
|
||||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
|
@ -16,31 +16,24 @@ jobs:
|
|||
name: Check the examples user want
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
matrix: ${{ steps.set-matrix-1.outputs.matrix }}
|
||||
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Get manual directories
|
||||
id: set-matrix-1
|
||||
- name: Set up matrix
|
||||
id: set-matrix
|
||||
env:
|
||||
check_dir: ${{ inputs.example_directory }}
|
||||
run: |
|
||||
all_mannual_check_dir=()
|
||||
for cdi in $check_dir
|
||||
do
|
||||
all_mannual_check_dir+=("\"${cdi}\"")
|
||||
done
|
||||
man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" )
|
||||
res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc`
|
||||
echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist."
|
||||
if [ res == -1 ];then
|
||||
exit(1)
|
||||
res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir`
|
||||
if [ "$res" == "failure" ]; then
|
||||
exit -1
|
||||
fi
|
||||
man_loc="[${man_loc}]"
|
||||
echo "$man_loc"
|
||||
echo "::set-output name=matrix::{\"man_loc\":$(echo "$man_loc")}"
|
||||
dirs="[${check_dir}]"
|
||||
echo "Testing examples in $dirs"
|
||||
echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT
|
||||
|
||||
manual_check:
|
||||
test_example:
|
||||
if: |
|
||||
github.event.pull_request.draft == false &&
|
||||
github.base_ref == 'main' &&
|
||||
|
@ -52,16 +45,19 @@ jobs:
|
|||
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
|
||||
container:
|
||||
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
|
||||
options: --gpus all --rm -v /data/scratch/examples-data:/data/
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: 📚 Checkout
|
||||
uses: actions/checkout@v3
|
||||
- name: Install the requirements
|
||||
- name: Install Colossal-AI
|
||||
run: |
|
||||
pip install -r ./requirements/requirements.txt
|
||||
pip install colossalai
|
||||
- name: Traverse all files
|
||||
pip install -v .
|
||||
- name: Test the example
|
||||
run: |
|
||||
dir=${{ matrix.man_loc }}
|
||||
echo "${dir} is current directory"
|
||||
dir=${{ matrix.directory }}
|
||||
echo "Testing ${dir} now"
|
||||
cd "${PWD}/examples/${dir}"
|
||||
bash test_ci.sh
|
||||
env:
|
||||
NCCL_SHM_DISABLE: 1
|
|
@ -0,0 +1,27 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
|
||||
def check_inputs(input_list):
|
||||
for path in input_list:
|
||||
real_path = os.path.join('examples', path)
|
||||
if not os.path.exists(real_path):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-f', '--fileNameList', type=str, help="List of file names")
|
||||
args = parser.parse_args()
|
||||
name_list = args.fileNameList.split(",")
|
||||
is_correct = check_inputs(name_list)
|
||||
|
||||
if is_correct:
|
||||
print('success')
|
||||
else:
|
||||
print('failure')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -5,9 +5,9 @@ def show_files(path, all_files):
|
|||
# Traverse all the folder/file in current directory
|
||||
file_list = os.listdir(path)
|
||||
# Determine the element is folder or file. If file, pass it into list, if folder, recurse.
|
||||
for file in file_list:
|
||||
for file_name in file_list:
|
||||
# Get the abs directory using os.path.join() and store into cur_path.
|
||||
cur_path = os.path.join(path, file)
|
||||
cur_path = os.path.join(path, file_name)
|
||||
# Determine whether folder
|
||||
if os.path.isdir(cur_path):
|
||||
show_files(cur_path, all_files)
|
||||
|
@ -26,9 +26,8 @@ def main():
|
|||
for file_loc in contents:
|
||||
split_loc = file_loc.split('/')
|
||||
# must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not.
|
||||
if len(split_loc) - split_loc.index('examples') >= 3:
|
||||
tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)]
|
||||
re_loc = join(tmp_loc, '/')
|
||||
if len(split_loc) >= 4:
|
||||
re_loc = '/'.join(split_loc[1:3])
|
||||
if re_loc not in all_loc:
|
||||
all_loc.append(re_loc)
|
||||
print(all_loc)
|
|
@ -3,14 +3,19 @@ import argparse
|
|||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--fileNameList', type=str)
|
||||
parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files")
|
||||
args = parser.parse_args()
|
||||
name_list = args.fileNameList.split(":")
|
||||
folder_need_check = set()
|
||||
for loc in name_list:
|
||||
# Find only the sub-folder of 'example' folder
|
||||
# Find only the sub-sub-folder of 'example' folder
|
||||
# the examples folder structure is like
|
||||
# - examples
|
||||
# - area
|
||||
# - application
|
||||
# - file
|
||||
if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4:
|
||||
folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2])
|
||||
folder_need_check.add('/'.join(loc.split("/")[1:3]))
|
||||
# Output the result using print. Then the shell can get the values.
|
||||
print(list(folder_need_check))
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
import argparse
|
||||
import os
|
||||
|
||||
|
||||
def detect_correct(loc_li):
|
||||
for loc in loc_li:
|
||||
real_loc = 'examples/' + eval(loc)
|
||||
if not os.path.exists(real_loc):
|
||||
return -1
|
||||
return 1
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--fileNameList', type=str)
|
||||
args = parser.parse_args()
|
||||
name_list = args.fileNameList.split(",")
|
||||
result = detect_correct(name_list)
|
||||
print(result)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@ -6,8 +6,8 @@ from colossalai.amp import AMP_TYPE
|
|||
BATCH_SIZE = 256
|
||||
LEARNING_RATE = 3e-3
|
||||
WEIGHT_DECAY = 0.3
|
||||
NUM_EPOCHS = 10
|
||||
WARMUP_EPOCHS = 3
|
||||
NUM_EPOCHS = 2
|
||||
WARMUP_EPOCHS = 1
|
||||
|
||||
# model config
|
||||
IMG_SIZE = 224
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
colossalai >= 0.1.12
|
||||
torch >= 1.8.1
|
||||
titans
|
|
@ -0,0 +1,5 @@
|
|||
#!/bin/bash
|
||||
set -euxo pipefail
|
||||
|
||||
pip install -r requirements.txt
|
||||
torchrun --standalone --nproc_per_node 4 train.py --config config.py -s
|
|
@ -98,9 +98,9 @@ def main():
|
|||
root = os.environ.get('DATA', '../data')
|
||||
if args.synthetic:
|
||||
# if we use synthetic dataset
|
||||
# we train for 30 steps and eval for 10 steps per epoch
|
||||
train_dataloader = DummyDataloader(length=30, batch_size=gpc.config.BATCH_SIZE)
|
||||
test_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
|
||||
# we train for 10 steps and eval for 5 steps per epoch
|
||||
train_dataloader = DummyDataloader(length=10, batch_size=gpc.config.BATCH_SIZE)
|
||||
test_dataloader = DummyDataloader(length=5, batch_size=gpc.config.BATCH_SIZE)
|
||||
else:
|
||||
train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE, root, pad_if_needed=True)
|
||||
|
||||
|
|
Loading…
Reference in New Issue