diff --git a/.bdist.json b/.bdist.json
new file mode 100644
index 000000000..8693bca48
--- /dev/null
+++ b/.bdist.json
@@ -0,0 +1,24 @@
+{
+  "build": [
+    {
+      "torch_version": "1.11.0",
+      "cuda_image": "hpcaitech/cuda-conda:10.2"
+    },
+    {
+      "torch_version": "1.11.0",
+      "cuda_image": "hpcaitech/cuda-conda:11.3"
+    },
+    {
+      "torch_version": "1.12.1",
+      "cuda_image": "hpcaitech/cuda-conda:10.2"
+    },
+    {
+      "torch_version": "1.12.1",
+      "cuda_image": "hpcaitech/cuda-conda:11.3"
+    },
+    {
+      "torch_version": "1.12.1",
+      "cuda_image": "hpcaitech/cuda-conda:11.6"
+    }
+  ]
+}
diff --git a/.compatibility b/.compatibility
new file mode 100644
index 000000000..c8ac4083d
--- /dev/null
+++ b/.compatibility
@@ -0,0 +1,3 @@
+1.12.0-11.3.0
+1.11.0-11.3.0
+1.10.1-11.3.0
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
new file mode 100644
index 000000000..cda6a3139
--- /dev/null
+++ b/.github/workflows/README.md
@@ -0,0 +1,149 @@
+# CI/CD
+
+## Table of Contents
+
+- [CI/CD](#cicd)
+  - [Table of Contents](#table-of-contents)
+  - [Overview](#overview)
+  - [Workflows](#workflows)
+    - [Checks on Pull Requests](#checks-on-pull-requests)
+    - [Regular Checks](#regular-checks)
+    - [Release](#release)
+    - [Manual Dispatch](#manual-dispatch)
+      - [Release bdist wheel](#release-bdist-wheel)
+      - [Dispatch Example Test](#dispatch-example-test)
+      - [Compatibility Test](#compatibility-test)
+    - [User Friendliness](#user-friendliness)
+  - [Configuration](#configuration)
+  - [Progress Log](#progress-log)
+
+## Overview
+
+Automation makes our development more efficient, as the machine automatically runs the pre-defined tasks for contributors.
+This saves a lot of manual work and allows developers to fully focus on features and bug fixes.
+In Colossal-AI, we use [GitHub Actions](https://github.com/features/actions) to automate a wide range of workflows to ensure the robustness of the software.
+In the sections below, we dive into the details of the different workflows available.
+
+## Workflows
+
+### Checks on Pull Requests
+
+| Workflow Name | File name | Description |
+| --------------------------- | ------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Build` | `build.yml` | This workflow is triggered when the label `Run build and Test` is assigned to a PR. It will run all the unit tests in the repository with 4 GPUs. |
+| `Pre-commit` | `pre_commit.yml` | This workflow runs pre-commit checks for code style consistency. |
+| `Report pre-commit failure` | `report_precommit_failure.yml` | This workflow posts a comment on the PR to explain the pre-commit failure and the remedy. It is executed when the `Pre-commit` workflow completes. |
+| `Report test coverage` | `report_test_coverage.yml` | This workflow posts a comment on the PR to report the test coverage results. It is executed when the `Build` workflow completes. |
+| `Test example` | `auto_example_check.yml` | The example will be automatically tested if its files are changed in the PR. |
+
+### Regular Checks
+
+| Workflow Name | File name | Description |
+| ----------------------- | ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Test example` | `auto_example_check.yml` | This workflow will test all examples every Sunday. |
+| `Compatibility Test` | `auto_compatibility_test.yml` | This workflow will check the compatibility of Colossal-AI against PyTorch and CUDA every Sunday. The PyTorch and CUDA versions are specified in `.compatibility`. |
+| `Build on 8 GPUs` | `build_gpu_8.yml` | This workflow will run the unit tests every day with 8 GPUs. |
+| `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. |
+| `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which have been stale for 14 days. |
+
+### Release
+
+| Workflow Name | File name | Description |
+| --------------------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `Draft GitHub Release Post` | `draft_github_release_post.yml` | Compose a GitHub release post draft based on the commit history. Triggered when a change to `version.txt` is merged. |
+| `Release to PyPI` | `release_pypi.yml` | Build and release the wheel to PyPI. Triggered when a change to `version.txt` is merged. |
+| `Release Nightly to PyPI` | `release_nightly.yml` | Build and release the nightly wheel to PyPI as `colossalai-nightly`. Automatically executed every Sunday. |
+| `Release Docker` | `release_docker.yml` | Build and release the Docker image to DockerHub. Triggered when a change to `version.txt` is merged. |
+| `Release bdist wheel` | `release_bdist.yml` | Build binary wheels with pre-built PyTorch extensions. Manually dispatched. See more details in the next section. |
+| `Auto Release bdist wheel` | `auto_release_bdist.yml` | Build binary wheels with pre-built PyTorch extensions. Triggered when a change to `version.txt` is merged. Build specifications are stored in `.bdist.json`. |
+| `Auto Compatibility Test` | `auto_compatibility_test.yml` | Check Colossal-AI's compatibility against the PyTorch and CUDA versions specified in `.compatibility`. Triggered when `version.txt` is changed in a PR. |
+
+### Manual Dispatch
+
+| Workflow Name | File name | Description |
+| ----------------------------- | --------------------------------- | ------------------------------------------------------ |
+| `Release bdist wheel` | `release_bdist.yml` | Build binary wheels with pre-built PyTorch extensions. |
+| `Dispatch Example Test` | `dispatch_example_check.yml` | Manually test a specified example. |
+| `Dispatch Compatibility Test` | `dispatch_compatibility_test.yml` | Test PyTorch and CUDA compatibility. |
+
+Refer to this [documentation](https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow) on how to manually trigger a workflow.
+I will provide the details of each workflow below.
+
+#### Release bdist wheel
+
+Parameters:
+- `torch version`: torch versions to test against; multiple versions are supported but must be separated by commas.
The default value is `all`, which will test against all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels), which is regularly updated.
+- `cuda version`: CUDA versions to test against; multiple versions are supported but must be separated by commas. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
+- `ref`: the branch or tag name to build the wheel from.
+
+#### Dispatch Example Test
+
+Parameters:
+- `example_directory`: the example directory to test. Multiple directories are supported and must be separated by commas, e.g. `language/gpt, images/vit`. Inputting only `language` or only `gpt` will not work.
+
+
+#### Compatibility Test
+
+Parameters:
+- `torch version`: torch versions to test against; multiple versions are supported but must be separated by commas. The default value is `all`, which will test against all available torch versions listed in this [repository](https://github.com/hpcaitech/public_assets/tree/main/colossalai/torch_build/torch_wheels).
+- `cuda version`: CUDA versions to test against; multiple versions are supported but must be separated by commas. The CUDA versions must be present in our [DockerHub repository](https://hub.docker.com/r/hpcaitech/cuda-conda).
+
+> This workflow only tests the compatibility of the main branch.
+
+
+### User Friendliness
+
+| Workflow Name | File name | Description |
+| ----------------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `issue-translate` | `translate_comment.yml` | This workflow is triggered when a new issue comment is created. The comment will be translated into English if it is not written in English. |
+
+
+## Configuration
+
+This section lists the files used to configure the workflows above.
+
+1. `.compatibility`
+
+The `.compatibility` file tells GitHub Actions which PyTorch and CUDA versions to test against. Each line in the file is in the format `${torch-version}-${cuda-version}`, which is the tag of a Docker image. Thus, this tag must be present in the [docker registry](https://hub.docker.com/r/pytorch/conda-cuda) so that the test can be performed.
+
+2. `.bdist.json`
+
+This file controls which PyTorch/CUDA-compatible pre-built releases will be built and published. You can add a new entry according to the JSON schema below if a new wheel needs to be built with AOT compilation of the PyTorch extensions.
+ +```json +{ + "build": [ + { + "torch_version": "", + "cuda_image": "" + }, + ] +} +``` + +## Progress Log + +- [x] unit testing + - [x] test on PR + - [x] report test coverage + - [x] regular test +- [x] release + - [x] official release + - [x] nightly build + - [x] binary build + - [x] docker build + - [x] draft release post +- [x] pre-commit + - [x] check on PR + - [x] report failure +- [x] example check + - [x] check on PR + - [x] regular check + - [x] manual dispatch +- [x] compatiblity check + - [x] manual dispatch + - [x] auto test when release +- [x] helpers + - [x] comment translation + - [x] submodule update + - [x] close inactive issue diff --git a/.github/workflows/auto_compatibility_test.yml b/.github/workflows/auto_compatibility_test.yml new file mode 100644 index 000000000..4b026c63e --- /dev/null +++ b/.github/workflows/auto_compatibility_test.yml @@ -0,0 +1,74 @@ +name: Compatibility Test + +on: + pull_request: + paths: + - 'version.txt' + - '.compatibility' + # run at 03:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00 + schedule: + - cron: '0 19 * * 6' + +jobs: + matrix_preparation: + name: Prepare Container List + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v3 + - id: set-matrix + run: | + IFS=',' + DOCKER_IMAGE=() + + while read tag; do + DOCKER_IMAGE+=("\"hpcaitech/pytorch-cuda:${tag}\"") + done <.compatibility + + container=$( IFS=',' ; echo "${DOCKER_IMAGE[*]}" ) + container="[${container}]" + echo "$container" + echo "::set-output name=matrix::{\"container\":$(echo "$container")}" + + build: + name: Test for PyTorch Compatibility + needs: matrix_preparation + if: github.repository == 'hpcaitech/ColossalAI' + runs-on: [self-hosted, gpu] + strategy: + fail-fast: false + matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} + container: + image: ${{ matrix.container }} + options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 + timeout-minutes: 120 + steps: + - name: Install dependencies + run: | + pip install -U pip setuptools wheel --user + - uses: actions/checkout@v2 + with: + repository: hpcaitech/TensorNVMe + ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + path: TensorNVMe + - name: Install tensornvme + run: | + cd TensorNVMe + conda install cmake + pip install -r requirements.txt + pip install -v . + - uses: actions/checkout@v2 + with: + ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} + - name: Install Colossal-AI + run: | + pip install -v --no-cache-dir . + pip install -r requirements/requirements-test.txt + - name: Unit Testing + run: | + PYTHONPATH=$PWD pytest tests + env: + DATA: /data/scratch/cifar-10 + NCCL_SHM_DISABLE: 1 + LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 diff --git a/.github/workflows/auto_example_check.yml b/.github/workflows/auto_example_check.yml new file mode 100644 index 000000000..df413f646 --- /dev/null +++ b/.github/workflows/auto_example_check.yml @@ -0,0 +1,143 @@ +name: Test Example +on: + pull_request: + # any change in the examples folder will trigger check for the corresponding example. + paths: + - 'examples/**' + # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00 + schedule: + - cron: '0 16 * * 6' + +jobs: + # This is for changed example files detect and output a matrix containing all the corresponding directory name. 
+ detect-changed-example: + if: | + github.event.pull_request.draft == false && + github.base_ref == 'main' && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.setup-matrix.outputs.matrix }} + anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }} + name: Detect changed example files + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} + + - name: Locate base commit + id: locate-base-sha + run: | + curBranch=$(git rev-parse --abbrev-ref HEAD) + commonCommit=$(git merge-base origin/main $curBranch) + echo $commonCommit + echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT + + - name: Get all changed example files + id: changed-files + uses: tj-actions/changed-files@v35 + with: + base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }} + + - name: setup matrix + id: setup-matrix + run: | + changedFileName="" + for file in ${{ steps.changed-files.outputs.all_changed_files }}; do + changedFileName="${file}:${changedFileName}" + done + echo "$changedFileName was changed" + res=`python .github/workflows/scripts/example_checks/detect_changed_example.py --fileNameList $changedFileName` + echo "All changed examples are $res" + + if [ "$res" = "[]" ]; then + echo "anyChanged=false" >> $GITHUB_OUTPUT + echo "matrix=null" >> $GITHUB_OUTPUT + else + dirs=$( IFS=',' ; echo "${res[*]}" ) + echo "anyChanged=true" >> $GITHUB_OUTPUT + echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT + fi + + # If no file is changed, it will prompt an error and shows the matrix do not have value. + check-changed-example: + # Add this condition to avoid executing this job if the trigger event is workflow_dispatch. + if: | + github.event.pull_request.draft == false && + github.base_ref == 'main' && + github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' && + needs.detect-changed-example.outputs.anyChanged == 'true' + name: Test the changed example + needs: detect-changed-example + runs-on: [self-hosted, gpu] + strategy: + matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ + timeout-minutes: 10 + steps: + - uses: actions/checkout@v3 + + - name: Install Colossal-AI + run: | + pip install -v . + + - name: Test the example + run: | + example_dir=${{ matrix.directory }} + cd "${PWD}/examples/${example_dir}" + bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 + + # This is for all files' weekly check. Specifically, this job is to find all the directories. 
+ matrix_preparation: + if: | + github.repository == 'hpcaitech/ColossalAI' && + github.event_name == 'schedule' + name: Prepare matrix for weekly check + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.setup-matrix.outputs.matrix }} + steps: + - name: 📚 Checkout + uses: actions/checkout@v3 + + - name: setup matrix + id: setup-matrix + run: | + res=`python .github/workflows/scripts/example_checks/check_example_weekly.py` + all_loc=$( IFS=',' ; echo "${res[*]}" ) + echo "Found the examples: $all_loc" + echo "matrix={\"directory\":$(echo "$all_loc")}" >> $GITHUB_OUTPUT + + weekly_check: + if: | + github.repository == 'hpcaitech/ColossalAI' && + github.event_name == 'schedule' + name: Weekly check all examples + needs: matrix_preparation + runs-on: [self-hosted, gpu] + strategy: + matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} + container: + image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + timeout-minutes: 10 + steps: + - name: 📚 Checkout + uses: actions/checkout@v3 + + - name: Install Colossal-AI + run: | + pip install -v . + + - name: Traverse all files + run: | + example_dir=${{ matrix.diretory }} + echo "Testing ${example_dir} now" + cd "${PWD}/examples/${example_dir}" + bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/auto_release_bdist.yml b/.github/workflows/auto_release_bdist.yml new file mode 100644 index 000000000..56a3036f8 --- /dev/null +++ b/.github/workflows/auto_release_bdist.yml @@ -0,0 +1,70 @@ +name: Auto Release bdist wheel + +on: + workflow_dispatch: + pull_request: + paths: + - 'version.txt' + types: + - closed + +jobs: + matrix_preparation: + name: Prepare Container List + if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - uses: actions/checkout@v3 + - id: set-matrix + run: | + bdist=$(cat .bdist.json | tr '\n' ' ') + echo "matrix=${bdist}" >> $GITHUB_OUTPUT + + build: + name: Release bdist wheels + needs: matrix_preparation + runs-on: [self-hosted, gpu] + strategy: + fail-fast: false + matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} + container: + image: ${{ matrix.build.cuda_image }} + options: --gpus all --rm + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + # cub is for cuda 10.2 + - name: Copy scripts + run: | + cp -r ./.github/workflows/scripts/* ./ + + # link the cache diretories to current path + ln -s /github/home/conda_pkgs ./conda_pkgs + ln -s /github/home/pip_wheels ./pip_wheels + + # set the conda package path + echo "pkgs_dirs:\n - $PWD/conda_pkgs" > ~/.condarc + + # set safe directory + git config --global --add safe.directory /__w/ColossalAI/ColossalAI + + # get cub package for cuda 10.2 + wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip + unzip 1.8.0.zip + - name: Build bdist wheel + run: | + pip install beautifulsoup4 requests packaging + python ./build_colossalai_wheel.py --torch_version $TORCH_VERSIONS + env: + TORCH_VERSIONS: ${{ matrix.build.torch_version }} + - name: 🚀 Deploy + uses: garygrossgarten/github-action-scp@release + with: + local: all_dist + remote: ${{ secrets.PRIVATE_PYPI_DIR }} + host: ${{ secrets.PRIVATE_PYPI_HOST }} + username: ${{ secrets.PRIVATE_PYPI_USER }} + password: ${{ secrets.PRIVATE_PYPI_PASSWD }} diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5366f69cc..8f334d599 100644 --- a/.github/workflows/build.yml +++ 
b/.github/workflows/build.yml @@ -20,15 +20,26 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} + + - name: Locate base commit + id: locate-base-sha + run: | + curBranch=$(git rev-parse --abbrev-ref HEAD) + commonCommit=$(git merge-base origin/main $curBranch) + echo $commonCommit + echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT + - name: Find the changed files id: find-changed-files uses: tj-actions/changed-files@v35 with: - since_last_remote_commit: true + base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }} files: | op_builder/** colossalai/kernel/** setup.py + - name: List changed files run: | for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do @@ -75,12 +86,26 @@ jobs: - name: Unit Testing run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --cov=. --cov-report xml tests env: DATA: /data/scratch/cifar-10 NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 + - name: Collate artifact + env: + PR_NUMBER: ${{ github.event.number }} + run: | + mkdir report + echo $PR_NUMBER > ./report/pr_number + mv coverage.xml ./report + + - name: Upload test coverage artifact + uses: actions/upload-artifact@v3 + with: + name: report + path: report/ + - name: Store Cache run: | # -p flag is required to preserve the file timestamp to avoid ninja rebuild diff --git a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml b/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml deleted file mode 100644 index 2b7ec3125..000000000 --- a/.github/workflows/changed_file_trigger_examples_check_and_weekly_check.yml +++ /dev/null @@ -1,119 +0,0 @@ -name: Test Example -on: - pull_request: - # So only the changes in examples folder will trigger jobs below. - paths: - - 'examples/**' - # run at 00:00 of every Sunday(singapore time) so here is UTC time Saturday 16:00 - schedule: - - cron: '0 16 * * 6' - -jobs: - # This is for changed example files detect and output a matrix containing all the corresponding directory name. - detect-changed-example: - if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - name: Check out all files - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - name: Get all changed example files - id: changed-files - uses: tj-actions/changed-files@v35 - # Using this can trigger action each time a PR is submitted. - with: - since_last_remote_commit: true - - name: setup matrix - id: set-matrix - run: | - changedFileName="" - for file in ${{ steps.changed-files.outputs.all_changed_files }}; do - changedFileName="${file}:${changedFileName}" - done - echo "$changedFileName was changed" - res=`python .github/workflows/scripts/changed_example.py --fileNameList $changedFileName` - echo "All changed files are $res" - loc=$( IFS=',' ; echo "${res[*]}" ) - echo "$loc" - echo "::set-output name=matrix::{\"loc\":$(echo "$loc")}" - - # If no file is changed, it will prompt an error and shows the matrix do not have value. - check-all-changed-files: - # Add this condition to avoid executing this job if the trigger event is workflow_dispatch. 
- if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'pull_request' - name: Test each changed example files - needs: detect-changed-example - runs-on: [self-hosted, gpu] - strategy: - matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} - container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - name: Install dependancies - run: | - pip install -r ./requirements/requirements.txt - pip install colossalai - - name: List all changed example files - run: | - res=${{ matrix.loc }} - cd "${PWD}/examples/${res}" - bash test_ci.sh - - # This is for all files' weekly check. Specifically, this job is to find all the directories. - matrix_preparation: - if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule' - name: Prepare Directory List for All files - runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} - steps: - - name: 📚 Checkout - uses: actions/checkout@v3 - - name: setup matrix - id: set-matrix - run: | - res=`python .github/workflows/scripts/weekly_check_example.py` - all_loc=$( IFS=',' ; echo "${res[*]}" ) - echo "$all_loc" - echo "::set-output name=matrix::{\"all_loc\":$(echo "$all_loc")}" - - weekly_check: - if: | - github.event.pull_request.draft == false && - github.base_ref == 'main' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' && github.event_name == 'schedule' - name: Weekly check all examples - needs: matrix_preparation - runs-on: [self-hosted, gpu] - strategy: - matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} - container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - steps: - - name: 📚 Checkout - uses: actions/checkout@v3 - - name: Install the requirements - run: | - pip install -r ./requirements/requirements.txt - pip install colossalai - - name: Traverse all files - run: | - dir=${{ matrix.all_loc }} - echo "${dir} is current directory" - cd "${PWD}/examples/${dir}" - bash test_ci.sh diff --git a/.github/workflows/compatibility_test.yml b/.github/workflows/dispatch_compatibility_test.yml similarity index 98% rename from .github/workflows/compatibility_test.yml rename to .github/workflows/dispatch_compatibility_test.yml index eadd07886..ac5669c6f 100644 --- a/.github/workflows/compatibility_test.yml +++ b/.github/workflows/dispatch_compatibility_test.yml @@ -1,4 +1,4 @@ -name: Compatibility Test +name: Dispatch Compatibility Test on: workflow_dispatch: diff --git a/.github/workflows/workflow_dispatch_example.yml b/.github/workflows/dispatch_example_check.yml similarity index 57% rename from .github/workflows/workflow_dispatch_example.yml rename to .github/workflows/dispatch_example_check.yml index d9d576910..e0333422f 100644 --- a/.github/workflows/workflow_dispatch_example.yml +++ b/.github/workflows/dispatch_example_check.yml @@ -8,7 +8,7 @@ on: required: true jobs: - manual_check_matrix_preparation: + matrix_preparation: if: | github.event.pull_request.draft == false && github.base_ref == 'main' && @@ -16,31 +16,24 @@ jobs: name: Check the examples user want runs-on: ubuntu-latest outputs: - matrix: ${{ steps.set-matrix-1.outputs.matrix }} + matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Get manual 
directories - id: set-matrix-1 + - name: Set up matrix + id: set-matrix env: check_dir: ${{ inputs.example_directory }} run: | - all_mannual_check_dir=() - for cdi in $check_dir - do - all_mannual_check_dir+=("\"${cdi}\"") - done - man_loc=$( IFS=',' ; echo "${all_mannual_check_dir[*]}" ) - res=`python .github/workflows/scripts/input_check_example.py --fileNameList $man_loc` - echo "${res} is file existance. 1 for all exist, -1 for at least one file not exist." - if [ res == -1 ];then - exit(1) + res=`python .github/workflows/scripts/example_checks/check_dispatch_inputs.py --fileNameList $check_dir` + if [ res == "failure" ];then + exit -1 fi - man_loc="[${man_loc}]" - echo "$man_loc" - echo "::set-output name=matrix::{\"man_loc\":$(echo "$man_loc")}" + dirs="[${check_dir}]" + echo "Testing examples in $dirs" + echo "matrix={\"directory\":$(echo "$dirs")}" >> $GITHUB_OUTPUT - manual_check: + test_example: if: | github.event.pull_request.draft == false && github.base_ref == 'main' && @@ -52,16 +45,19 @@ jobs: matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ + timeout-minutes: 10 steps: - name: 📚 Checkout uses: actions/checkout@v3 - - name: Install the requirements + - name: Install Colossal-AI run: | - pip install -r ./requirements/requirements.txt - pip install colossalai - - name: Traverse all files + pip install -v . + - name: Test the example run: | - dir=${{ matrix.man_loc }} - echo "${dir} is current directory" + dir=${{ matrix.directory }} + echo "Testing ${dir} now" cd "${PWD}/examples/${dir}" bash test_ci.sh + env: + NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/draft_github_release_post.yml b/.github/workflows/draft_github_release_post.yml index 413714daf..53bfa9e8d 100644 --- a/.github/workflows/draft_github_release_post.yml +++ b/.github/workflows/draft_github_release_post.yml @@ -8,11 +8,10 @@ on: types: - closed - jobs: release: name: Draft Release Post - if: github.repository == 'hpcaitech/ColossalAI' + if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/pre_commit.yml b/.github/workflows/pre_commit.yml new file mode 100644 index 000000000..3e71be2fc --- /dev/null +++ b/.github/workflows/pre_commit.yml @@ -0,0 +1,71 @@ +name: pre-commit + +on: + pull_request: + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + ref: ${{ github.event.pull_request.head.sha }} + + # the PR branch and the hpcaitech/colossal-ai main branch + # must share a common commit, we need to locate that commit, + # which is the commit checked-out or forked when the PR branch is created + # such that we can look for files changed since that commit + - name: Locate base commit + id: locate-base-sha + run: | + curBranch=$(git rev-parse --abbrev-ref HEAD) + commonCommit=$(git merge-base origin/main $curBranch) + echo $commonCommit + echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT + + - name: Find the changed files + id: find-changed-files + uses: tj-actions/changed-files@v35 + with: + base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }} + + - name: List all changed files + run: | + for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do + echo "$file was changed" + done + + - uses: actions/setup-python@v3 + + 
- name: Cache pre-commit hooks + uses: actions/cache@v3 + with: + path: ~/.cache/pre-commit + key: ${{ runner.os }}-pre-commit-hooks + + - name: Set up pre-commit + run: | + pip install pre-commit + pre-commit install + + - name: Run pre-commit on Changed Files + id: precommit + run: | + for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do + echo "======= running pre-commit on ${file} =======" + pre-commit run --files $file + done + + - name: Save PR number + if: always() + env: + PR_NUMBER: ${{ github.event.number }} + run: | + mkdir -p ./pr + echo $PR_NUMBER > ./pr/pr_number + - uses: actions/upload-artifact@v3 + if: always() + with: + name: pr_number + path: pr/ diff --git a/.github/workflows/release_docker.yml b/.github/workflows/release_docker.yml index 328d232a8..8da6e5f87 100644 --- a/.github/workflows/release_docker.yml +++ b/.github/workflows/release_docker.yml @@ -2,13 +2,16 @@ name: Publish Docker Image to DockerHub on: workflow_dispatch: - release: - types: [published] + pull_request: + paths: + - 'version.txt' + types: + - closed jobs: release: name: Publish Docker Image to DockerHub - if: github.repository == 'hpcaitech/ColossalAI' + if: ( github.event_name == 'workflow_dispatch' || github.event.pull_request.merged == true ) && github.repository == 'hpcaitech/ColossalAI' runs-on: [self-hosted, gpu] container: image: "hpcaitech/docker-in-docker:latest" @@ -18,23 +21,17 @@ jobs: with: fetch-depth: 0 - name: Build Docker + id: build run: | version=$(cat version.txt) - docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 -t hpcaitech/colossalai:$version ./docker + tag=hpcaitech/colossalai:$version + docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 -t $tag ./docker + echo "tag=${tag}" >> $GITHUB_OUTPUT - name: Log in to Docker Hub uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 - with: - images: hpcaitech/colossalai - - name: Build and push Docker image - uses: docker/build-push-action@ad44023a93711e3deb337508980b4b5e9bcdc5dc - with: - context: . 
- push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + - name: Push Docker image + run: | + docker push ${{ steps.build.outputs.tag }} diff --git a/.github/workflows/release_nightly.yml b/.github/workflows/release_nightly.yml index 6bc000d1f..8aa48b8ed 100644 --- a/.github/workflows/release_nightly.yml +++ b/.github/workflows/release_nightly.yml @@ -1,73 +1,29 @@ -name: Release bdist wheel for Nightly versions +name: Publish Nightly Version to PyPI on: - schedule: - # run at 00:00 of every Sunday - - cron: '0 0 * * 6' workflow_dispatch: + schedule: + - cron: '0 0 * * 6' # release on every Sunday 00:00 UTC time jobs: - matrix_preparation: - name: Prepare Container List + build-n-publish: + if: github.event_name == 'workflow_dispatch' || github.repository == 'hpcaitech/ColossalAI' + name: Build and publish Python 🐍 distributions 📦 to PyPI runs-on: ubuntu-latest - outputs: - matrix: ${{ steps.set-matrix.outputs.matrix }} + timeout-minutes: 20 steps: - - id: set-matrix - run: | - matrix="[\"hpcaitech/cuda-conda:11.3\", \"hpcaitech/cuda-conda:10.2\"]" - echo $matrix - echo "::set-output name=matrix::{\"container\":$(echo $matrix)}" + - uses: actions/checkout@v2 - build: - name: Release bdist wheels - needs: matrix_preparation - if: github.repository == 'hpcaitech/ColossalAI' && contains(fromJson('["FrankLeeeee", "ver217", "feifeibear", "kurisusnowdeng"]'), github.actor) - runs-on: [self-hosted, gpu] - strategy: - fail-fast: false - matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} - container: - image: ${{ matrix.container }} - options: --gpus all --rm - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - # cub is for cuda 10.2 - - name: Copy scripts and checkout - run: | - cp -r ./.github/workflows/scripts/* ./ - ln -s /github/home/pip_wheels ./pip_wheels - wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip - unzip 1.8.0.zip - - name: Build bdist wheel - run: | - pip install beautifulsoup4 requests packaging - python ./build_colossalai_wheel.py --nightly - - name: 🚀 Deploy - uses: garygrossgarten/github-action-scp@release - with: - local: all_dist - remote: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }} - host: ${{ secrets.PRIVATE_PYPI_HOST }} - username: ${{ secrets.PRIVATE_PYPI_USER }} - password: ${{ secrets.PRIVATE_PYPI_PASSWD }} - remove_old_build: - name: Remove old nightly build - runs-on: ubuntu-latest - needs: build - steps: - - name: executing remote ssh commands using password - uses: appleboy/ssh-action@master - env: - BUILD_DIR: ${{ secrets.PRIVATE_PYPI_NIGHTLY_DIR }} - with: - host: ${{ secrets.PRIVATE_PYPI_HOST }} - username: ${{ secrets.PRIVATE_PYPI_USER }} - password: ${{ secrets.PRIVATE_PYPI_PASSWD }} - envs: BUILD_DIR - script: | - cd $BUILD_DIR - find . 
-type f -mtime +0 -exec rm -f {} + - script_stop: true + - uses: actions/setup-python@v2 + with: + python-version: '3.8.14' + + - run: NIGHTLY=1 python setup.py sdist build + + # publish to PyPI if executed on the main branch + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} + verbose: true diff --git a/.github/workflows/report_precommit_failure.yml b/.github/workflows/report_precommit_failure.yml new file mode 100644 index 000000000..e6ca7b01b --- /dev/null +++ b/.github/workflows/report_precommit_failure.yml @@ -0,0 +1,67 @@ +name: Report Precommit Failure + +on: + workflow_run: + workflows: [pre-commit] + types: + - completed + +jobs: + # comment with a message on how to do pre-commit + # if the pre-commit check was not passed + report-precommit-failure: + runs-on: ubuntu-latest + if: ${{ github.event.workflow_run.conclusion == 'failure' }} + steps: + - name: 'Download artifact' + uses: actions/github-script@v6 + with: + script: | + let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.payload.workflow_run.id, + }); + let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => { + return artifact.name == "pr_number" + })[0]; + let download = await github.rest.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip', + }); + let fs = require('fs'); + fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/pr_number.zip`, Buffer.from(download.data)); + + - name: 'Unzip artifact' + run: unzip pr_number.zip + + - name: 'Comment on PR' + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + let fs = require('fs'); + let issue_number = Number(fs.readFileSync('./pr_number')); + let owner = context.repo.owner; + let repo = context.repo.repo; + let run_id = context.payload.workflow_run.id; + let run_url = `https://github.com/${owner}/${repo}/actions/runs/${run_id}` + let body = ` + Your pre-commit check failed, follow the steps to run pre-commit on your file for code style consistency. + + 1. install pre-commit via "pip install pre-commit" + 2. install pre-commit hooks via "pre-commit install" + 3. run pre-commit on file with format error via "pre-commit run --files path" by replacing "path" with the actual file path + 4. commit and push to your branch + + View your job at ${run_url}. + Read our "CONTRIBUTING.md" for more reference to the code style. 
+ `; + await github.rest.issues.createComment({ + owner: owner, + repo: repo, + issue_number: issue_number, + body: body + }); diff --git a/.github/workflows/report_test_coverage.yml b/.github/workflows/report_test_coverage.yml new file mode 100644 index 000000000..dc3fe395f --- /dev/null +++ b/.github/workflows/report_test_coverage.yml @@ -0,0 +1,74 @@ +name: Report Test Coverage + +on: + workflow_run: + workflows: [Build] + types: + - completed + +jobs: + report-test-coverage: + runs-on: ubuntu-latest + steps: + - name: 'Download artifact' + uses: actions/github-script@v6 + with: + script: | + let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: context.payload.workflow_run.id, + }); + let matchArtifact = allArtifacts.data.artifacts.filter((artifact) => { + return artifact.name == "report" + })[0]; + let download = await github.rest.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip', + }); + let fs = require('fs'); + fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/report.zip`, Buffer.from(download.data)); + + - name: 'Unzip artifact' + run: | + unzip report.zip + + - name: Code Coverage Report + uses: irongut/CodeCoverageSummary@v1.3.0 + with: + filename: coverage.xml + badge: true + format: markdown + hide_branch_rate: false + hide_complexity: false + indicators: true + output: both + thresholds: '80 90' + + - name: Make Coverage Report Collapsable + run: | + sed -i '2 i
<details>' code-coverage-results.md + sed -i '3 i <summary>Click me to view the complete report</summary>' code-coverage-results.md + echo "</details>
" >> code-coverage-results.md + + - name: 'Comment on PR' + uses: actions/github-script@v6 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + let fs = require('fs'); + let issue_number = Number(fs.readFileSync('./pr_number')); + let owner = context.repo.owner; + let repo = context.repo.repo; + let run_id = context.payload.workflow_run.id; + let run_url = `https://github.com/${owner}/${repo}/actions/runs/${run_id}` + let body = fs.readFileSync('./code-coverage-results.md', {encoding:'utf8', flag:'r'}) + + await github.rest.issues.createComment({ + owner: owner, + repo: repo, + issue_number: issue_number, + body: body + }); diff --git a/.github/workflows/scripts/example_checks/check_dispatch_inputs.py b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py new file mode 100644 index 000000000..04d2063ec --- /dev/null +++ b/.github/workflows/scripts/example_checks/check_dispatch_inputs.py @@ -0,0 +1,27 @@ +import argparse +import os + + +def check_inputs(input_list): + for path in input_list: + real_path = os.path.join('examples', path) + if not os.path.exists(real_path): + return False + return True + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-f', '--fileNameList', type=str, help="List of file names") + args = parser.parse_args() + name_list = args.fileNameList.split(",") + is_correct = check_inputs(name_list) + + if is_correct: + print('success') + else: + print('failure') + + +if __name__ == '__main__': + main() diff --git a/.github/workflows/scripts/weekly_check_example.py b/.github/workflows/scripts/example_checks/check_example_weekly.py similarity index 76% rename from .github/workflows/scripts/weekly_check_example.py rename to .github/workflows/scripts/example_checks/check_example_weekly.py index dfedc4628..941e90901 100644 --- a/.github/workflows/scripts/weekly_check_example.py +++ b/.github/workflows/scripts/example_checks/check_example_weekly.py @@ -5,9 +5,9 @@ def show_files(path, all_files): # Traverse all the folder/file in current directory file_list = os.listdir(path) # Determine the element is folder or file. If file, pass it into list, if folder, recurse. - for file in file_list: + for file_name in file_list: # Get the abs directory using os.path.join() and store into cur_path. - cur_path = os.path.join(path, file) + cur_path = os.path.join(path, file_name) # Determine whether folder if os.path.isdir(cur_path): show_files(cur_path, all_files) @@ -26,9 +26,8 @@ def main(): for file_loc in contents: split_loc = file_loc.split('/') # must have two sub-folder levels after examples folder, such as examples/images/vit is acceptable, examples/images/README.md is not, examples/requirements.txt is not. 
- if len(split_loc) - split_loc.index('examples') >= 3: - tmp_loc = split_loc[(split_loc.index('examples') + 1):(split_loc.index('examples') + 3)] - re_loc = join(tmp_loc, '/') + if len(split_loc) >= 4: + re_loc = '/'.join(split_loc[1:3]) if re_loc not in all_loc: all_loc.append(re_loc) print(all_loc) diff --git a/.github/workflows/scripts/changed_example.py b/.github/workflows/scripts/example_checks/detect_changed_example.py similarity index 52% rename from .github/workflows/scripts/changed_example.py rename to .github/workflows/scripts/example_checks/detect_changed_example.py index ac2f0864e..df4fd6736 100644 --- a/.github/workflows/scripts/changed_example.py +++ b/.github/workflows/scripts/example_checks/detect_changed_example.py @@ -3,14 +3,19 @@ import argparse def main(): parser = argparse.ArgumentParser() - parser.add_argument('--fileNameList', type=str) + parser.add_argument('-f', '--fileNameList', type=str, help="The list of changed files") args = parser.parse_args() name_list = args.fileNameList.split(":") folder_need_check = set() for loc in name_list: - # Find only the sub-folder of 'example' folder + # Find only the sub-sub-folder of 'example' folder + # the examples folder structure is like + # - examples + # - area + # - application + # - file if loc.split("/")[0] == "examples" and len(loc.split("/")) >= 4: - folder_need_check.add(loc.split("/")[1] + "/" + loc.split("/")[2]) + folder_need_check.add('/'.join(loc.split("/")[1:3])) # Output the result using print. Then the shell can get the values. print(list(folder_need_check)) diff --git a/.github/workflows/scripts/input_check_example.py b/.github/workflows/scripts/input_check_example.py deleted file mode 100644 index 5602d8f09..000000000 --- a/.github/workflows/scripts/input_check_example.py +++ /dev/null @@ -1,23 +0,0 @@ -import argparse -import os - - -def detect_correct(loc_li): - for loc in loc_li: - real_loc = 'examples/' + eval(loc) - if not os.path.exists(real_loc): - return -1 - return 1 - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--fileNameList', type=str) - args = parser.parse_args() - name_list = args.fileNameList.split(",") - result = detect_correct(name_list) - print(result) - - -if __name__ == '__main__': - main() diff --git a/.github/workflows/translate_comment.yml b/.github/workflows/translate_comment.yml new file mode 100644 index 000000000..83c127b3c --- /dev/null +++ b/.github/workflows/translate_comment.yml @@ -0,0 +1,18 @@ +name: 'issue-translator' +on: + issue_comment: + types: [created] + issues: + types: [opened] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: usthe/issues-translate-action@v2.7 + with: + IS_MODIFY_TITLE: false + # not require, default false, . Decide whether to modify the issue title + # if true, the robot account @Issues-translate-bot must have modification permissions, invite @Issues-translate-bot to your project or use your custom bot. + CUSTOM_BOT_NOTE: Bot detected the issue body's language is not English, translate it automatically. 👯👭🏻🧑‍🤝‍🧑👫🧑🏿‍🤝‍🧑🏻👩🏾‍🤝‍👨🏿👬🏿 + # not require. Customize the translation robot prefix message. 
diff --git a/.gitignore b/.gitignore index 6b6f980e3..bf74a7538 100644 --- a/.gitignore +++ b/.gitignore @@ -151,3 +151,7 @@ colossalai/version.py # ignore python interface defition file .pyi + +# ignore coverage test file +coverage.lcov +coverage.xml diff --git a/README-zh-Hans.md b/README-zh-Hans.md index 8edcff28b..5ad22785c 100644 --- a/README-zh-Hans.md +++ b/README-zh-Hans.md @@ -5,10 +5,10 @@ Colossal-AI: 一个面向大模型时代的通用深度学习系统 -

(Both hunks of README-zh-Hans.md only retarget link URLs; the visible text is unchanged. The first hunk updates the header navigation links for 论文, 文档, 例程 and 论坛, while 博客 and the Build badge line remain as unchanged context. The second hunk, @@ -35,7 +35,7 @@, updates the in-page navigation links for 并行训练样例展示, 单GPU训练样例展示, 推理 (Energon-AI) 样例展示 and Colossal-AI 成功案例, with 为何选择 Colossal-AI and 特点 remaining as unchanged context.)