diff --git a/.compatibility b/.compatibility index 32da32be5..a918cb162 100644 --- a/.compatibility +++ b/.compatibility @@ -1,3 +1,2 @@ -1.12.0-11.3.0 -1.13.0-11.6.0 2.0.0-11.7.0 +2.1.0-11.8.0 diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index e2114d43b..a34a60669 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -22,57 +22,6 @@ on: delete: jobs: - prepare_cache: - name: Prepare testmon cache - if: | - github.event_name == 'create' && - github.event.ref_type == 'branch' && - github.event.repository.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - steps: - - name: Copy testmon cache - run: | # branch name may contain slash, we need to replace it with space - export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /") - if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then - cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}" - fi - env: - MAIN_BRANCH: ${{ github.event.master_branch }} - - prepare_cache_for_pr: - name: Prepare testmon cache for PR - if: | - github.event_name == 'pull_request' && - (github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache - cancel-in-progress: true - steps: - - name: Copy testmon cache - run: | # branch name may contain slash, we need to replace it with space - export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") - if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then - mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER} - fi - env: - PR_NUMBER: ${{ github.event.number }} - detect: name: Detect file change if: | @@ -140,8 +89,8 @@ jobs: if: needs.detect.outputs.anyLibraryFileChanged == 'true' runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 60 defaults: run: @@ -174,6 +123,7 @@ jobs: run: | cd TensorNVMe cp -p -r ./build /github/home/tensornvme_cache/ + cp -p -r ./cmake-build /github/home/tensornvme_cache/ - name: Checkout Colossal-AI uses: actions/checkout@v2 @@ -198,31 +148,24 @@ jobs: # -p flag is required to preserve the file timestamp to avoid ninja rebuild cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/ - - name: Restore Testmon Cache - run: | - if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then - cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/ - fi - env: - PR_NUMBER: ${{ github.event.number }} - - name: Execute Unit Testing run: | - CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/ + CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \ + -m "not largedist" \ + --durations=0 \ + --ignore tests/test_analyzer \ + --ignore tests/test_auto_parallel \ + --ignore tests/test_fx \ + --ignore tests/test_autochunk \ + --ignore tests/test_gptq \ + --ignore tests/test_infer_ops \ + --ignore tests/test_legacy \ + --ignore tests/test_smoothquant \ + tests/ env: - DATA: /data/scratch/cifar-10 - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 - TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt LLAMA_PATH: /data/scratch/llama-tiny - - name: Store Testmon Cache - run: | - mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} - cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/ - env: - PR_NUMBER: ${{ github.event.number }} - - name: Collate artifact env: PR_NUMBER: ${{ github.event.number }} @@ -259,54 +202,3 @@ jobs: with: name: report path: report/ - - store_cache: - name: Store testmon cache for PR - if: | - github.event_name == 'pull_request' && - github.event.action == 'closed' && - github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - steps: - - name: Store testmon cache if possible - if: github.event.pull_request.merged == true - run: | # branch name may contain slash, we need to replace it with space - export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /") - if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then - cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/" - fi - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - - - name: Remove testmon cache - run: | - rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER} - env: - PR_NUMBER: ${{ github.event.pull_request.number }} - - remove_cache: - name: Remove testmon cache - if: | - github.event_name == 'delete' && - github.event.ref_type == 'branch' && - github.event.repository.full_name == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, gpu] - container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --rm - timeout-minutes: 5 - defaults: - run: - shell: bash - steps: - - name: Remove testmon cache - run: | # branch name may contain slash, we need to replace it with space - export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /") - rm -rf "/github/home/testmon_cache/${BASE}" diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index 6c77377be..03f9c53f1 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -10,20 +10,22 @@ jobs: build: name: Build and Test Colossal-AI if: github.repository == 'hpcaitech/ColossalAI' - runs-on: [self-hosted, 8-gpu] + runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny - timeout-minutes: 40 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 + options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + timeout-minutes: 90 steps: - name: Check GPU Availability # ensure all GPUs have enough memory id: check-avai run: | avai=true - for i in $(seq 0 7); + ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) + endIndex=$(($ngpu-1)) + for i in $(seq 0 $endIndex); do gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits) - [ "$gpu_used" -gt "10000" ] && avai=false + [ "$gpu_used" -gt "2000" ] && avai=false done echo "GPU is available: $avai" @@ -60,9 +62,12 @@ jobs: - name: Unit Testing if: steps.check-avai.outputs.avai == 'true' run: | - PYTHONPATH=$PWD pytest --durations=0 tests + PYTHONPATH=$PWD pytest \ + -m "not largedist" \ + --durations=0 \ + tests/ env: - DATA: /data/scratch/cifar-10 + NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny @@ -71,7 +76,7 @@ jobs: if: ${{ failure() }} run: | url=$SERVER_URL/$REPO/actions/runs/$RUN_ID - msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details" + msg="Scheduled Build and Test failed, please visit $url for details" echo $msg python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL env: diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index f1e7a2d0c..51238905e 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -56,7 +56,7 @@ jobs: needs: detect-changed-doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm timeout-minutes: 20 defaults: diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index 027fbfd0a..b3536184d 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -12,7 +12,7 @@ jobs: name: Test the changed Doc runs-on: [self-hosted, gpu] container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 options: --gpus all --rm timeout-minutes: 60 steps: diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index 011a0ae03..f734ee8c0 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -45,7 +45,7 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ timeout-minutes: 15 steps: diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index 608ae863f..6d6952aa1 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -77,9 +77,9 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 options: --gpus all --rm -v /data/scratch/examples-data:/data/ - timeout-minutes: 15 + timeout-minutes: 20 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }} cancel-in-progress: true diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index 4fcd1e3a9..919fa5092 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -34,8 +34,8 @@ jobs: fail-fast: false matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: - image: hpcaitech/pytorch-cuda:1.12.0-11.3.0 - timeout-minutes: 15 + image: hpcaitech/pytorch-cuda:2.0.0-11.7.0 + timeout-minutes: 10 steps: - name: 📚 Checkout uses: actions/checkout@v3 diff --git a/README.md b/README.md index 1898d255e..971f4375a 100644 --- a/README.md +++ b/README.md @@ -25,15 +25,16 @@ ## Latest News -* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution) +* [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer) +* [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b) +* [2023/11] [Enhanced MoE Parallelism, Open-source MoE Model Training Can Be 9 Times More Efficient](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient) +* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution) * [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training) * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth) * [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining) * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) * [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana) * [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs) -* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt) -* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02) ## Table of Contents