mirror of https://github.com/InternLM/InternLM
Merge branch 'main' into develop
commit 0e62d41137

@@ -39,7 +39,7 @@ jobs:
    needs: check-requirements
    runs-on: [lmtest]
    steps:
    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3
@@ -60,15 +60,29 @@ jobs:
    runs-on: [lmtest]
    timeout-minutes: 30
    steps:
    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3

    - name: slurm-train
+      id: basic_train
      run: |
        source activate internlm-env-test
        sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

+    - name: load_preset_ckpt
+      if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
+      run: |
+        source activate internlm-env-test
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+    - name: load_new_ckpt
+      run: |
+        source activate internlm-env-test
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
        rm -rf $GITHUB_WORKSPACE/llm_ckpts

    - name: torchrun-train
@@ -91,18 +105,17 @@ jobs:
      run: |
        source activate internlm-env-test
        export PYTHONPATH=$PWD:$PYTHONPATH
        sh ./ci_scripts/model/convert_to_hf.sh
        cd ./hf_ckpt
        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
        cd ..
        rm -rf $GITHUB_WORKSPACE/hf_ckpt

  load-chat-model-in-hf:
    if: ${{ always() }}
    needs: check-requirements
    runs-on: [lmtest]
    steps:
    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3
@@ -1,6 +1,6 @@
name: lint-check

on:
  push:
  pull_request:
    branches:
@@ -1,7 +1,7 @@
name: Sonarqube
on:
  workflow_dispatch:

jobs:
  sonarqube:
    name: SonarQube Scan
@@ -13,4 +13,4 @@ jobs:
    - uses: sonarsource/sonarqube-scan-action@master
      env:
        SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
        SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
@@ -0,0 +1,28 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: doc/code-docs/source/conf.py
  fail_on_warning: false

# Optionally build your docs in additional formats such as PDF
formats:
  - pdf

# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
  install:
    - requirements: doc/code-docs/requirements.txt
@@ -40,6 +40,10 @@ InternLM は、70 億のパラメータを持つベースモデルと、実用

In addition, a lightweight training framework is provided to support model pre-training without the need for extensive dependencies. With a single codebase, it supports pre-training on large-scale clusters with thousands of GPUs and fine-tuning on a single GPU, while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency when training on 1024 GPUs.

+## News
+
+InternLM-7B-Chat v1.1 has been released with code interpreter and function calling capability. You can try it with [Lagent](https://github.com/InternLM/lagent).
+
## InternLM-7B

### Performance Evaluation
@@ -80,8 +84,8 @@ Transformers を使用して InternLM 7B チャットモデルをロードする

```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "こんにちは", history=[])
>>> print(response)
```
@@ -45,6 +45,10 @@ InternLM ,即书生·浦语大模型,包含面向实用场景的70亿参数

A lightweight training framework for model pre-training is provided, with no need to install a large set of dependencies. A single codebase supports pre-training on thousand-GPU clusters and single-GPU human-preference alignment training, while delivering extreme performance optimization with nearly 90% acceleration efficiency for training on a thousand GPUs.

+## News
+
+We have open-sourced InternLM-Chat-7B v1.1. The model can call a code interpreter and tool plugins. You can try these new features in [Lagent](https://github.com/InternLM/lagent).
+
## InternLM-7B

### Performance Evaluation
@@ -74,6 +78,7 @@ InternLM ,即书生·浦语大模型,包含面向实用场景的70亿参数

| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------ |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
+| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |

@@ -85,8 +90,8 @@ InternLM ,即书生·浦语大模型,包含面向实用场景的70亿参数

```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "你好", history=[])
>>> print(response)
```
@@ -117,26 +122,44 @@ streamlit run web_demo.py

We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-click deployment of InternLM.

-1. First, install LMDeploy:
-
-```bash
-python3 -m pip install lmdeploy
-```
-
-2. Use the following command for quick deployment:
-
-```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-7b/model
-```
-
-3. After exporting the model, you can start a server and chat with the AI from a client using the following commands:
-
-```bash
-bash workspace/service_docker_up.sh
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
-```
-
-[LMDeploy](https://github.com/InternLM/LMDeploy) supports the complete InternLM deployment workflow. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
+```bash
+python3 -m pip install lmdeploy
+```
+
+Run the following commands to chat with the `internlm-chat-7b` model interactively in the terminal, or chat with it through a WebUI.
+
+```bash
+# convert the weight layout
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
+
+# interactive chat in the terminal
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# start the gradio server
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+In the steps above, LMDeploy uses FP16 computation precision.
+
+Besides FP16, LMDeploy also supports 4-bit weight-only inference for `internlm-chat-7b`. It not only reduces the model's memory footprint to 6 GB, roughly 40% of FP16, but more importantly, with extreme kernel-level optimization, its inference performance on an A100-80G reaches more than 2.4x that of FP16.
+
+The following shows how to deploy the 4-bit `internlm-chat-7b` model. For the inference speed benchmark, please refer to [here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md#%E6%8E%A8%E7%90%86%E9%80%9F%E5%BA%A6).
+
+```bash
+# download the pre-quantized internlm-chat-7b model from huggingface
+git-lfs install
+git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
+
+# Convert the model's layout and store it in the default path, ./workspace.
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
+
+# inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+LMDeploy is a full toolbox for lightweighting, deploying, and serving LLMs. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.

## Fine-tuning & Training
README.md (58 changed lines)
@@ -45,6 +45,10 @@ InternLM has open-sourced a 7 billion parameter base model and a chat model tail

Additionally, a lightweight training framework is offered to support model pre-training without the need for extensive dependencies. With a single codebase, it supports pre-training on large-scale clusters with thousands of GPUs, and fine-tuning on a single GPU while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency during training on 1024 GPUs.

+## News
+
+InternLM-7B-Chat v1.1 is released with code interpreter and function calling capability. You can try it with [Lagent](https://github.com/InternLM/lagent).
+
## InternLM-7B

### Performance Evaluation
@@ -74,6 +78,7 @@ InternLM 7B and InternLM 7B Chat, trained using InternLM, have been open-sourced

| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------- |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
+| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |

@@ -85,8 +90,8 @@ To load the InternLM 7B Chat model using Transformers, use the following code:

```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "hello", history=[])
>>> print(response)
```
@@ -118,28 +123,45 @@ The effect is as follows

### Deployment

-We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-click deployment of InternLM.
+We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the workflow of InternLM deployment.

-1. First, install LMDeploy:
-
-```bash
-python3 -m pip install lmdeploy
-```
-
-2. Use the following command for quick deployment:
-
-```bash
-python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b/model
-```
-
-3. After exporting the model, you can start a server and have a conversation with the deployed model using the following command:
-
-```bash
-bash workspace/service_docker_up.sh
-python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
-```
-
-[LMDeploy](https://github.com/InternLM/LMDeploy) provides a complete workflow for deploying InternLM. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
+```bash
+python3 -m pip install lmdeploy
+```
+
+You can utilize the following commands to conduct `internlm-chat-7b` FP16 inference, serve it, and interact with the AI assistant via a WebUI:
+
+```bash
+# convert weight layout
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
+
+# inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+You can also deploy the 4-bit quantized `internlm-chat-7b` model via LMDeploy. It greatly trims down the model's memory overhead to 6 GB, just 40% of what FP16 inference would take. More importantly, with extremely optimized kernels, the inference performance achieves more than 2.4x that of FP16 inference on an A100-80G.
+
+Try the following to enjoy 4-bit `internlm-chat-7b` on a GeForce RTX 30-series GPU card. You can find the inference benchmark [here](https://github.com/InternLM/lmdeploy/blob/main/docs/en/w4a16.md#inference-performance).
+
+```bash
+# download the pre-quantized internlm-chat-7b model from huggingface
+git-lfs install
+git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
+
+# Convert the model's layout and store it in the default path, ./workspace.
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
+
+# inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+LMDeploy is an efficient toolkit for compressing, deploying, and serving LLM models. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.

## Fine-tuning & Training
@@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-


def merge_dicts(dict_a: dict, dict_b: dict):
    for key in dict_b.keys():
        if isinstance(dict_b[key], dict):
            dict_b[key] = {**dict_a[key], **dict_b[key]}
            merge_dicts(dict_a[key], dict_b[key])
    dict_c = {**dict_a, **dict_b}
    return dict_c


def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
    result = ""
    for key, value in data.items():
        if isinstance(value, dict):
            result += f"{' ' * indent}{key} = dict(\n"
            result += format_dict_to_py_string(value, indent + 4, is_nested=True)
            result += f"{' ' * indent})"
        else:
            result += f"{' ' * indent}{key} = {repr(value)}"
        if is_nested:
            result += ","
        result += "\n"
    result = f"""\
{result}
"""
    return result
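
For reference, a minimal sketch of how these two helpers are meant to be combined (the dictionaries below are made-up values, not taken from the repository; the import assumes the repository root is on `PYTHONPATH`):

```python
# Hypothetical example values; only the two helpers come from ci_scripts/common/com_func.py.
from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

base = {"SEQ_LEN": 2048, "ckpt": {"checkpoint_every": 50, "load_optimizer": True}}
override = {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20}}

# Nested dicts are merged key by key, with values from `override` taking precedence:
# {"SEQ_LEN": 1024, "ckpt": {"checkpoint_every": 20, "load_optimizer": True}}
merged = merge_dicts(base, override)

# Renders the merged dict as `key = value` / `key = dict(...)` assignments,
# ready to be written out as a training config .py file.
print(format_dict_to_py_string(merged))
```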
@@ -16,7 +16,7 @@ exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test alpaca_tokenizer.py."

if [[ -d ${RESULTS} ]]; then
    if ! rm -rf ${RESULTS}/*; then
@@ -12,7 +12,7 @@ exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test tokenizer.py."

num=$(num_files "${RESULTS}")
if [[ ${num} -gt 0 ]]; then
@@ -40,7 +40,7 @@ num=$(num_files "${CKPTS_OUTPUT}")

if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# NOTICE: should not remove the cached files, because the cached files will be used in the next test case.
@@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
@@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
# oss: 'boto3:s3://model_weights/XXX'
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# SAVE_CKPT_FOLDER = "local:llm_ckpts"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
@@ -27,7 +27,7 @@ ckpt = dict(
    load_optimizer=True,
)

-TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
+TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
@@ -120,8 +120,8 @@ zero1 parallel:
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
       For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel: pipeline parallel size.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
+pipeline parallel: pipeline parallel size, only 1 is accepted currently.
+tensor parallel: tensor parallel size, usually the number of GPUs per node, only 1 is accepted currently.
"""
parallel = dict(
    zero1=8,
@@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import json
import os

from ci_scripts.common import com_func
from internlm.core.context import Config


def generate_new_config(config_py_file, test_config_json, case_name):
    # generate path of the new config py
    config_path = os.path.split(config_py_file)
    new_config_py_file = os.path.join(config_path[0], case_name + ".py")

    # merge dict
    origin_config = Config.from_file(config_py_file)
    with open(test_config_json) as f:
        test_config = json.load(f)
    if test_config:
        if case_name not in test_config.keys():
            raise KeyError(f"the {case_name} doesn't exist.Please check {test_config} again!")
        new_config = com_func.merge_dicts(origin_config, test_config[case_name])
        print(f"new config is:\n{new_config}")

        # write new config to py file
        file_content = com_func.format_dict_to_py_string(new_config)
        with open(new_config_py_file, "w") as f:
            f.write(file_content)
        print(f"The new test train config file is {new_config_py_file}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--origin_config",
        type=str,
        default="./ci_scripts/train/ci_7B_sft.py",
        help="path to the origin train config file",
    )
    parser.add_argument(
        "--test_config",
        type=str,
        default="./ci_scripts/train/test_config.json",
        help="path to the test train config file",
    )
    parser.add_argument("--case_name", type=str, help="name of the case which will be runned ")
    args = parser.parse_args()
    generate_new_config(args.origin_config, args.test_config, args.case_name)
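
The CI driver script shown further below (load_ckpt.sh) invokes this through the command line as `python ./ci_scripts/train/generate_config.py --case_name $1`. A rough Python-level equivalent, assuming the repository root is the working directory and is on `PYTHONPATH`, would be:

```python
# Hypothetical driver; mirrors what ci_scripts/train/load_ckpt.sh does via the CLI.
from ci_scripts.train.generate_config import generate_new_config

# Overlays the "7B_load_new_ckpt" entry of test_config.json onto ci_7B_sft.py and
# writes the merged result to ./ci_scripts/train/7B_load_new_ckpt.py.
generate_new_config(
    config_py_file="./ci_scripts/train/ci_7B_sft.py",
    test_config_json="./ci_scripts/train/test_config.json",
    case_name="7B_load_new_ckpt",
)
```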
@@ -0,0 +1,38 @@
#!/bin/bash
set -x

[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
expected_num=22
exit_code=0

source ./ci_scripts/common/basic_func.sh

echo "start to test slurm training with loading checkpoint."

python ./ci_scripts/train/generate_config.py --case_name $1
file="./ci_scripts/train/$1.py"
if [[ ! -f ${file} ]]; then
    echo "expect: ${file} exists, actual: not exist."
    exit_code=$(($exit_code + 1))
fi

srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }


num=$(num_files "${CKPTS40_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi

exit $exit_code
@@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
-expected_num=21
+expected_num=22
exit_code=0

source ./ci_scripts/common/basic_func.sh
@@ -25,12 +25,6 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --
num=$(num_files "${CKPTS20_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
-    exit_code=$(($exit_code + 1))
-fi
-
-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
-    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
fi
@@ -0,0 +1,45 @@
{
    "7B_basic_train": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "ckpt": {
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 20
        }
    },
    "7B_load_new_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    },
    "7B_load_preset_ckpt": {
        "SEQ_LEN": 1024,
        "HIDDEN_SIZE": 2048,
        "NUM_ATTENTION_HEAD": 16,
        "NUM_LAYER": 16,
        "TRAIN_FOLDER": "local:../lm_data/alpaca_data/train/en",
        "LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
        "ckpt": {
            "load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
            "checkpoint_every": 20
        },
        "data": {
            "total_steps": 40
        }
    }
}
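
One detail worth noting in this file: `7B_basic_train` checkpoints every 20 steps into `llm_ckpts/20`, and `7B_load_new_ckpt` resumes from that same folder, so the cases are meant to run in order. A small, hypothetical sanity check of that chaining (standard library only, not part of the CI itself) could look like:

```python
# Hypothetical sanity check for ci_scripts/train/test_config.json.
import json

with open("./ci_scripts/train/test_config.json") as f:
    cases = json.load(f)

# Every case must at least carry the ckpt/data overrides that generate_config.py merges in.
for name, overrides in cases.items():
    assert "ckpt" in overrides and "data" in overrides, f"{name} is missing ckpt/data overrides"

# 7B_load_new_ckpt resumes from the folder that 7B_basic_train writes (llm_ckpts/<checkpoint_every>).
every = cases["7B_basic_train"]["ckpt"]["checkpoint_every"]
resume = cases["7B_load_new_ckpt"]["ckpt"]["load_ckpt_folder"]
assert resume.endswith(f"llm_ckpts/{every}"), "load_new_ckpt should resume from basic_train's checkpoint"
print("test_config.json cases are consistent")
```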
@@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
-expected_num=21
+expected_num=22
exit_code=0

source ./ci_scripts/common/basic_func.sh
@@ -25,7 +25,7 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --
num=$(num_files "${CKPTS_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
    exit_code=$(($exit_code + 1))
fi

# clean the test files.
@@ -75,7 +75,8 @@ grad_scaler = dict(

hybrid_zero_optimizer = dict(
    # Enable low_level_optimzer overlap_communication
-    zero_overlap_communication=True,
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
    # bucket size for nccl communication params
    reduce_bucket_size=512 * 1024 * 1024,
    # grad clipping
@@ -120,12 +121,11 @@ model = dict(
    num_layers=NUM_LAYER,
    mlp_ratio=MLP_RATIO,
    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",
+    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
    norm_type="rmsnorm",
    layer_norm_epsilon=1e-5,
    use_flash_attn=True,
    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-    sequence_parallel=False,
)
"""
zero1 parallel:
@@ -142,6 +142,7 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
    zero1=8,
    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
)

cudnn_deterministic = False
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,35 @@
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
@@ -0,0 +1,10 @@
Sphinx
sphinx-autobuild
recommonmark
sphinx_rtd_theme
sphinx_markdown_tables
autodoc_pydantic==1.9
enum_tools
numpy
torch
tqdm
@@ -0,0 +1,2 @@
Model Checkpointing
===================
@@ -0,0 +1,91 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

import os
import sys

project = "InternLM"
copyright = "2023, InternLM Team"
author = "InternLM Team"
release = "v0.2.0"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    "recommonmark",
    "sphinx_rtd_theme",
    "sphinx.ext.viewcode",
    "sphinx.ext.autodoc",
    "sphinxcontrib.autodoc_pydantic",
    "sphinx.ext.autosectionlabel",
    "sphinx.ext.napoleon",
]

pygments_style = "sphinx"

# autodoc_pyandtic config
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_field_signature_prefix = " "
autodoc_pydantic_model_signature_prefix = "class"
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_config_member = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_summary_list_order = "bysource"
autodoc_pydantic_model_member_order = "bysource"
autodoc_pydantic_field_list_validators = False

# Napoleon settings
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = True
napoleon_use_admonition_for_examples = False
napoleon_use_admonition_for_notes = False
napoleon_use_admonition_for_references = False
napoleon_use_ivar = False
napoleon_use_param = True
napoleon_use_rtype = True
napoleon_preprocess_types = False
napoleon_type_aliases = None
napoleon_attr_annotations = True

templates_path = ["_templates"]

exclude_patterns = []

# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]

# GitHub integration
html_context = {
    "display_github": True,
    "github_user": "pjlab",
    "github_repo": "InternLM",
    "github_version": "master",
    "conf_py_path": "/doc/code-docs/source/",
}

sys.path.insert(0, os.path.abspath("../../../"))

# Prepend module names to class descriptions
add_module_names = True

autoclass_content = "class"

autodoc_mock_imports = [
    "apex",
    "torch",
    "numpy",
]
@@ -0,0 +1,70 @@
.. InternLM documentation master file, created by
   sphinx-quickstart on Mon Aug 28 17:33:28 2023.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

InternLM
========

Environment Setup
-------------------

.. toctree::
   :maxdepth: 2

   install

Model Setup
-------------------

.. toctree::
   :maxdepth: 2

   initialize

Training API
-------------------

.. toctree::
   :maxdepth: 2

   training

Parallel Training
-------------------

.. toctree::
   :maxdepth: 2

   parallel

Model Checkpointing
-------------------

.. toctree::
   :maxdepth: 2

   checkpoint

Profiler
-------------------

.. toctree::
   :maxdepth: 2

   profiler

Monitor
-------------------

.. toctree::
   :maxdepth: 2

   monitor

Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@@ -0,0 +1,35 @@
Training Setup
==============

.. _InternLM-args:

Argument Parsing
----------------
InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply commandline
configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default
parser with some builtin arguments; users can add custom parameters to this parser.

.. code-block:: python

    # Get InternLM default parser
    parser = internlm.initialize.get_default_parser()
    # Add new argument
    parser.add_argument("--user_arg", type=int, default=-1, help="arguments add by user.")
    cmd_args = parser.parse_args()

.. autofunction:: internlm.initialize.get_default_parser


.. _InternLM-init:

Model Initialization
-------------------------

Optimizer Initialization
-------------------------

Dataloader Initialization
-------------------------

Trainer Initialization
-------------------------
@@ -0,0 +1,70 @@
## Installation

### Environment Preparation
The required packages and corresponding versions are shown as follows:
- Python == 3.10
- GCC == 10.2.0
- MPFR == 4.1.0
- CUDA >= 11.7
- Pytorch >= 1.13.1
- Transformers >= 4.28.0
- Flash-Attention >= v1.0.5
- Apex == 23.05
- GPU with Ampere or Hopper architecture (such as H100, A100)
- Linux OS

After installing the above dependencies, some system environment variables need to be updated:
```bash
export CUDA_PATH={path_of_cuda_11.7}
export GCC_HOME={path_of_gcc_10.2.0}
export MPFR_HOME={path_of_mpfr_4.1.0}
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
```

### Environment Installation
Clone the project `internlm` and its dependent submodules from the github repository, as follows:
```bash
git clone git@github.com:InternLM/InternLM.git --recurse-submodules
```

It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:
```bash
conda create --name internlm-env python=3.10 -y
conda activate internlm-env
cd internlm
pip install -r requirements/torch.txt
pip install -r requirements/runtime.txt
```

Install flash-attention (version v1.0.5):
```bash
cd ./third_party/flash-attention
python setup.py install
cd ./csrc
cd fused_dense_lib && pip install -v .
cd ../xentropy && pip install -v .
cd ../rotary && pip install -v .
cd ../layer_norm && pip install -v .
cd ../../../../
```

Install Apex (version 23.05):
```bash
cd ./third_party/apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ../../
```

### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:

```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
```
@@ -0,0 +1,10 @@
Monitor and Alert
=================


Monitoring
-----------------


Alerting
-----------------
@@ -0,0 +1,23 @@
Parallel Training
=================

.. Describe the overall usage of the parallel configuration here, then detail each module below

Tensor Parallel
-----------------


Pipeline Parallel
-----------------


Sequence Parallel
-----------------


Data Parallel
-----------------


ZeRO1.5
-----------------
@@ -0,0 +1,11 @@
Profiler
========

.. Introduce the usage of the torch profiler and memory profiler here

Torch Profiler
-----------------


Memory Profiler
-----------------
@@ -0,0 +1,2 @@
Training API
============
@@ -59,12 +59,28 @@ cd ../../
```

### Environment Image
-Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
+Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
+
+#### Image Configuration and Build
+The configuration and build of the Dockerfile are implemented through docker.Makefile. To build the image, execute the following command in the root directory of InternLM:
+``` bash
+make -f docker.Makefile BASE_OS=centos7
+```
+In docker.Makefile, you can customize the base image, environment versions, etc., and the corresponding parameters can be passed directly through the command line. For BASE_OS, ubuntu20.04 and centos7 are supported.
+
+#### Pull Standard Image
+The standard images based on ubuntu and centos have been built and can be pulled directly:

```bash
-# pull image
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# start container
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
+
+#### Run Container
+For the local standard image built with the dockerfile or pulled, use the following command to run and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+The default directory in the container is `/InternLM`; please start training according to the [Usage](./usage.md).
@@ -6,11 +6,14 @@ The system code file structure is shown below:
├── internlm                                # Main directory of the system code
│   ├── apis                                # Interface module, containing some interface functions related to inference, etc.
│   ├── core                                # Core module, managing parallel context and training scheduling engine for training and inference
+│   │   ├── communication                   # Communication module, responsible for p2p communication in pipeline parallel scheduling
│   │   ├── context                         # Context module, mainly responsible for initializing parallel process groups and managing parallel context
│   │   │   ├── parallel_context.py
│   │   │   └── process_group_initializer.py
+│   │   ├── scheduler                       # Scheduling module, which manages schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
+│   │   │   ├── no_pipeline_scheduler.py
+│   │   │   └── pipeline_scheduler.py
│   │   ├── engine.py                       # Responsible for managing the training and evaluation process of the model
-│   │   ├── no_pipeline_scheduler.py        # Scheduler for parallel training
│   │   └── trainer.py                      # Responsible for managing the training engine and scheduler
│   ├── data                                # Data module, responsible for managing dataset generation and processing
│   ├── initialize                          # Initialization module, responsible for managing distributed environment startup and trainer initialization
@@ -165,8 +165,9 @@ Training parallel configuration example:

```python
parallel = dict(
    zero1=8,
-    pipeline=1,
    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
)
```

@@ -174,8 +175,11 @@ parallel = dict(
- When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
-- pipeline: pipeline parallel size, default value is 1
-- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
+- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
+- pipeline: pipeline parallel strategy
+  - size: pipeline parallel size, the default value is 1
+  - interleaved_overlap: bool type; when using interleaved scheduling, enables or disables communication optimization, the default value is False
+- sequence_parallel: whether to enable sequence parallelism, the default value is False

Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
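
To make the sizing rule in the note above concrete, here is a small, self-contained sketch (a hypothetical helper, not part of the InternLM codebase) that derives the data-parallel size and checks a zero1 setting against it:

```python
# Hypothetical helper illustrating the sizing rule from the note above;
# it is not part of the InternLM codebase.
def derive_parallel_sizes(total_gpus: int, tensor: int = 1, pipeline: int = 1, zero1: int = -1):
    assert total_gpus % (tensor * pipeline) == 0, "tensor * pipeline must divide the GPU count"
    # Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size
    dp = total_gpus // (tensor * pipeline)
    # zero1 <= 0 means the zero1 group spans the whole data-parallel group
    zero1_group = dp if zero1 <= 0 else zero1
    assert zero1_group <= dp and dp % zero1_group == 0, "zero1 must evenly divide the data-parallel size"
    return {"data_parallel": dp, "zero1_group": zero1_group}

# Example: 2 nodes x 8 GPUs, tensor=1, pipeline=1, zero1=8 (as in the config above)
print(derive_parallel_sizes(total_gpus=16, tensor=1, pipeline=1, zero1=8))
# {'data_parallel': 16, 'zero1_group': 8}
```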
@@ -59,11 +59,28 @@ cd ../../
```

### Environment Image
-Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
+Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
+
+#### Image Configuration and Build
+The configuration and build of the dockerfile are implemented through docker.Makefile. To build an image, execute the following command in the InternLM root directory:
+``` bash
+make -f docker.Makefile BASE_OS=centos7
+```
+In docker.Makefile you can customize the base image, environment versions, and so on, and the corresponding parameters can be passed directly on the command line. For BASE_OS, ubuntu20.04 and centos7 are supported.
+
+#### Pull Standard Image
+The standard images based on ubuntu and centos have already been built and can be pulled directly:

```bash
-# pull image
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# start container
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
+
+#### Run Container
+For a local standard image built with the dockerfile or pulled, use the following command to run it and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+The default directory inside the container is `/InternLM`; please start training according to the [usage documentation](./usage.md).
@ -6,11 +6,14 @@
├── internlm # Main directory of the system code
│ ├── apis # API module, containing interface functions related to inference and others
│ ├── core # Core module, managing the parallel context and the training scheduling engine for training and inference
│ │ ├── communication # Communication module, responsible for p2p communication in pipeline parallel scheduling
│ │ ├── context # Context module, mainly responsible for initializing the parallel process groups and managing the parallel context
│ │ │ ├── parallel_context.py
│ │ │ └── process_group_initializer.py
│ │ ├── scheduler # Scheduling module, managing the schedulers for parallel training, including the non-pipeline and pipeline parallel schedulers
│ │ │ ├── no_pipeline_scheduler.py
│ │ │ └── pipeline_scheduler.py
│ │ ├── engine.py # Responsible for managing the training and evaluation process of the model
│ │ └── trainer.py # Responsible for managing the training engine and schedulers
│ ├── data # Data module, responsible for managing dataset generation and processing
│ ├── initialize # Initialization module, responsible for managing distributed environment startup and trainer initialization
@ -151,16 +151,20 @@ model = dict(

```python
parallel = dict(
    zero1=8,
    tensor=1,
    pipeline=dict(size=1, interleaved_overlap=True),
    sequence_parallel=False,
)
```

- zero1: zero parallel strategy, with the following three cases, default value is -1
  - When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
  - When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
  - When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
- pipeline: pipeline parallel strategy
  - size: pipeline parallel size, the default value is 1
  - interleaved_overlap: bool type, enables or disables communication optimization when using interleaved scheduling; the default value is False
- sequence_parallel: whether to enable sequence parallelism, the default value is False

Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
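As a companion to the note above, here is a small illustrative sketch of how one data parallel group is partitioned into zero1 groups. The helper `zero1_groups` and the 32-GPU figures are assumptions for illustration only, not code from this repository.

```python
# Illustrative sketch only: partition the ranks of one data parallel group
# into zero1 groups, as described above (zero1 size must divide the data parallel size).

def zero1_groups(data_parallel_ranks: list[int], zero1_size: int) -> list[list[int]]:
    assert len(data_parallel_ranks) % zero1_size == 0, "zero1 size must divide the data parallel size"
    return [
        data_parallel_ranks[i : i + zero1_size]
        for i in range(0, len(data_parallel_ranks), zero1_size)
    ]

# Example: 32 GPUs with tensor=1 and pipeline=1 give a data parallel size of 32;
# zero1=8 shards the optimizer states across groups of 8 ranks.
print(zero1_groups(list(range(32)), zero1_size=8))
# [[0, ..., 7], [8, ..., 15], [16, ..., 23], [24, ..., 31]]
```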
@ -0,0 +1,107 @@
DOCKER_REGISTRY ?= docker.io
DOCKER_ORG ?= my
DOCKER_IMAGE ?= internlm
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)

CUDA_VERSION = 11.7.1
GCC_VERSION = 10.2.0

CUDNN_VERSION = 8
BASE_RUNTIME =
# ubuntu20.04 centos7
BASE_OS = centos7
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-${BASE_OS}
# The conda channel to use to install cudatoolkit
CUDA_CHANNEL = nvidia
# The conda channel to use to install pytorch / torchvision
INSTALL_CHANNEL ?= pytorch

PYTHON_VERSION ?= 3.10
PYTORCH_VERSION ?= 1.13.1
TORCHVISION_VERSION ?= 0.14.1
TORCHAUDIO_VERSION ?= 0.13.1
BUILD_PROGRESS ?= auto
TRITON_VERSION ?=
GMP_VERSION ?= 6.2.1
MPFR_VERSION ?= 4.1.0
MPC_VERSION ?= 1.2.1
GCC_VERSION ?= 10.2.0
HTTPS_PROXY_I ?=
HTTP_PROXY_I ?=
FLASH_ATTEN_VERSION ?= 1.0.5
FLASH_ATTEN_TAG ?= v${FLASH_ATTEN_VERSION}

BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \
	--build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
	--build-arg CUDA_VERSION=$(CUDA_VERSION) \
	--build-arg CUDA_CHANNEL=$(CUDA_CHANNEL) \
	--build-arg PYTORCH_VERSION=$(PYTORCH_VERSION) \
	--build-arg TORCHVISION_VERSION=$(TORCHVISION_VERSION) \
	--build-arg TORCHAUDIO_VERSION=$(TORCHAUDIO_VERSION) \
	--build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) \
	--build-arg TRITON_VERSION=$(TRITON_VERSION) \
	--build-arg GMP_VERSION=$(GMP_VERSION) \
	--build-arg MPFR_VERSION=$(MPFR_VERSION) \
	--build-arg MPC_VERSION=$(MPC_VERSION) \
	--build-arg GCC_VERSION=$(GCC_VERSION) \
	--build-arg https_proxy=$(HTTPS_PROXY_I) \
	--build-arg http_proxy=$(HTTP_PROXY_I) \
	--build-arg FLASH_ATTEN_TAG=$(FLASH_ATTEN_TAG)

EXTRA_DOCKER_BUILD_FLAGS ?=

BUILD ?= build
# Intentionally left blank
PLATFORMS_FLAG ?=
PUSH_FLAG ?=
USE_BUILDX ?=1
BUILD_PLATFORMS ?=
WITH_PUSH ?= false
BUILD_TYPE ?= intrenlm-dev

# Setup buildx flags
ifneq ("$(USE_BUILDX)","")
BUILD = buildx build
ifneq ("$(BUILD_PLATFORMS)","")
PLATFORMS_FLAG = --platform="$(BUILD_PLATFORMS)"
endif
endif
# endif

# # Only set platforms flags if using buildx
# ifeq ("$(WITH_PUSH)","true")
# PUSH_FLAG = --push
# endif
# endif

ifeq ($(findstring centos,$(BASE_OS)),centos)
DOCKERFILE_PATH ?= ./docker/Dockerfile-centos
else
DOCKERFILE_PATH ?= ./docker/Dockerfile-ubuntu
endif

# use -f to specify dockerfile
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
	docker $(BUILD) \
	--progress=$(BUILD_PROGRESS) \
	$(EXTRA_DOCKER_BUILD_FLAGS) \
	$(PLATFORMS_FLAG) \
	$(PUSH_FLAG) \
	-f $(DOCKERFILE_PATH) \
	-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
	$(BUILD_ARGS) .

# --target $(BUILD_TYPE)

.PHONY: all
all: devel-image

.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}-flashatten${FLASH_ATTEN_VERSION}-${BASE_OS}
devel-image:
	$(DOCKER_BUILD)

.PHONY: clean
clean:
	-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))
@ -0,0 +1,131 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on centos
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN yum install deltarpm -y && yum update -y \
|
||||||
|
&& yum install -y \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
bzip2 \
|
||||||
|
gcc \
|
||||||
|
gcc-c++ \
|
||||||
|
file \
|
||||||
|
texinfo \
|
||||||
|
which
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& git clone https://github.com/ninja-build/ninja.git \
|
||||||
|
&& cd ninja \
|
||||||
|
&& git checkout release \
|
||||||
|
&& ./configure.py --bootstrap \
|
||||||
|
&& mv ./ninja /usr/bin \
|
||||||
|
&& cd ..
|
||||||
|
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,112 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on ubuntu
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
ninja-build
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,161 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on centos
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN yum install deltarpm -y && yum update -y \
|
||||||
|
&& yum install -y \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
bzip2 \
|
||||||
|
gcc \
|
||||||
|
gcc-c++ \
|
||||||
|
file \
|
||||||
|
texinfo \
|
||||||
|
which
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& git clone https://github.com/ninja-build/ninja.git \
|
||||||
|
&& cd ninja \
|
||||||
|
&& git checkout release \
|
||||||
|
&& ./configure.py --bootstrap \
|
||||||
|
&& mv ./ninja /usr/bin \
|
||||||
|
&& cd ..
|
||||||
|
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG PYTORCH_VERSION
|
||||||
|
ARG TORCHVISION_VERSION
|
||||||
|
ARG TORCHAUDIO_VERSION
|
||||||
|
|
||||||
|
RUN /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
transformers==4.29.2 \
|
||||||
|
sentencepiece \
|
||||||
|
numpy \
|
||||||
|
tqdm \
|
||||||
|
psutil \
|
||||||
|
packaging \
|
||||||
|
pre-commit \
|
||||||
|
ninja \
|
||||||
|
gputil \
|
||||||
|
pytest \
|
||||||
|
packaging \
|
||||||
|
boto3 \
|
||||||
|
botocore \
|
||||||
|
torch-scatter \
|
||||||
|
pyecharts \
|
||||||
|
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
||||||
|
torch==${PYTORCH_VERSION}+cu117 \
|
||||||
|
torchvision==${TORCHVISION_VERSION}+cu117 \
|
||||||
|
torchaudio==${TORCHAUDIO_VERSION}
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
ARG FLASH_ATTEN_TAG
|
||||||
|
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& git checkout ${FLASH_ATTEN_TAG} \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,142 @@
|
||||||
|
ARG BASE_IMAGE
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the basic environment on ubuntu
|
||||||
|
##############################################################################
|
||||||
|
FROM ${BASE_IMAGE} as base
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
|
||||||
|
build-essential \
|
||||||
|
ca-certificates \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
git \
|
||||||
|
wget \
|
||||||
|
tar \
|
||||||
|
m4 \
|
||||||
|
ninja-build
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install the conda environment
|
||||||
|
##############################################################################
|
||||||
|
FROM base as conda
|
||||||
|
ARG PYTHON_VERSION=3.10
|
||||||
|
ARG TARGETPLATFORM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
RUN case ${TARGETPLATFORM} in \
|
||||||
|
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
|
||||||
|
*) MINICONDA_ARCH=x86_64 ;; \
|
||||||
|
esac && \
|
||||||
|
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
|
||||||
|
|
||||||
|
RUN chmod +x ~/miniconda.sh && \
|
||||||
|
bash ~/miniconda.sh -b -p /opt/conda && \
|
||||||
|
rm ~/miniconda.sh && \
|
||||||
|
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
|
||||||
|
/opt/conda/bin/conda clean -ya
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install environment dependencies
|
||||||
|
##############################################################################
|
||||||
|
FROM conda as dep
|
||||||
|
WORKDIR /dep
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG GCC_VERSION
|
||||||
|
ARG GMP_VERSION
|
||||||
|
ARG MPFR_VERSION
|
||||||
|
ARG MPC_VERSION
|
||||||
|
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
|
||||||
|
&& cd gmp-${GMP_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
|
||||||
|
&& cd mpfr-${MPFR_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
|
||||||
|
&& cd mpc-${MPC_VERSION}/ \
|
||||||
|
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
|
||||||
|
&& make -j64 && make install \
|
||||||
|
&& cd .. \
|
||||||
|
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
|
||||||
|
&& mkdir build \
|
||||||
|
&& cd build/ \
|
||||||
|
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
|
||||||
|
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
|
||||||
|
&& make -j64 && make install
|
||||||
|
|
||||||
|
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
|
||||||
|
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
|
||||||
|
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
|
||||||
|
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
|
||||||
|
ENV CC=${GCC_HOME}/bin/gcc
|
||||||
|
ENV CXX=${GCC_HOME}/bin/c++
|
||||||
|
|
||||||
|
|
||||||
|
##############################################################################
|
||||||
|
# Install InternLM development environment, including flash-attention and apex
|
||||||
|
##############################################################################
|
||||||
|
FROM dep as intrenlm-dev
|
||||||
|
COPY . /InternLM
|
||||||
|
WORKDIR /InternLM
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG PYTORCH_VERSION
|
||||||
|
ARG TORCHVISION_VERSION
|
||||||
|
ARG TORCHAUDIO_VERSION
|
||||||
|
|
||||||
|
RUN /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
transformers==4.29.2 \
|
||||||
|
sentencepiece \
|
||||||
|
numpy \
|
||||||
|
tqdm \
|
||||||
|
psutil \
|
||||||
|
packaging \
|
||||||
|
pre-commit \
|
||||||
|
ninja \
|
||||||
|
gputil \
|
||||||
|
pytest \
|
||||||
|
packaging \
|
||||||
|
boto3 \
|
||||||
|
botocore \
|
||||||
|
torch-scatter \
|
||||||
|
pyecharts \
|
||||||
|
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install \
|
||||||
|
--extra-index-url https://download.pytorch.org/whl/cu117 \
|
||||||
|
torch==${PYTORCH_VERSION}+cu117 \
|
||||||
|
torchvision==${TORCHVISION_VERSION}+cu117 \
|
||||||
|
torchaudio==${TORCHAUDIO_VERSION}
|
||||||
|
|
||||||
|
ARG https_proxy
|
||||||
|
ARG http_proxy
|
||||||
|
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
|
||||||
|
ARG FLASH_ATTEN_TAG
|
||||||
|
|
||||||
|
RUN git submodule update --init --recursive \
|
||||||
|
&& cd /InternLM/third_party/flash-attention \
|
||||||
|
&& git checkout ${FLASH_ATTEN_TAG} \
|
||||||
|
&& /opt/conda/bin/python setup.py install \
|
||||||
|
&& cd ./csrc \
|
||||||
|
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../rotary && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
|
||||||
|
&& cd ../../../../ \
|
||||||
|
&& cd ./third_party/apex \
|
||||||
|
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
|
||||||
|
&& /opt/conda/bin/pip cache purge \
|
||||||
|
&& rm -rf ~/.cache/pip
|
|
@ -0,0 +1,25 @@
## Experimental Environment Image

This module is used to test the new-version environment; by default the environment under test is torch=2.0.1, flash-attention=2.1.0. The new environment may be unstable. For the standard environment installation, please refer to the [installation guide](../doc/install.md).

### Image Build and Pull

When building the image, please run docker.Makefile in the InternLM root directory. This Makefile is shared with the standard environment image, and the Dockerfile it uses is located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:

```bash
# Build Image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0

# Pull Image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```

### Run Container

For the local standard image built with the dockerfile or pulled, use the following command to run and enter the container:

```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```

The default directory in the container is `/InternLM`; start training according to the [usage guide](../doc/usage.md).
@ -0,0 +1,25 @@
## Environment Image for Experiment

This module is used to test the new-version environment; by default the environment under test is torch=2.0.1, flash-attention=2.1.0. The new environment may be unstable. For the standard environment installation, please refer to the [installation guide](../doc/en/install.md).

### Build and Pull Image

When building the image, please run make with docker.Makefile in the InternLM root directory. This Makefile is shared with the standard environment image, and the Dockerfile it uses is located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:

```bash
# Build Image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0

# Pull Image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```

### Run Container

For the local standard image built with the dockerfile or pulled, use the following command to run and enter the container:

```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```

The default directory in the container is `/InternLM`; please start training according to the [Usage](../doc/en/usage.md) guide.
@ -7,6 +7,7 @@ from .parallel_context import (
|
||||||
from .process_group_initializer import (
|
from .process_group_initializer import (
|
||||||
Initializer_Data,
|
Initializer_Data,
|
||||||
Initializer_Model,
|
Initializer_Model,
|
||||||
|
Initializer_Nettest,
|
||||||
Initializer_Pipeline,
|
Initializer_Pipeline,
|
||||||
Initializer_Tensor,
|
Initializer_Tensor,
|
||||||
Initializer_Zero1,
|
Initializer_Zero1,
|
||||||
|
@ -34,6 +35,7 @@ __all__ = [
|
||||||
"Initializer_Pipeline",
|
"Initializer_Pipeline",
|
||||||
"Initializer_Data",
|
"Initializer_Data",
|
||||||
"Initializer_Zero1",
|
"Initializer_Zero1",
|
||||||
|
"Initializer_Nettest",
|
||||||
"ProcessGroupInitializer",
|
"ProcessGroupInitializer",
|
||||||
"Initializer_Model",
|
"Initializer_Model",
|
||||||
"seed",
|
"seed",
|
||||||
|
|
|
@ -143,6 +143,7 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
self.pipeline_parallel_size = 1
|
self.pipeline_parallel_size = 1
|
||||||
self.tensor_parallel_size = 1
|
self.tensor_parallel_size = 1
|
||||||
self.zero1_parallel_size = -1
|
self.zero1_parallel_size = -1
|
||||||
|
self.nettest_parallel_size = 1
|
||||||
self.num_processes_on_current_node = -1
|
self.num_processes_on_current_node = -1
|
||||||
self.virtual_pipeline_parallel_size = None
|
self.virtual_pipeline_parallel_size = None
|
||||||
self.virtual_pipeline_parallel_rank = None
|
self.virtual_pipeline_parallel_rank = None
|
||||||
|
@ -442,6 +443,9 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
# instead, it should be calculated based on other parallel config
|
# instead, it should be calculated based on other parallel config
|
||||||
self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
|
self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
|
||||||
|
|
||||||
|
# the recommended nettest_parallel_size is 32 GPUs
|
||||||
|
self.nettest_parallel_size = 32
|
||||||
|
|
||||||
if self.zero1_parallel_size <= 0:
|
if self.zero1_parallel_size <= 0:
|
||||||
self.zero1_parallel_size = self.data_parallel_size
|
self.zero1_parallel_size = self.data_parallel_size
|
||||||
|
|
||||||
|
@ -454,6 +458,7 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
self.pipeline_parallel_size,
|
self.pipeline_parallel_size,
|
||||||
self.tensor_parallel_size,
|
self.tensor_parallel_size,
|
||||||
self.zero1_parallel_size,
|
self.zero1_parallel_size,
|
||||||
|
self.nettest_parallel_size,
|
||||||
]
|
]
|
||||||
|
|
||||||
# run initialization of different process groups
|
# run initialization of different process groups
|
||||||
|
@ -462,6 +467,7 @@ class ParallelContext(metaclass=SingletonMeta):
|
||||||
initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
|
||||||
initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
|
||||||
initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
|
||||||
|
initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
|
||||||
if self.pipeline_parallel_size > 1:
|
if self.pipeline_parallel_size > 1:
|
||||||
initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
|
initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
|
||||||
for initializer in initializers:
|
for initializer in initializers:
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
|
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
|
||||||
|
|
||||||
|
import math
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
|
@ -31,6 +32,9 @@ class ParallelMode(Enum):
|
||||||
# zero1 parallel
|
# zero1 parallel
|
||||||
ZERO1 = "zero1"
|
ZERO1 = "zero1"
|
||||||
|
|
||||||
|
# runtime network test
|
||||||
|
NETTEST = "nettest"
|
||||||
|
|
||||||
|
|
||||||
class ProcessGroupInitializer(ABC):
|
class ProcessGroupInitializer(ABC):
|
||||||
"""An object, knowing the parallelism configuration, that initializes parallel groups.
|
"""An object, knowing the parallelism configuration, that initializes parallel groups.
|
||||||
|
@ -52,6 +56,7 @@ class ProcessGroupInitializer(ABC):
|
||||||
pipeline_parallel_size: int,
|
pipeline_parallel_size: int,
|
||||||
tensor_parallel_size: int,
|
tensor_parallel_size: int,
|
||||||
zero1_parallel_size: int,
|
zero1_parallel_size: int,
|
||||||
|
nettest_parallel_size: int,
|
||||||
):
|
):
|
||||||
self.rank = rank
|
self.rank = rank
|
||||||
self.world_size = world_size
|
self.world_size = world_size
|
||||||
|
@ -59,6 +64,7 @@ class ProcessGroupInitializer(ABC):
|
||||||
self.pipeline_parallel_size = pipeline_parallel_size
|
self.pipeline_parallel_size = pipeline_parallel_size
|
||||||
self.tensor_parallel_size = tensor_parallel_size
|
self.tensor_parallel_size = tensor_parallel_size
|
||||||
self.zero1_parallel_size = zero1_parallel_size
|
self.zero1_parallel_size = zero1_parallel_size
|
||||||
|
self.nettest_parallel_size = nettest_parallel_size
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -332,3 +338,52 @@ class Initializer_Zero1(ProcessGroupInitializer):
|
||||||
ranks_in_group = ranks
|
ranks_in_group = ranks
|
||||||
|
|
||||||
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
|
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
|
||||||
|
|
||||||
|
|
||||||
|
class Initializer_Nettest(ProcessGroupInitializer):
|
||||||
|
"""A ProcessGroupInitializer for network test, especailly for NCCL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
rank (int): The rank of current process.
|
||||||
|
world_size (int): Size of whole communication world.
|
||||||
|
nettest_parallel_size (int): Size of a network test group.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
super().__init__(*args, **kwargs)
|
||||||
|
self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
|
||||||
|
|
||||||
|
def init_dist_group(self, use_cpu: bool = False):
|
||||||
|
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
|
||||||
|
A network test group's information tuple.
|
||||||
|
"""
|
||||||
|
local_rank = None
|
||||||
|
ranks_in_group = None
|
||||||
|
process_group = None
|
||||||
|
cpu_group = None
|
||||||
|
group_world_size = None
|
||||||
|
mode = ParallelMode.NETTEST
|
||||||
|
|
||||||
|
for i in range(self.num_nettest_group):
|
||||||
|
ranks = []
|
||||||
|
for j in range(self.nettest_parallel_size):
|
||||||
|
rank = i * self.nettest_parallel_size + j
|
||||||
|
if rank < self.world_size:
|
||||||
|
ranks.append(rank)
|
||||||
|
group = dist.new_group(ranks)
|
||||||
|
if use_cpu:
|
||||||
|
group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
|
||||||
|
else:
|
||||||
|
group_cpu = None
|
||||||
|
|
||||||
|
if self.rank in ranks:
|
||||||
|
local_rank = ranks.index(self.rank)
|
||||||
|
group_world_size = len(ranks)
|
||||||
|
process_group = group
|
||||||
|
cpu_group = group_cpu
|
||||||
|
ranks_in_group = ranks
|
||||||
|
|
||||||
|
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
|
||||||
|
|
|
@ -30,7 +30,7 @@ def get_tensor_shape():
|
||||||
|
|
||||||
if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
|
if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
|
||||||
if gpc.config.model.use_flash_attn:
|
if gpc.config.model.use_flash_attn:
|
||||||
if gpc.config.model.sequence_parallel:
|
if gpc.config.parallel.sequence_parallel:
|
||||||
sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
|
sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
|
||||||
tensor_shape = (
|
tensor_shape = (
|
||||||
gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
|
gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
|
||||||
|
@ -140,7 +140,7 @@ class PipelineScheduler(BaseScheduler):
|
||||||
and gpc.get_world_size(ParallelMode.TENSOR) > 1
|
and gpc.get_world_size(ParallelMode.TENSOR) > 1
|
||||||
)
|
)
|
||||||
|
|
||||||
if gpc.config.model.sequence_parallel:
|
if gpc.config.parallel.sequence_parallel:
|
||||||
self.scatter_gather_tensors = False
|
self.scatter_gather_tensors = False
|
||||||
|
|
||||||
# cache for the batch data
|
# cache for the batch data
|
||||||
|
|
|
@ -38,6 +38,11 @@ class TrainState:
|
||||||
# Total step count
|
# Total step count
|
||||||
self.total_steps: int = config.data.total_steps
|
self.total_steps: int = config.data.total_steps
|
||||||
|
|
||||||
|
# resume tensorboard folder, need load from checkpoint or set manually.
|
||||||
|
self.resume_tb_folder = config.resume_tb_folder
|
||||||
|
|
||||||
|
self.tensorboard_folder = config.tensorboard_folder
|
||||||
|
|
||||||
def init_batch_sampler(self, train_dl):
|
def init_batch_sampler(self, train_dl):
|
||||||
# Copy of the batch sampler from the DataLoader
|
# Copy of the batch sampler from the DataLoader
|
||||||
self.batch_sampler = train_dl.batch_sampler.copy()
|
self.batch_sampler = train_dl.batch_sampler.copy()
|
||||||
|
@ -73,8 +78,12 @@ class TrainState:
|
||||||
self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1
|
self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1
|
||||||
|
|
||||||
# track the actual updates of sampler when using weighted sampling
|
# track the actual updates of sampler when using weighted sampling
|
||||||
self.batch_sampler = train_dl.batch_sampler.copy()
|
if hasattr(self, "batch_sampler"):
|
||||||
self.batch_sampler_iter = iter(self.batch_sampler)
|
self.batch_sampler = train_dl.batch_sampler.copy()
|
||||||
|
self.batch_sampler_iter = iter(self.batch_sampler)
|
||||||
|
|
||||||
|
# resume tensorboard from older tensorboard_folder
|
||||||
|
self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
|
||||||
|
|
||||||
def state_dict(self):
|
def state_dict(self):
|
||||||
return {
|
return {
|
||||||
|
@ -83,6 +92,7 @@ class TrainState:
|
||||||
"num_consumed_tokens": self.num_consumed_tokens,
|
"num_consumed_tokens": self.num_consumed_tokens,
|
||||||
"inf_nan_skip_batches": self.inf_nan_skip_batches,
|
"inf_nan_skip_batches": self.inf_nan_skip_batches,
|
||||||
"step_count": self.step_count,
|
"step_count": self.step_count,
|
||||||
|
"tensorboard_folder": self.tensorboard_folder,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ import torch
|
||||||
|
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
|
|
||||||
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}
|
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1, "code": 2}
|
||||||
|
|
||||||
|
|
||||||
def get_dataset_type_id(path):
|
def get_dataset_type_id(path):
|
||||||
|
|
|
@ -1,9 +1,15 @@
|
||||||
from .initialize_trainer import initialize_trainer
|
from .initialize_trainer import initialize_trainer
|
||||||
from .launch import get_default_parser, launch_from_slurm, launch_from_torch
|
from .launch import (
|
||||||
|
get_default_parser,
|
||||||
|
initialize_distributed_env,
|
||||||
|
launch_from_slurm,
|
||||||
|
launch_from_torch,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"get_default_parser",
|
"get_default_parser",
|
||||||
"initialize_trainer",
|
"initialize_trainer",
|
||||||
"launch_from_slurm",
|
"launch_from_slurm",
|
||||||
"launch_from_torch",
|
"launch_from_torch",
|
||||||
|
"initialize_distributed_env",
|
||||||
]
|
]
|
||||||
|
|
|
@ -3,16 +3,15 @@
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
||||||
import torch
|
|
||||||
from torch import Tensor, nn
|
from torch import Tensor, nn
|
||||||
|
|
||||||
|
|
||||||
def scaled_init_method_normal(sigma, num_layers):
|
def scaled_init_method_normal(sigma: float = 1.0, num_layers: int = 1):
|
||||||
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
|
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
|
||||||
std = sigma / math.sqrt(2.0 * num_layers)
|
std = sigma / math.sqrt(2.0 * num_layers)
|
||||||
|
|
||||||
def init_(tensor):
|
def init_(tensor):
|
||||||
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
|
return nn.init.normal_(tensor, mean=0.0, std=std)
|
||||||
|
|
||||||
return init_
|
return init_
|
||||||
|
|
||||||
|
@ -32,3 +31,33 @@ def normal_(mean: float = 0.0, std: float = 1.0):
|
||||||
return nn.init.normal_(tensor, mean, std)
|
return nn.init.normal_(tensor, mean, std)
|
||||||
|
|
||||||
return initializer
|
return initializer
|
||||||
|
|
||||||
|
|
||||||
|
def scaled_init_method_uniform(sigma: float = 1.0, num_layers: int = 1):
|
||||||
|
"""Init method based on p(x)=Uniform(-a, a) where std(x)=sigma/sqrt(2*num_layers)."""
|
||||||
|
std = sigma / math.sqrt(2.0 * num_layers)
|
||||||
|
a = math.sqrt(3.0 * std)
|
||||||
|
|
||||||
|
def init_(tensor):
|
||||||
|
return nn.init.uniform_(tensor, -a, a)
|
||||||
|
|
||||||
|
return init_
|
||||||
|
|
||||||
|
|
||||||
|
def uniform_(mean: float = 0.0, std: float = 1.0):
|
||||||
|
r"""Return the initializer filling the input Tensor with values drawn from the uniform distribution
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
\mathcal{U}(mean-a, mean+a), where a satisfies \mathcal{U}_{std}=std.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
mean (float): the mean of the uniform distribution. Defaults 0.0.
|
||||||
|
std (float): the standard deviation of the uniform distribution. Defaults 1.0.
|
||||||
|
"""
|
||||||
|
|
||||||
|
a = math.sqrt(3.0 * std)
|
||||||
|
|
||||||
|
def initializer(tensor: Tensor):
|
||||||
|
return nn.init.uniform_(tensor, mean - a, mean + a)
|
||||||
|
|
||||||
|
return initializer
|
||||||
|
|
|
@ -10,6 +10,7 @@ import torch
|
||||||
|
|
||||||
from internlm.core.context import Config
|
from internlm.core.context import Config
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.utils.common import get_master_node
|
||||||
from internlm.utils.logger import get_logger
|
from internlm.utils.logger import get_logger
|
||||||
from internlm.utils.storage_manager import init_storage_manager
|
from internlm.utils.storage_manager import init_storage_manager
|
||||||
|
|
||||||
|
@ -108,67 +109,100 @@ def args_sanity_check():
|
||||||
logger.info(f"valid_every: {data.valid_every}")
|
logger.info(f"valid_every: {data.valid_every}")
|
||||||
|
|
||||||
# processing the checkpoint config
|
# processing the checkpoint config
|
||||||
if "enable_save_ckpt" not in gpc.config.ckpt:
|
ckpt = gpc.config.ckpt
|
||||||
gpc.config.ckpt._add_item("enable_save_ckpt", False)
|
if "enable_save_ckpt" not in ckpt:
|
||||||
|
ckpt._add_item("enable_save_ckpt", False)
|
||||||
|
|
||||||
if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0:
|
# Saving checkpoint args.
|
||||||
gpc.config.ckpt._add_item("checkpoint_every", float("inf"))
|
if ckpt.enable_save_ckpt:
|
||||||
|
assert "checkpoint_every" in ckpt, "If enable save checkpoint, must give checkpoint_every in config.data!"
|
||||||
|
assert ckpt.checkpoint_every > 0
|
||||||
|
assert "save_ckpt_folder" in ckpt, "If enable save checkpoint, must give save_ckpt_folder in config.data!"
|
||||||
|
|
||||||
if "load_optimizer" not in gpc.config.ckpt:
|
if "async_upload" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("load_optimizer", True)
|
ckpt._add_item("async_upload", False) # async defalut is False.
|
||||||
|
else:
|
||||||
|
if ckpt.async_upload:
|
||||||
|
assert "save_ckpt_folder" in ckpt
|
||||||
|
if "boto3:" not in ckpt.save_ckpt_folder:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.warning(
|
||||||
|
"Storing ckpt on file system does not support asynchronous storage, will use sync save!"
|
||||||
|
)
|
||||||
|
ckpt.async_upload = False
|
||||||
|
else:
|
||||||
|
if "async_upload_tmp_folder" not in ckpt:
|
||||||
|
ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
|
||||||
|
|
||||||
if "save_ckpt_folder" not in gpc.config.ckpt:
|
if not ckpt.async_upload:
|
||||||
gpc.config.ckpt._add_item("save_ckpt_folder", None)
|
ckpt._add_item("async_upload_tmp_folder", None)
|
||||||
|
|
||||||
if "load_ckpt_folder" not in gpc.config.ckpt:
|
if "snapshot_ckpt_folder" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("load_ckpt_folder", None)
|
ckpt._add_item("snapshot_ckpt_folder", os.path.join(ckpt.save_ckpt_folder, "snapshot"))
|
||||||
|
|
||||||
if "load_model_only_folder" not in gpc.config.ckpt:
|
if "oss_snapshot_freq" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("load_model_only_folder", None)
|
ckpt._add_item("oss_snapshot_freq", float("inf")) # if oss_snapshot_freq not given, we disable.
|
||||||
|
else:
|
||||||
|
ckpt._add_item("checkpoint_every", float("inf"))
|
||||||
|
ckpt._add_item("oss_snapshot_freq", float("inf"))
|
||||||
|
ckpt._add_item("save_ckpt_folder", None)
|
||||||
|
ckpt._add_item("async_upload", False)
|
||||||
|
ckpt._add_item("async_upload_tmp_folder", None)
|
||||||
|
ckpt._add_item("snapshot_ckpt_folder", None)
|
||||||
|
|
||||||
if "async_upload" not in gpc.config.ckpt:
|
# Loading checkpoint args.
|
||||||
gpc.config.ckpt._add_item("async_upload", False)
|
if "load_model_only_folder" not in ckpt:
|
||||||
|
ckpt._add_item("load_model_only_folder", None)
|
||||||
|
|
||||||
if "async_upload_tmp_folder" not in gpc.config.ckpt:
|
if "load_ckpt_folder" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
|
ckpt._add_item("load_ckpt_folder", None)
|
||||||
|
|
||||||
if gpc.config.ckpt.async_upload:
|
if "load_optimizer" not in ckpt:
|
||||||
assert "save_ckpt_folder" in gpc.config.ckpt
|
ckpt._add_item("load_optimizer", True)
|
||||||
if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
|
|
||||||
if gpc.is_rank_for_log():
|
|
||||||
logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!")
|
|
||||||
gpc.config.ckpt.async_upload = False
|
|
||||||
|
|
||||||
if "snapshot_ckpt_folder" not in gpc.config.ckpt:
|
if "stop_file_path" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot"))
|
ckpt._add_item("stop_file_path", None)
|
||||||
|
|
||||||
if "oss_snapshot_freq" not in gpc.config.ckpt and gpc.config.ckpt.checkpoint_every != float("inf"):
|
if "load_given_ckpt" not in ckpt:
|
||||||
gpc.config.ckpt._add_item("oss_snapshot_freq", gpc.config.ckpt.checkpoint_every / 2)
|
# If 'load_given_ckpt' is not given, we set it to False, so internlm can have opportunity
|
||||||
assert gpc.config.ckpt.oss_snapshot_freq > 0
|
# to auto-load latest checkpoint.
|
||||||
|
ckpt._add_item("load_given_ckpt", False)
|
||||||
|
|
||||||
assert not (
|
if ckpt.load_given_ckpt:
|
||||||
gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None
|
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
|
||||||
), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time."
|
if ckpt.load_ckpt_folder and ckpt.load_model_only_folder:
|
||||||
|
logger.warning(
|
||||||
|
"Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, \
|
||||||
|
and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
|
||||||
|
)
|
||||||
|
ckpt.load_model_only_folder = None
|
||||||
|
|
||||||
if gpc.is_rank_for_log():
|
if gpc.is_rank_for_log():
|
||||||
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
|
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
|
||||||
logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}")
|
logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
|
||||||
logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}")
|
logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
|
||||||
logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}")
|
logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
|
||||||
logger.info(f"async_upload: {gpc.config.ckpt.async_upload}")
|
logger.info(f"load_given_ckpt: {ckpt.load_given_ckpt}")
|
||||||
if gpc.config.ckpt.async_upload:
|
|
||||||
logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}")
|
|
||||||
|
|
||||||
# initialization storage manager
|
# initialization storage manager
|
||||||
init_storage_manager(gpc.config.ckpt)
|
init_storage_manager(ckpt)
|
||||||
|
|
||||||
# tensorboard writer config
|
# tensorboard writer config
|
||||||
if "enable_tb" not in gpc.config:
|
if "enable_tb" not in gpc.config:
|
||||||
gpc.config._add_item("enable_tb", True)
|
gpc.config._add_item("enable_tb", True)
|
||||||
if "tensorboard_folder" not in gpc.config:
|
if "tensorboard_folder" not in gpc.config:
|
||||||
gpc.config._add_item("tensorboard_folder", None)
|
gpc.config._add_item(
|
||||||
|
"tensorboard_folder", os.environ["tensorboard_folder"] if "tensorboard_folder" in os.environ else None
|
||||||
|
)
|
||||||
if "resume_tb_folder" not in gpc.config:
|
if "resume_tb_folder" not in gpc.config:
|
||||||
gpc.config._add_item("resume_tb_folder", None)
|
gpc.config._add_item(
|
||||||
|
"resume_tb_folder", os.environ["resume_tb_folder"] if "resume_tb_folder" in os.environ else None
|
||||||
|
)
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"tensorboard_folder: {gpc.config.tensorboard_folder}")
|
||||||
|
logger.info(f"resume_tb_folder: {gpc.config.resume_tb_folder}")
|
||||||
|
|
||||||
# cudnn
|
# cudnn
|
||||||
torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
|
torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
|
||||||
|
@ -191,10 +225,8 @@ def args_sanity_check():
|
||||||
elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
|
elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
|
||||||
gpc.config.model.dtype = torch.float16
|
gpc.config.model.dtype = torch.float16
|
||||||
elif gpc.config.model.dtype == "torch.float32":
|
elif gpc.config.model.dtype == "torch.float32":
|
||||||
assert gpc.config.model.use_flash_attn is False, "when using float32, the use_flash_attn must be False"
|
|
||||||
gpc.config.model.dtype = torch.float32
|
gpc.config.model.dtype = torch.float32
|
||||||
elif gpc.config.model.dtype == "torch.tf32":
|
elif gpc.config.model.dtype == "torch.tf32":
|
||||||
assert gpc.config.model.use_flash_attn is False, "when using tf32, the use_flash_attn must be False"
|
|
||||||
torch.backends.cudnn.allow_tf32 = True
|
torch.backends.cudnn.allow_tf32 = True
|
||||||
torch.backends.cuda.matmul.allow_tf32 = True
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
gpc.config.model.dtype = torch.float32
|
gpc.config.model.dtype = torch.float32
|
||||||
|
@ -236,17 +268,32 @@ def args_sanity_check():
|
||||||
# process the model config
|
# process the model config
|
||||||
if "use_flash_attn" not in gpc.config.model:
|
if "use_flash_attn" not in gpc.config.model:
|
||||||
gpc.config.model._add_item("use_flash_attn", True)
|
gpc.config.model._add_item("use_flash_attn", True)
|
||||||
if "sequence_parallel" not in gpc.config.model:
|
|
||||||
gpc.config.model._add_item("sequence_parallel", False)
|
# process the parallel config
|
||||||
|
if "sequence_parallel" not in gpc.config.parallel:
|
||||||
|
gpc.config.parallel._add_item("sequence_parallel", False)
|
||||||
else:
|
else:
|
||||||
assert not (
|
assert not (
|
||||||
gpc.config.model.sequence_parallel is True and gpc.config.model.use_flash_attn is False
|
gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
|
||||||
), "sequence parallel does not support use_flash_attn=False"
|
), "sequence parallel does not support use_flash_attn=False"
|
||||||
|
|
||||||
# feishu webhook address for alerting
|
# feishu webhook address for alerting
|
||||||
if "alert_address" not in gpc.config:
|
if "alert_address" not in gpc.config:
|
||||||
gpc.config._add_item("alert_address", None)
|
gpc.config._add_item("alert_address", None)
|
||||||
|
|
||||||
|
optim_ckpt = gpc.config.hybrid_zero_optimizer
|
||||||
|
if "zero_overlap_communication" in optim_ckpt:
|
||||||
|
# Compatible with the old interfaces.
|
||||||
|
optim_ckpt._add_item("overlap_sync_grad", optim_ckpt.zero_overlap_communication)
|
||||||
|
if "overlap_sync_grad" not in optim_ckpt:
|
||||||
|
optim_ckpt._add_item("overlap_sync_grad", False)
|
||||||
|
if "overlap_sync_param" not in optim_ckpt:
|
||||||
|
optim_ckpt._add_item("overlap_sync_param", False)
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"overlap_sync_grad:{optim_ckpt.overlap_sync_grad}, overlap_sync_param:{optim_ckpt.overlap_sync_param}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def launch(
|
def launch(
|
||||||
config: Union[str, Path, Config, Dict],
|
config: Union[str, Path, Config, Dict],
|
||||||
|
@ -293,8 +340,6 @@ def launch(
|
||||||
# init process groups for different parallel modes from config
|
# init process groups for different parallel modes from config
|
||||||
gpc.init_parallel_groups()
|
gpc.init_parallel_groups()
|
||||||
|
|
||||||
args_sanity_check()
|
|
||||||
|
|
||||||
# set cuda device
|
# set cuda device
|
||||||
if torch.cuda.is_available():
|
if torch.cuda.is_available():
|
||||||
# if local rank is not given, calculate automatically
|
# if local rank is not given, calculate automatically
|
||||||
|
@ -347,7 +392,11 @@ def launch_from_slurm(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024):
|
def launch_from_torch(
|
||||||
|
config: Union[str, Path, Config, Dict],
|
||||||
|
backend: str = "nccl",
|
||||||
|
seed: int = 1024,
|
||||||
|
):
|
||||||
"""A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
|
"""A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
|
||||||
from the environment variables set by PyTorch
|
from the environment variables set by PyTorch
|
||||||
|
|
||||||
|
@ -375,3 +424,38 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nc
|
||||||
backend=backend,
|
backend=backend,
|
||||||
seed=seed,
|
seed=seed,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_distributed_env(
|
||||||
|
config: str,
|
||||||
|
launcher: str = "slurm",
|
||||||
|
master_port: int = 8888,
|
||||||
|
seed: int = 1024,
|
||||||
|
args_check=True,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Initialize distributed environment for distributed training.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config (str): Config file path.
|
||||||
|
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
|
||||||
|
master_port (str): The master port for distributed training. 8888 by default.
|
||||||
|
seed (int, optional): Specified random seed for every process. 1024 by default.
|
||||||
|
"""
|
||||||
|
|
||||||
|
torch.cuda.empty_cache()
|
||||||
|
|
||||||
|
if launcher == "torch":
|
||||||
|
launch_from_torch(config=config, seed=seed)
|
||||||
|
elif launcher == "slurm":
|
||||||
|
launch_from_slurm(
|
||||||
|
config=config,
|
||||||
|
host=get_master_node(),
|
||||||
|
port=master_port,
|
||||||
|
seed=seed,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
|
||||||
|
|
||||||
|
if args_check:
|
||||||
|
args_sanity_check()
|
||||||
|
|
|
@ -7,6 +7,7 @@ import rotary_emb
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from einops import rearrange
|
from einops import rearrange
|
||||||
|
from flash_attn.layers.rotary import ApplyRotaryEmb as LegacyApplyRotaryEmb
|
||||||
from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
|
from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
|
||||||
from torch import Tensor, nn
|
from torch import Tensor, nn
|
||||||
|
|
||||||
|
@ -56,7 +57,7 @@ class Embedding1D(nn.Module):
|
||||||
|
|
||||||
output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
|
output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
|
||||||
|
|
||||||
if gpc.config.model.sequence_parallel:
|
if gpc.config.parallel.sequence_parallel:
|
||||||
output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
|
output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
@ -111,6 +112,7 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
|
||||||
|
|
||||||
apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
|
apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
|
||||||
legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
|
legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
|
||||||
|
legacy_apply_rotary_embed = LegacyApplyRotaryEmb.apply
|
||||||
|
|
||||||
|
|
||||||
class RotaryEmbedding(torch.nn.Module):
|
class RotaryEmbedding(torch.nn.Module):
|
||||||
|
@ -135,15 +137,13 @@ class RotaryEmbedding(torch.nn.Module):
|
||||||
""" """
|
""" """
|
||||||
super().__init__()
|
super().__init__()
|
||||||
# Generate and save the inverse frequency buffer (non trainable)
|
# Generate and save the inverse frequency buffer (non trainable)
|
||||||
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
|
self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
|
||||||
self.register_buffer("inv_freq", inv_freq)
|
|
||||||
self.scale_base = scale_base
|
self.scale_base = scale_base
|
||||||
scale = (
|
self.scale = (
|
||||||
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
|
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
|
||||||
if scale_base > 0
|
if scale_base > 0
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
self.register_buffer("scale", scale)
|
|
||||||
|
|
||||||
self._seq_len_cached = 0
|
self._seq_len_cached = 0
|
||||||
self._cos_cached = None
|
self._cos_cached = None
|
||||||
|
@ -218,3 +218,15 @@ class RotaryEmbedding(torch.nn.Module):
|
||||||
self._cos_k_cached[seqlen_offset:],
|
self._cos_k_cached[seqlen_offset:],
|
||||||
self._sin_k_cached[seqlen_offset:],
|
self._sin_k_cached[seqlen_offset:],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _single_forward(self, x, indexes=0):
|
||||||
|
assert self.scale is None
|
||||||
|
self._update_cos_sin_cache(x, indexes)
|
||||||
|
x = x[None, ...]
|
||||||
|
ret = legacy_apply_rotary_embed(x, self._cos_cached[indexes], self._sin_cached[indexes]).squeeze(0)
|
||||||
|
return ret
|
||||||
|
|
||||||
|
def _single_eval_forward(self, x, seqlen_offset=0):
|
||||||
|
assert self.scale is None
|
||||||
|
self._update_cos_sin_cache(x, seqlen_offset + x.shape[1])
|
||||||
|
return legacy_apply_rotary_embed(x, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:])
|
||||||
|
|
|
@ -62,7 +62,7 @@ class ScaleColumnParallelLinear(nn.Linear):
|
||||||
weight,
|
weight,
|
||||||
self.bias,
|
self.bias,
|
||||||
process_group=self.process_group,
|
process_group=self.process_group,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -111,7 +111,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
|
||||||
weight,
|
weight,
|
||||||
self.bias,
|
self.bias,
|
||||||
process_group=self.process_group,
|
process_group=self.process_group,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -173,7 +173,7 @@ class FeedForward(nn.Module):
|
||||||
hidden_features,
|
hidden_features,
|
||||||
process_group,
|
process_group,
|
||||||
bias,
|
bias,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
@ -182,7 +182,7 @@ class FeedForward(nn.Module):
|
||||||
hidden_features,
|
hidden_features,
|
||||||
process_group,
|
process_group,
|
||||||
bias,
|
bias,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
@ -191,7 +191,7 @@ class FeedForward(nn.Module):
|
||||||
out_features,
|
out_features,
|
||||||
process_group,
|
process_group,
|
||||||
bias=bias,
|
bias=bias,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
|
|
@ -176,7 +176,7 @@ class AccPerplex:
|
||||||
res.update(ds_acc)
|
res.update(ds_acc)
|
||||||
res.update(ds_tokens)
|
res.update(ds_tokens)
|
||||||
|
|
||||||
loss_res = self.loss_with_type_id.get_metric()
|
loss_res = self.loss_with_type_id.get_metric(reset)
|
||||||
res.update(loss_res)
|
res.update(loss_res)
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
|
@ -121,7 +121,7 @@ class PackedFlashBaseLayer1D(nn.Module):
|
||||||
process_group=gpc.get_group(ParallelMode.TENSOR),
|
process_group=gpc.get_group(ParallelMode.TENSOR),
|
||||||
bias1=False,
|
bias1=False,
|
||||||
bias2=False,
|
bias2=False,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
checkpoint_lvl=0,
|
checkpoint_lvl=0,
|
||||||
heuristic="auto",
|
heuristic="auto",
|
||||||
device=device,
|
device=device,
|
||||||
|
@ -294,7 +294,7 @@ class PackedFlashInternLm1D(nn.Module):
|
||||||
max_position_embeddings=-1,
|
max_position_embeddings=-1,
|
||||||
process_group=gpc.get_group(ParallelMode.TENSOR),
|
process_group=gpc.get_group(ParallelMode.TENSOR),
|
||||||
padding_idx=None,
|
padding_idx=None,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
device=device,
|
device=device,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
|
|
|
@ -82,7 +82,7 @@ class MHA(nn.Module):
|
||||||
3 * embed_dim,
|
3 * embed_dim,
|
||||||
process_group,
|
process_group,
|
||||||
bias=True,
|
bias=True,
|
||||||
sequence_parallel=gpc.config.model.sequence_parallel,
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
**factory_kwargs,
|
**factory_kwargs,
|
||||||
) # according to https://spaces.ac.cn/archives/9577
|
) # according to https://spaces.ac.cn/archives/9577
|
||||||
|
|
||||||
|
@ -95,7 +95,11 @@ class MHA(nn.Module):
|
||||||
|
|
||||||
# output projection always have the bias (for now)
|
# output projection always have the bias (for now)
|
||||||
self.out_proj = RowParallelLinearTorch(
|
self.out_proj = RowParallelLinearTorch(
|
||||||
embed_dim, embed_dim, process_group, sequence_parallel=gpc.config.model.sequence_parallel, **factory_kwargs
|
embed_dim,
|
||||||
|
embed_dim,
|
||||||
|
process_group,
|
||||||
|
sequence_parallel=gpc.config.parallel.sequence_parallel,
|
||||||
|
**factory_kwargs,
|
||||||
)
|
)
|
||||||
# need to assign tp attribute so that internlm know it is tensor parallel module
|
# need to assign tp attribute so that internlm know it is tensor parallel module
|
||||||
if gpc.get_world_size(ParallelMode.TENSOR) > 1:
|
if gpc.get_world_size(ParallelMode.TENSOR) > 1:
|
||||||
|
@ -128,7 +132,13 @@ class MHA(nn.Module):
|
||||||
qkv = self.rotary_emb(qkv, **kwargs)
|
qkv = self.rotary_emb(qkv, **kwargs)
|
||||||
|
|
||||||
if inference_params is None:
|
if inference_params is None:
|
||||||
context = self.inner_attn(qkv)
|
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
|
||||||
|
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
|
||||||
|
if qkv.dtype not in [torch.float16, torch.bfloat16]:
|
||||||
|
qkv = qkv.to(torch.bfloat16)
|
||||||
|
context = self.inner_attn(qkv).to(x.dtype)
|
||||||
|
else:
|
||||||
|
context = self.inner_attn(qkv)
|
||||||
else:
|
else:
|
||||||
q = qkv[:, :, 0]
|
q = qkv[:, :, 0]
|
||||||
assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
|
assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
|
||||||
|
@ -160,7 +170,14 @@ class MHA(nn.Module):
|
||||||
kwargs.pop("indexes")
|
kwargs.pop("indexes")
|
||||||
|
|
||||||
if inference_params is None:
|
if inference_params is None:
|
||||||
context = self.inner_attn(qkv, **kwargs)
|
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
|
||||||
|
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
|
||||||
|
if qkv.dtype not in [torch.float16, torch.bfloat16]:
|
||||||
|
qkv = qkv.to(torch.bfloat16)
|
||||||
|
context = self.inner_attn(qkv, **kwargs).to(x.dtype)
|
||||||
|
else:
|
||||||
|
context = self.inner_attn(qkv, **kwargs)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError("Not support this right now")
|
raise RuntimeError("Not support this right now")
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from functools import partial
|
from functools import partial
|
||||||
|
from itertools import product
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
|
@ -19,6 +20,7 @@ from internlm.solver.optimizer.store import (
|
||||||
)
|
)
|
||||||
from internlm.solver.optimizer.utils import (
|
from internlm.solver.optimizer.utils import (
|
||||||
DynamicGradScaler,
|
DynamicGradScaler,
|
||||||
|
ParamBcastSyncHandler,
|
||||||
flatten,
|
flatten,
|
||||||
get_grad_accumulate_object,
|
get_grad_accumulate_object,
|
||||||
has_inf_or_nan,
|
has_inf_or_nan,
|
||||||
|
@ -87,9 +89,9 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self,
|
self,
|
||||||
optimizer: Optimizer,
|
optimizer: Optimizer,
|
||||||
cpu_offload=False,
|
cpu_offload=False,
|
||||||
overlap_broadcast=False,
|
|
||||||
grad_scal_cfg: Config = None,
|
grad_scal_cfg: Config = None,
|
||||||
zero_cfg: Config = None,
|
zero_cfg: Config = None,
|
||||||
|
param_bcast_sync_handler: ParamBcastSyncHandler = None,
|
||||||
):
|
):
|
||||||
# DynamicGradScaler related args
|
# DynamicGradScaler related args
|
||||||
if gpc.config.model.dtype is torch.float32:
|
if gpc.config.model.dtype is torch.float32:
|
||||||
|
@ -104,9 +106,10 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
max_scale = grad_scal_cfg.max_scale
|
max_scale = grad_scal_cfg.max_scale
|
||||||
|
|
||||||
# Zero related args
|
# Zero related args
|
||||||
overlap_communication = zero_cfg.zero_overlap_communication
|
|
||||||
reduce_bucket_size = zero_cfg.reduce_bucket_size
|
reduce_bucket_size = zero_cfg.reduce_bucket_size
|
||||||
clip_grad_norm = zero_cfg.clip_grad_norm
|
clip_grad_norm = zero_cfg.clip_grad_norm
|
||||||
|
self._overlap_sync_grad = zero_cfg.overlap_sync_grad
|
||||||
|
self._overlap_sync_param = zero_cfg.overlap_sync_param
|
||||||
|
|
||||||
super().__init__(optim=optimizer)
|
super().__init__(optim=optimizer)
|
||||||
|
|
||||||
|
@ -127,7 +130,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self._fp32_flat_param_groups_of_current_rank = dict()
|
self._fp32_flat_param_groups_of_current_rank = dict()
|
||||||
|
|
||||||
# communication params
|
# communication params
|
||||||
self._overlap_communication = overlap_communication
|
# self._overlap_communication = overlap_communication
|
||||||
self._reduce_bucket_size = reduce_bucket_size
|
self._reduce_bucket_size = reduce_bucket_size
|
||||||
|
|
||||||
# gradient scaler
|
# gradient scaler
|
||||||
|
@ -158,7 +161,12 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
+ f"zo-{self._zero_local_rank}.pt"
|
+ f"zo-{self._zero_local_rank}.pt"
|
||||||
)
|
)
|
||||||
self.params_per_rank_id_dict = []
|
self.params_per_rank_id_dict = []
|
||||||
self.overlap_broadcast = overlap_broadcast
|
self._param_bcast_sync_handler = param_bcast_sync_handler
|
||||||
|
if self._overlap_sync_param:
|
||||||
|
assert self._param_bcast_sync_handler is not None
|
||||||
|
self._broadcast_comm_stream = torch.cuda.Stream()
|
||||||
|
else:
|
||||||
|
self._broadcast_comm_stream = torch.cuda.current_stream()
|
||||||
|
|
||||||
# iterate over the param group in the optimizer
|
# iterate over the param group in the optimizer
|
||||||
# partition these param groups for data parallel training
|
# partition these param groups for data parallel training
|
||||||
|
@ -228,12 +236,14 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
|
|
||||||
# initialize communication stream for
|
# initialize communication stream for
|
||||||
# communication-computation overlapping
|
# communication-computation overlapping
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
self._comm_stream = torch.cuda.Stream()
|
self._comm_stream = torch.cuda.Stream()
|
||||||
|
else:
|
||||||
|
self._comm_stream = torch.cuda.current_stream()
|
||||||
|
|
||||||
# reduction hook is only used if overlapping communication
|
# reduction hook is only used if overlapping communication
|
||||||
# if it is stage 1 without overlapping, no hook will be attached
|
# if it is stage 1 without overlapping, no hook will be attached
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
self._attach_reduction_hook()
|
self._attach_reduction_hook()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -267,8 +277,10 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
global_id = str(i)
|
global_id = str(i)
|
||||||
for j in range(len(param.size())):
|
for j in range(len(param.size())):
|
||||||
global_id = "_".join([global_id, str(param.size()[j])])
|
global_id = "_".join([global_id, str(param.size()[j])])
|
||||||
|
if self._overlap_sync_param:
|
||||||
rank_to_go = numel_per_rank.index(min(numel_per_rank))
|
rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param)
|
||||||
|
else:
|
||||||
|
rank_to_go = numel_per_rank.index(min(numel_per_rank))
|
||||||
params_per_rank[rank_to_go].append(param)
|
params_per_rank[rank_to_go].append(param)
|
||||||
self.params_per_rank_id_dict[-1][rank_to_go].append(global_id)
|
self.params_per_rank_id_dict[-1][rank_to_go].append(global_id)
|
||||||
numel_per_rank[rank_to_go] += param.numel()
|
numel_per_rank[rank_to_go] += param.numel()
|
||||||
|
@ -299,7 +311,9 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self._grad_store.add_accumulate_grad_object(accum_grad_obj)
|
self._grad_store.add_accumulate_grad_object(accum_grad_obj)
|
||||||
|
|
||||||
reduction_func = partial(
|
reduction_func = partial(
|
||||||
self._store_and_try_reduce_grads_by_bucket, param=param, reduce_rank=reduce_rank
|
self._store_and_try_reduce_grads_by_bucket,
|
||||||
|
param=param,
|
||||||
|
reduce_rank=reduce_rank,
|
||||||
)
|
)
|
||||||
|
|
||||||
# define hook
|
# define hook
|
||||||
|
@ -384,17 +398,17 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
|
self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
|
||||||
|
|
||||||
def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
|
def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
stream = self._comm_stream
|
self._comm_stream.synchronize()
|
||||||
stream.synchronize()
|
|
||||||
self._param_store.clear_grads_of_previous_reduced_params()
|
self._param_store.clear_grads_of_previous_reduced_params()
|
||||||
else:
|
|
||||||
stream = torch.cuda.current_stream()
|
|
||||||
|
|
||||||
with torch.cuda.stream(stream):
|
with torch.cuda.stream(self._comm_stream):
|
||||||
flat = bucket.flatten()
|
flat = bucket.flatten()
|
||||||
reduced_flat = reduce_tensor(
|
reduced_flat = reduce_tensor(
|
||||||
tensor=flat, dtype=self.dtype, dst_rank=reduce_rank, parallel_mode=ParallelMode.DATA
|
tensor=flat,
|
||||||
|
dtype=self.dtype,
|
||||||
|
dst_rank=reduce_rank,
|
||||||
|
parallel_mode=ParallelMode.DATA,
|
||||||
)
|
)
|
||||||
|
|
||||||
# update the reduced tensor
|
# update the reduced tensor
|
||||||
|
@ -483,6 +497,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
grads = [self.padding_grad]
|
grads = [self.padding_grad]
|
||||||
params = [self.padding_tensor]
|
params = [self.padding_tensor]
|
||||||
|
|
||||||
|
norm = 0
|
||||||
if self._clip_grad_norm > 0:
|
if self._clip_grad_norm > 0:
|
||||||
# this norm is before scaling, it will be very large
|
# this norm is before scaling, it will be very large
|
||||||
norm = compute_norm(
|
norm = compute_norm(
|
||||||
|
@ -507,7 +522,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
|
|
||||||
# if not overlapping communication (no reduction hook is attached)
|
# if not overlapping communication (no reduction hook is attached)
|
||||||
# we need to manually reduce these gradients
|
# we need to manually reduce these gradients
|
||||||
if not self._overlap_communication:
|
if not self._overlap_sync_grad:
|
||||||
for group_id in range(len(self._fp16_param_groups)):
|
for group_id in range(len(self._fp16_param_groups)):
|
||||||
for param in self._fp16_param_groups[group_id]:
|
for param in self._fp16_param_groups[group_id]:
|
||||||
if param.grad is not None:
|
if param.grad is not None:
|
||||||
|
@ -522,18 +537,21 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
|
groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
|
||||||
|
|
||||||
# clear reduced grads
|
# clear reduced grads
|
||||||
if self._overlap_communication:
|
if self._overlap_sync_grad:
|
||||||
# grads in the last bucket is reduced
|
# grads in the last bucket is reduced
|
||||||
self._comm_stream.synchronize()
|
self._comm_stream.synchronize()
|
||||||
self._param_store.clear_grads_of_previous_reduced_params()
|
self._param_store.clear_grads_of_previous_reduced_params()
|
||||||
|
|
||||||
# compute norm for gradients in the last bucket
|
# compute norm for gradients in the last bucket
|
||||||
total_norms = []
|
total_norms = {}
|
||||||
for group_id in range(self.num_param_groups):
|
for group_id in range(self.num_param_groups):
|
||||||
total_norms.append(
|
group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
|
||||||
self._compute_norm_with_stage(
|
group_name = f"{group_id}_{group_name}"
|
||||||
group_id=group_id, last_bucket=True, last_stage=True, previous_norm=groups_norms[group_id]
|
total_norms[group_name] = self._compute_norm_with_stage(
|
||||||
)
|
group_id=group_id,
|
||||||
|
last_bucket=True,
|
||||||
|
last_stage=True,
|
||||||
|
previous_norm=groups_norms[group_id],
|
||||||
)
|
)
|
||||||
|
|
||||||
timer("sync_grad").start()
|
timer("sync_grad").start()
|
||||||
|
@ -552,7 +570,7 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
# found_inf = self._check_overflow()
|
# found_inf = self._check_overflow()
|
||||||
# Because you may encounter inf when computing norm
|
# Because you may encounter inf when computing norm
|
||||||
|
|
||||||
if -1 in norms:
|
if -1 in norms.values():
|
||||||
found_inf = True
|
found_inf = True
|
||||||
|
|
||||||
loss_scale = float(self.loss_scale.item()) # backup
|
loss_scale = float(self.loss_scale.item()) # backup
|
||||||
|
@ -562,10 +580,13 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
if found_inf:
|
if found_inf:
|
||||||
if gpc.is_rank_for_log():
|
if gpc.is_rank_for_log():
|
||||||
logger.warning("Overflow occurs, please check it.")
|
logger.warning("Overflow occurs, please check it.")
|
||||||
send_alert_message(address=gpc.config.alert_address, message="Overflow occurs, please check it.")
|
send_alert_message(
|
||||||
|
address=gpc.config.alert_address,
|
||||||
|
message="Overflow occurs, please check it.",
|
||||||
|
)
|
||||||
self._grad_store._averaged_gradients = dict()
|
self._grad_store._averaged_gradients = dict()
|
||||||
self.zero_grad()
|
self.zero_grad()
|
||||||
return False, None
|
return False, norms
|
||||||
|
|
||||||
# copy the grad of fp16 param to fp32 param
|
# copy the grad of fp16 param to fp32 param
|
||||||
single_grad_partition_groups = []
|
single_grad_partition_groups = []
|
||||||
|
@ -597,15 +618,17 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
|
|
||||||
# unscale and clip grads
|
# unscale and clip grads
|
||||||
# get the global norm
|
# get the global norm
|
||||||
global_norm_groups = []
|
global_norm_groups = {}
|
||||||
if self._clip_grad_norm > 0:
|
if self._clip_grad_norm > 0:
|
||||||
for norm in norms:
|
for group_name, norm in norms.items():
|
||||||
global_norm_groups.append(norm**0.5)
|
global_norm_groups[group_name] = norm**0.5
|
||||||
|
|
||||||
# the following operations are performed only on the rank to which parameters are assigned.
|
# the following operations are performed only on the rank to which parameters are assigned.
|
||||||
if gpc.config.model.dtype is not torch.float32:
|
if gpc.config.model.dtype is not torch.float32:
|
||||||
if len(single_grad_partition_groups) != 0:
|
if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
|
||||||
self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale)
|
self._unscale_and_clip_grads(
|
||||||
|
single_grad_partition_groups, list(global_norm_groups.values()), loss_scale
|
||||||
|
)
|
||||||
|
|
||||||
# update the parameters
|
# update the parameters
|
||||||
timer("step").start()
|
timer("step").start()
|
||||||
|
@ -625,35 +648,42 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
|
fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
|
||||||
fp16_param.data.copy_(fp32_param)
|
fp16_param.data.copy_(fp32_param)
|
||||||
|
|
||||||
# TODO: support broadcast overlap
|
with torch.cuda.stream(self._broadcast_comm_stream):
|
||||||
self.broadcast_params(overlap=False)
|
self.broadcast_params()
|
||||||
|
|
||||||
timer("step").stop()
|
timer("step").stop()
|
||||||
|
|
||||||
# update gradients may not be needed here, because the sync_params function is used in initialization,
|
# update gradients may not be needed here, because the sync_params function is used in initialization,
|
||||||
# so synchronization is maintained
|
# so synchronization is maintained
|
||||||
return True, [global_norm / loss_scale for global_norm in global_norm_groups]
|
for group_name, global_norm in global_norm_groups.items():
|
||||||
|
global_norm_groups[group_name] = global_norm / loss_scale
|
||||||
|
return True, global_norm_groups
|
||||||
|
|
||||||
def broadcast_params(self, overlap=False):
|
def broadcast_params(self):
|
||||||
handles = []
|
handles = []
|
||||||
|
|
||||||
for group_id in range(self.num_param_groups):
|
for rank, group_id in product(range(self._zero_world_size), range(self.num_param_groups)):
|
||||||
for rank in range(self._zero_world_size):
|
# The following operations are performed only on the rank to which parameters are assigned.
|
||||||
# The following operations are performed only on the rank to which parameters are assigned.
|
if rank in self.param_group_no_params_ranks[group_id]:
|
||||||
if rank not in self.param_group_no_params_ranks[group_id]:
|
continue
|
||||||
fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
|
fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
|
||||||
# grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank
|
# grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank
|
||||||
# assert grank == rank, f"{grank} == {rank}"
|
# assert grank == rank, f"{grank} == {rank}"
|
||||||
g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode)[rank]
|
g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode)[rank]
|
||||||
handle = dist.broadcast(
|
handle = dist.broadcast(
|
||||||
fp16_param, src=g_rank, group=gpc.get_group(ParallelMode.ZERO1), async_op=True
|
fp16_param,
|
||||||
)
|
src=g_rank,
|
||||||
handles.append(handle)
|
group=gpc.get_group(ParallelMode.ZERO1),
|
||||||
|
async_op=True,
|
||||||
|
)
|
||||||
|
|
||||||
if not overlap:
|
if self._overlap_sync_param:
|
||||||
for handle in handles:
|
self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
|
||||||
handle.wait()
|
else:
|
||||||
else:
|
handles.append(handle)
|
||||||
return handles
|
|
||||||
|
for handle in handles:
|
||||||
|
handle.wait()
|
||||||
|
|
||||||
##################
|
##################
|
||||||
# FP16 Utilities #
|
# FP16 Utilities #
|
||||||
|
@ -671,7 +701,11 @@ class HybridZeroOptimizer(BaseOptimizer):
|
||||||
if avg_grad is not None and has_inf_or_nan(avg_grad):
|
if avg_grad is not None and has_inf_or_nan(avg_grad):
|
||||||
self._found_overflow.fill_(1.0)
|
self._found_overflow.fill_(1.0)
|
||||||
break
|
break
|
||||||
dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.GLOBAL))
|
dist.all_reduce(
|
||||||
|
self._found_overflow,
|
||||||
|
op=dist.ReduceOp.MAX,
|
||||||
|
group=gpc.get_group(ParallelMode.GLOBAL),
|
||||||
|
)
|
||||||
|
|
||||||
return self._found_overflow.item() > 0
|
return self._found_overflow.item() > 0
|
||||||
|
|
||||||
|
|
|
@ -3,15 +3,18 @@
|
||||||
|
|
||||||
import math
|
import math
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Dict, Optional
|
from collections import OrderedDict
|
||||||
|
from functools import partial
|
||||||
|
from typing import Any, Dict, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from torch import Tensor
|
from torch import Tensor, nn
|
||||||
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
|
||||||
|
|
||||||
from internlm.core.context import ParallelMode
|
from internlm.core.context import ParallelMode
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.core.naive_amp import NaiveAMPModel
|
||||||
from internlm.utils.common import get_tensor_norm, move_norm_to_cuda
|
from internlm.utils.common import get_tensor_norm, move_norm_to_cuda
|
||||||
from internlm.utils.logger import get_logger
|
from internlm.utils.logger import get_logger
|
||||||
from internlm.utils.parallel import is_model_parallel_parameter
|
from internlm.utils.parallel import is_model_parallel_parameter
|
||||||
|
@ -60,12 +63,19 @@ def get_grad_accumulate_object(tensor):
|
||||||
|
|
||||||
|
|
||||||
def split_half_float_double(tensor_list):
|
def split_half_float_double(tensor_list):
|
||||||
dtypes = ["torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor", "torch.cuda.BFloat16Tensor"]
|
dtype_buckets = {
|
||||||
buckets = []
|
"torch.cuda.HalfTensor": [],
|
||||||
for _, dtype in enumerate(dtypes):
|
"torch.cuda.FloatTensor": [],
|
||||||
bucket = [t for t in tensor_list if t.type() == dtype]
|
"torch.cuda.DoubleTensor": [],
|
||||||
if bucket:
|
"torch.cuda.BFloat16Tensor": [],
|
||||||
buckets.append(bucket)
|
}
|
||||||
|
|
||||||
|
for t in tensor_list:
|
||||||
|
dtype = t.type()
|
||||||
|
if dtype in dtype_buckets:
|
||||||
|
dtype_buckets[dtype].append(t)
|
||||||
|
|
||||||
|
buckets = [bucket for bucket in dtype_buckets.values() if bucket]
|
||||||
return buckets
|
return buckets
|
||||||
|
|
||||||
|
|
||||||
|
@ -184,7 +194,10 @@ def calc_l2_norm(grads):
|
||||||
if APEX_AVAILABLE:
|
if APEX_AVAILABLE:
|
||||||
dummy_overflow_buf = torch.cuda.IntTensor([0])
|
dummy_overflow_buf = torch.cuda.IntTensor([0])
|
||||||
norm, _ = multi_tensor_applier(
|
norm, _ = multi_tensor_applier(
|
||||||
amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads], False # no per-parameter norm
|
amp_C.multi_tensor_l2norm,
|
||||||
|
dummy_overflow_buf,
|
||||||
|
[grads],
|
||||||
|
False, # no per-parameter norm
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
norm, _ = multi_tensor_l2norm_torch(grads, False)
|
norm, _ = multi_tensor_l2norm_torch(grads, False)
|
||||||
|
@ -228,7 +241,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
|
||||||
|
|
||||||
# Take max across all model-parallel GPUs.
|
# Take max across all model-parallel GPUs.
|
||||||
if gpc.get_world_size(ParallelMode.MODEL) > 1:
|
if gpc.get_world_size(ParallelMode.MODEL) > 1:
|
||||||
dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.MODEL))
|
dist.all_reduce(
|
||||||
|
total_norm_cuda,
|
||||||
|
op=dist.ReduceOp.MAX,
|
||||||
|
group=gpc.get_group(ParallelMode.MODEL),
|
||||||
|
)
|
||||||
total_norm = total_norm_cuda[0].item()
|
total_norm = total_norm_cuda[0].item()
|
||||||
else:
|
else:
|
||||||
tensor_parallel_grads = []
|
tensor_parallel_grads = []
|
||||||
|
@ -280,7 +297,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
|
||||||
|
|
||||||
# Sum across all model-parallel GPUs.
|
# Sum across all model-parallel GPUs.
|
||||||
if gpc.is_initialized(ParallelMode.MODEL):
|
if gpc.is_initialized(ParallelMode.MODEL):
|
||||||
dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.MODEL))
|
dist.all_reduce(
|
||||||
|
total_norm,
|
||||||
|
op=dist.ReduceOp.SUM,
|
||||||
|
group=gpc.get_group(ParallelMode.MODEL),
|
||||||
|
)
|
||||||
|
|
||||||
# This is because we use zero1, so we need to use this reduction.
|
# This is because we use zero1, so we need to use this reduction.
|
||||||
# TODO: Check zero group to be a subset of dp group.
|
# TODO: Check zero group to be a subset of dp group.
|
||||||
|
@ -459,3 +480,90 @@ class DynamicGradScaler(BaseGradScaler):
|
||||||
self._scale = self._scale.fill_(state_dict["_scale"])
|
self._scale = self._scale.fill_(state_dict["_scale"])
|
||||||
self._growth_step = state_dict["_growth_step"]
|
self._growth_step = state_dict["_growth_step"]
|
||||||
self._hysteresis_step = state_dict["_hysteresis_step"]
|
self._hysteresis_step = state_dict["_hysteresis_step"]
|
||||||
|
|
||||||
|
|
||||||
|
class ParamBcastSyncHandler:
|
||||||
|
"""
|
||||||
|
Model Partition Handler for overlap broadcast with forward
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, model: Union[nn.Module, nn.ModuleList]) -> None:
|
||||||
|
self._block_to_param = OrderedDict() # <key: nn.Module> <value: list(param)>
|
||||||
|
self._param_to_rank = dict() # <key: param> <value: rank)>
|
||||||
|
self._block_to_rank = dict() # <key: nn.Module> <value: rank)>
|
||||||
|
self._bcast_handles = dict() # <key: rank> <value: list(bcast handles))>
|
||||||
|
|
||||||
|
zero1_size = gpc.get_world_size(ParallelMode.ZERO1)
|
||||||
|
total_param_num = sum(p.numel() for p in model.parameters())
|
||||||
|
avg_param_num = total_param_num * 1.0 // zero1_size
|
||||||
|
|
||||||
|
# just want to share same for loop for ModuleList and Module
|
||||||
|
if not isinstance(model, nn.ModuleList):
|
||||||
|
model = [model]
|
||||||
|
|
||||||
|
# record the parameters to transformer/embeding/head/norm block
|
||||||
|
for _chunk in model:
|
||||||
|
if isinstance(_chunk, NaiveAMPModel):
|
||||||
|
_chunk = _chunk.model
|
||||||
|
|
||||||
|
for _, children in _chunk.named_children():
|
||||||
|
# should be the transformer block definaton in modeling_xxx.py
|
||||||
|
if isinstance(children, nn.ModuleList):
|
||||||
|
# record the block that a parameter belongs to
|
||||||
|
for _, block in enumerate(children):
|
||||||
|
# self._block_to_param[f"{name}.{idx}"] = list(block.parameters())
|
||||||
|
self._block_to_param[block] = list(block.parameters())
|
||||||
|
else:
|
||||||
|
# record the block that a parameter belongs to
|
||||||
|
# self._block_to_param[name] = list(children.parameters())
|
||||||
|
self._block_to_param[children] = list(children.parameters())
|
||||||
|
|
||||||
|
alloc_num = 0
|
||||||
|
rank_to_go = 0
|
||||||
|
|
||||||
|
# process the parameters in block_to_param sequencially,
|
||||||
|
# allocate each parameter to a local rank of ParallelMode.ZERO1,
|
||||||
|
# NOTE that we do NOT consider following scenarios:
|
||||||
|
# 1) whether a parameter is trainable;
|
||||||
|
# 2) paramters maybe in different optimizer group
|
||||||
|
for block, params in self._block_to_param.items():
|
||||||
|
# allocate a model block to a local rank of ParallelMode.ZERO1
|
||||||
|
self._block_to_rank[block] = [rank_to_go]
|
||||||
|
for p in params:
|
||||||
|
alloc_num = alloc_num + p.numel()
|
||||||
|
# in this case, allocate the param to next rank if possible
|
||||||
|
if alloc_num > avg_param_num * 1.01 and rank_to_go < zero1_size - 1:
|
||||||
|
rank_to_go = rank_to_go + 1
|
||||||
|
alloc_num = 0
|
||||||
|
self._block_to_rank[block].append(rank_to_go)
|
||||||
|
# allocate a parameter to a local rank of ParallelMode.ZERO1
|
||||||
|
self._param_to_rank[p] = rank_to_go
|
||||||
|
|
||||||
|
# initialize an empty list for _bcast_handles of each rank
|
||||||
|
for rank in range(gpc.get_world_size(ParallelMode.ZERO1)):
|
||||||
|
self._bcast_handles[rank] = []
|
||||||
|
|
||||||
|
# register_forward_pre_hook for transformer/embeding/norm/xxx block
|
||||||
|
self._register_sync_parameters_hook()
|
||||||
|
|
||||||
|
def _register_sync_parameters_hook(self) -> None:
|
||||||
|
def _pre_forward_hook(model: nn.Module, inputs: Any): # pylint: disable=W0613
|
||||||
|
bcast_handles = []
|
||||||
|
# gather all required broadcast hanles into a list
|
||||||
|
for rank in self._block_to_rank[model]:
|
||||||
|
bcast_handles.extend(self._bcast_handles[rank])
|
||||||
|
# need to clear _bcast_handles since they would be processed later
|
||||||
|
self._bcast_handles[rank] = []
|
||||||
|
# wait all required broadcast handles to be completed
|
||||||
|
for handle in bcast_handles:
|
||||||
|
handle.wait()
|
||||||
|
|
||||||
|
# register_forward_pre_hook for transformer/embeding/norm/xxx block
|
||||||
|
for block, _ in self._block_to_rank.items():
|
||||||
|
block.register_forward_pre_hook(partial(_pre_forward_hook))
|
||||||
|
|
||||||
|
def get_rank_by_param(self, param) -> int:
|
||||||
|
return self._param_to_rank[param]
|
||||||
|
|
||||||
|
def add_bcast_handle(self, rank, handle) -> None:
|
||||||
|
self._bcast_handles[rank].append(handle)
|
||||||
|
|
|
@ -0,0 +1,19 @@
|
||||||
|
from .training_internlm import (
|
||||||
|
get_train_data_loader,
|
||||||
|
get_validation_data_loader,
|
||||||
|
initialize_llm_profile,
|
||||||
|
initialize_model,
|
||||||
|
initialize_optimizer,
|
||||||
|
load_new_batch,
|
||||||
|
record_current_batch_training_metrics,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"get_train_data_loader",
|
||||||
|
"get_validation_data_loader",
|
||||||
|
"initialize_llm_profile",
|
||||||
|
"initialize_model",
|
||||||
|
"initialize_optimizer",
|
||||||
|
"load_new_batch",
|
||||||
|
"record_current_batch_training_metrics",
|
||||||
|
]
|
|
@ -0,0 +1,422 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
|
import time
|
||||||
|
from functools import partial
|
||||||
|
from typing import Callable, Iterable, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from torch import nn
|
||||||
|
from torch.utils.data import ConcatDataset, DataLoader
|
||||||
|
|
||||||
|
from internlm.core.context import ParallelMode
|
||||||
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.core.naive_amp import NaiveAMPModel
|
||||||
|
from internlm.core.trainer import TrainState
|
||||||
|
from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
|
||||||
|
from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
|
||||||
|
from internlm.data.dataset import get_dataset_dict
|
||||||
|
from internlm.data.dummy_dataset import RandomDataset
|
||||||
|
from internlm.data.packed_dataset import (
|
||||||
|
PackedDataset,
|
||||||
|
PackedDatasetWithoutCuSeqlen,
|
||||||
|
get_packed_dataset_without_short_length,
|
||||||
|
)
|
||||||
|
from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
|
||||||
|
from internlm.monitor import set_env_var
|
||||||
|
from internlm.monitor.monitor import monitor_manager as mm
|
||||||
|
from internlm.solver.beta2_scheduler import Beta2Scheduler
|
||||||
|
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
|
||||||
|
from internlm.solver.optimizer import HybridZeroOptimizer
|
||||||
|
from internlm.solver.optimizer.utils import ParamBcastSyncHandler
|
||||||
|
from internlm.utils.common import DummyProfile
|
||||||
|
from internlm.utils.logger import get_logger
|
||||||
|
from internlm.utils.megatron_timers import megatron_timer as timer
|
||||||
|
from internlm.utils.parallel import (
|
||||||
|
is_no_pp_or_last_stage,
|
||||||
|
sync_model_param,
|
||||||
|
sync_model_param_within_tp,
|
||||||
|
)
|
||||||
|
from internlm.utils.registry import MODEL_INITIALIZER
|
||||||
|
|
||||||
|
logger = get_logger(__file__)
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_model():
|
||||||
|
"""
|
||||||
|
Initialize model.
|
||||||
|
|
||||||
|
Returns: The neural network model to be trained or evaluated.
|
||||||
|
"""
|
||||||
|
|
||||||
|
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
|
||||||
|
if isinstance(model, nn.ModuleList):
|
||||||
|
model = nn.ModuleList(
|
||||||
|
[
|
||||||
|
NaiveAMPModel(
|
||||||
|
model=_m,
|
||||||
|
output_to_fp32=False, # manually controlled by interleaved pipleline scheduler
|
||||||
|
dtype=gpc.config.model.get("dtype", torch.half),
|
||||||
|
sync_buffer=False,
|
||||||
|
)
|
||||||
|
for _m in model
|
||||||
|
]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model = NaiveAMPModel(
|
||||||
|
model=model,
|
||||||
|
output_to_fp32=is_no_pp_or_last_stage(),
|
||||||
|
dtype=gpc.config.model.get("dtype", torch.half),
|
||||||
|
sync_buffer=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
# This sync is very important, cause the model weights kept in optimizer are copied
|
||||||
|
# from the origin parameters in the memory, so we should make sure the dp sync
|
||||||
|
# does not influence the model weights in optimizer be different with the origin parameters.
|
||||||
|
sync_model_param(model, parallel_mode=ParallelMode.DATA)
|
||||||
|
|
||||||
|
# This function is needed to make sure parameters that are not splitted by tensor parallelism are
|
||||||
|
# the same across tensor parallelism.
|
||||||
|
sync_model_param_within_tp(model)
|
||||||
|
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
|
||||||
|
"""
|
||||||
|
Initialize optimizer.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model (torch.nn.Module): Your model instance to be trained or evaluated.
|
||||||
|
|
||||||
|
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
|
||||||
|
"""
|
||||||
|
if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
|
||||||
|
param_bcast_sync_handler = ParamBcastSyncHandler(model)
|
||||||
|
else:
|
||||||
|
param_bcast_sync_handler = None
|
||||||
|
|
||||||
|
adam_cfg = gpc.config.adam
|
||||||
|
naive_optimizer = torch.optim.AdamW(
|
||||||
|
params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
|
||||||
|
lr=adam_cfg.lr,
|
||||||
|
betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
|
||||||
|
eps=adam_cfg.adam_eps,
|
||||||
|
)
|
||||||
|
|
||||||
|
optimizer = HybridZeroOptimizer(
|
||||||
|
naive_optimizer,
|
||||||
|
grad_scal_cfg=gpc.config.grad_scaler,
|
||||||
|
zero_cfg=gpc.config.hybrid_zero_optimizer,
|
||||||
|
param_bcast_sync_handler=param_bcast_sync_handler,
|
||||||
|
)
|
||||||
|
|
||||||
|
beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
|
||||||
|
|
||||||
|
lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
|
||||||
|
|
||||||
|
return optimizer, beta2_scheduler, lr_scheduler
|
||||||
|
|
||||||
|
|
||||||
|
def get_train_data_loader(
|
||||||
|
num_worker: int = 0, dataset_generate_func: Callable = None, train_sampler=None, train_collate_fn=None
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Generate and return the training data loader.
|
||||||
|
|
||||||
|
Returns: A tuple of (train_dl, dataset_types).
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Get the dataset types
|
||||||
|
dataset_types = None
|
||||||
|
dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
|
||||||
|
data_cfg = gpc.config.data
|
||||||
|
|
||||||
|
# Get the sample weight dictionary
|
||||||
|
train_folder = data_cfg.train_folder
|
||||||
|
|
||||||
|
if not train_folder:
|
||||||
|
train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
|
||||||
|
if data_cfg.pack_sample_into_one:
|
||||||
|
train_ds = PackedDatasetWithoutCuSeqlen(
|
||||||
|
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
train_ds = PackedDataset(
|
||||||
|
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if dataset_generate_func is not None:
|
||||||
|
train_ds = dataset_generate_func()
|
||||||
|
else:
|
||||||
|
train_ds = get_packed_dataset_without_short_length(
|
||||||
|
folder=data_cfg.train_folder,
|
||||||
|
packed_length=data_cfg.packed_length,
|
||||||
|
max_length_per_sample=data_cfg.seq_len,
|
||||||
|
show_progress=dist.get_rank() == 0,
|
||||||
|
min_length=data_cfg.min_length,
|
||||||
|
min_length_dict=data_cfg.get("min_length_dict", {}),
|
||||||
|
pack_into_one_sample=data_cfg.pack_sample_into_one,
|
||||||
|
)
|
||||||
|
|
||||||
|
if dataset_generate_func is None or not train_folder:
|
||||||
|
# partition already completed
|
||||||
|
assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen, ConcatDataset))
|
||||||
|
# Create the training dataset sampler
|
||||||
|
train_sampler = StaticBatchSampler(
|
||||||
|
train_ds.datasets if isinstance(train_ds, ConcatDataset) else [train_ds],
|
||||||
|
batch_size=data_cfg.micro_num,
|
||||||
|
rampup_batch_size=data_cfg.rampup_batch_size,
|
||||||
|
micro_bsz=data_cfg.micro_bsz,
|
||||||
|
seed=1024,
|
||||||
|
drop_last=True,
|
||||||
|
data_rank=gpc.get_local_rank(ParallelMode.DATA),
|
||||||
|
data_world_size=gpc.get_world_size(ParallelMode.DATA),
|
||||||
|
)
|
||||||
|
|
||||||
|
if dataset_generate_func is None or not train_folder:
|
||||||
|
train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
|
||||||
|
|
||||||
|
# Create the training data loader
|
||||||
|
train_dl = DataLoader(
|
||||||
|
dataset=train_ds,
|
||||||
|
batch_sampler=train_sampler,
|
||||||
|
num_workers=num_worker,
|
||||||
|
pin_memory=True,
|
||||||
|
collate_fn=train_collate_fn,
|
||||||
|
persistent_workers=num_worker > 0,
|
||||||
|
)
|
||||||
|
|
||||||
|
return train_dl, dataset_types
|
||||||
|
|
||||||
|
|
||||||
|
def get_validation_data_loader(
|
||||||
|
num_worker: int = 0, dataset_generate_func: Callable = None, val_collate_fn=None, dataloader_func=None
|
||||||
|
):
|
||||||
|
"""Generate and return the validation data loader."""
|
||||||
|
|
||||||
|
data_cfg = gpc.config.data
|
||||||
|
|
||||||
|
if not data_cfg.valid_folder:
|
||||||
|
val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
|
||||||
|
else:
|
||||||
|
if dataset_generate_func is not None:
|
||||||
|
assert val_collate_fn and dataloader_func is not None
|
||||||
|
val_ds = dataset_generate_func()
|
||||||
|
else:
|
||||||
|
val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
|
||||||
|
|
||||||
|
if not isinstance(val_ds, dict):
|
||||||
|
val_ds = {"val": val_ds}
|
||||||
|
|
||||||
|
if val_collate_fn is None or not data_cfg.valid_folder:
|
||||||
|
val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
|
||||||
|
|
||||||
|
val_dls = {}
|
||||||
|
for val_name, ds in val_ds.items():
|
||||||
|
if dataloader_func and data_cfg.valid_folder is not None:
|
||||||
|
val_dls[val_name] = dataloader_func(dataset=ds, collate_fn=val_collate_fn)
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"load validation dataset {val_name} with valid batch size {str(data_cfg.valid_micro_num)} and "
|
||||||
|
f"{ds.size} Byte samples."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# making the batch_size of validate larger can speed up the evaluation, but it should not be too large,
|
||||||
|
# otherwise too much data may be dropped
|
||||||
|
batch_size = min(
|
||||||
|
data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
|
||||||
|
)
|
||||||
|
batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
|
||||||
|
|
||||||
|
if batch_size == 0 and gpc.is_rank_for_log():
|
||||||
|
logger.info(f"skip validate {val_name}.")
|
||||||
|
continue
|
||||||
|
|
||||||
|
val_dls[val_name] = get_dpsampler_dataloader(
|
||||||
|
ds,
|
||||||
|
shuffle=False,
|
||||||
|
num_workers=num_worker,
|
||||||
|
batch_size=batch_size,
|
||||||
|
collate_fn=val_collate_fn,
|
||||||
|
drop_last=True,
|
||||||
|
) # drop_last=True, otherwise it may cause problems in the last batch
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
|
||||||
|
f"samples {str(len(val_dls[val_name]))}."
|
||||||
|
)
|
||||||
|
|
||||||
|
return val_dls
|
||||||
|
|
||||||
|
|
||||||
|
def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
|
||||||
|
"""
|
||||||
|
Load and return the new batch data based on training data loader.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
train_dl (torch.utils.data.DataLoader): Dataloader for training.
|
||||||
|
train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
|
||||||
|
train_state (TrainState): Current training state.
|
||||||
|
|
||||||
|
Returns: A batch data and the updated train_iter.
|
||||||
|
"""
|
||||||
|
|
||||||
|
timer("batch-gen").start()
|
||||||
|
try:
|
||||||
|
batch = next(train_iter) # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
|
||||||
|
if hasattr(train_state, "batch_sampler_iter"):
|
||||||
|
next(train_state.batch_sampler_iter)
|
||||||
|
except StopIteration:
|
||||||
|
train_iter = iter(train_dl)
|
||||||
|
batch = next(train_iter)
|
||||||
|
train_state.num_consumed_samples_in_epoch = 0
|
||||||
|
if hasattr(train_state, "batch_sampler"):
|
||||||
|
train_state.batch_sampler_iter = iter(train_state.batch_sampler)
|
||||||
|
next(train_state.batch_sampler_iter)
|
||||||
|
timer("batch-gen").stop()
|
||||||
|
|
||||||
|
if batch[0].get("type_ids", None) is not None:
|
||||||
|
# if use_flash_attn is False, we need to unpack type_ids
|
||||||
|
if not gpc.config.model.use_flash_attn:
|
||||||
|
batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
|
||||||
|
|
||||||
|
return batch, train_iter
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_llm_profile(profiling: bool = False, start_time: str = None):
|
||||||
|
"""Initialize and return the profiler context manager instance."""
|
||||||
|
|
||||||
|
if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
|
||||||
|
llm_profile = torch.profiler.profile
|
||||||
|
logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
|
||||||
|
else:
|
||||||
|
llm_profile = DummyProfile
|
||||||
|
|
||||||
|
return llm_profile(
|
||||||
|
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
|
||||||
|
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
|
||||||
|
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||||
|
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
|
||||||
|
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
|
||||||
|
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
|
||||||
|
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
|
||||||
|
),
|
||||||
|
with_stack=True,
|
||||||
|
with_modules=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def record_current_batch_training_metrics(
|
||||||
|
get_tflops_func,
|
||||||
|
logger,
|
||||||
|
writer,
|
||||||
|
success_update,
|
||||||
|
batch_count,
|
||||||
|
batch,
|
||||||
|
train_state,
|
||||||
|
optimizer,
|
||||||
|
beta2_scheduler,
|
||||||
|
trainer,
|
||||||
|
start_time,
|
||||||
|
loss,
|
||||||
|
grad_norm,
|
||||||
|
metric,
|
||||||
|
update_panel,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Print some training metrics of the current batch.
|
||||||
|
"""
|
||||||
|
|
||||||
|
set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
|
||||||
|
|
||||||
|
if success_update in (0, True):
|
||||||
|
train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
|
||||||
|
if is_no_pp_or_last_stage():
|
||||||
|
acc_perplex = metric.get_metric()
|
||||||
|
|
||||||
|
if success_update and gpc.is_rank_for_log():
|
||||||
|
lr = optimizer.param_groups[0]["lr"]
|
||||||
|
if hasattr(trainer.engine.optimizer, "grad_scaler"):
|
||||||
|
scaler = trainer.engine.optimizer.grad_scaler._scale.item()
|
||||||
|
elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
|
||||||
|
scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
|
||||||
|
|
||||||
|
num_tokens_in_batch = batch[1].nelement()
|
||||||
|
num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
|
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
|
||||||
|
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
|
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
|
||||||
|
|
||||||
|
tk_per_gpu = 0
|
||||||
|
tk_per_gpu = round(
|
||||||
|
num_tokens_in_batch
|
||||||
|
* gpc.get_world_size(ParallelMode.DATA)
|
||||||
|
/ gpc.get_world_size(ParallelMode.GLOBAL)
|
||||||
|
/ (time.time() - start_time),
|
||||||
|
2,
|
||||||
|
)
|
||||||
|
|
||||||
|
tflops = get_tflops_func((time.time() - start_time))
|
||||||
|
|
||||||
|
infos = {
|
||||||
|
"tflops": tflops,
|
||||||
|
"step": batch_count,
|
||||||
|
"loss": loss.item(),
|
||||||
|
"tgs (tokens/gpu/second)": tk_per_gpu,
|
||||||
|
"lr": lr,
|
||||||
|
"loss_scale": scaler,
|
||||||
|
"grad_norm": grad_norm,
|
||||||
|
}
|
||||||
|
|
||||||
|
infos["micro_num"] = len(batch[1])
|
||||||
|
infos["num_consumed_tokens"] = train_state.num_consumed_tokens
|
||||||
|
infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
|
||||||
|
infos["num_samples_in_batch"] = num_samples_in_batch # the number of batches which have the most samples
|
||||||
|
infos["largest_length"] = max_length_in_batch # the longest input
|
||||||
|
infos["largest_batch"] = max_samples_in_batch # the batch with the most samples
|
||||||
|
infos["smallest_batch"] = min_samples_in_batch
|
||||||
|
infos["adam_beta2"] = beta2_scheduler.get_beta2()
|
||||||
|
|
||||||
|
fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
|
||||||
|
infos["fwd_bwd_time"] = fwd_bwd_time
|
||||||
|
|
||||||
|
for key, value in acc_perplex.items():
|
||||||
|
infos[key] = value
|
||||||
|
|
||||||
|
line = ""
|
||||||
|
for key, value in infos.items():
|
||||||
|
line += f"{key}={value} "
|
||||||
|
if isinstance(value, dict):
|
||||||
|
writer.add_scalars(key=key, value=value, step=train_state.step_count)
|
||||||
|
else:
|
||||||
|
writer.add_scalar(key=key, value=value, step=train_state.step_count)
|
||||||
|
|
||||||
|
if update_panel:
|
||||||
|
# metrics shown with dashboard panels
|
||||||
|
panel_metrics = {
|
||||||
|
"step": batch_count,
|
||||||
|
"lr": lr,
|
||||||
|
"num_consumed_tokens": train_state.num_consumed_tokens,
|
||||||
|
"loss": loss.item(),
|
||||||
|
"flops": tflops,
|
||||||
|
"tgs": tk_per_gpu,
|
||||||
|
"acc": acc_perplex["acc"],
|
||||||
|
"perplexity": acc_perplex["perplexity"],
|
||||||
|
"fwd_bwd_time": fwd_bwd_time,
|
||||||
|
}
|
||||||
|
for norm_key, norm_value in grad_norm.items():
|
||||||
|
panel_metrics[norm_key] = norm_value
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"{line}",
|
||||||
|
line=line,
|
||||||
|
extra=panel_metrics,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info(line)
|
||||||
|
|
||||||
|
# if loss spike occurs, send alert info to feishu
|
||||||
|
mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
|
|
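The tgs figure logged above is the per-GPU token throughput: tokens in the local batch, scaled up to the whole data-parallel group, divided by the total number of ranks and the step time. A worked example with assumed numbers:

```python
# All numbers below are assumptions for illustration only.
num_tokens_in_batch = 4 * 2048  # e.g. 4 micro batches of 2048 tokens on this dp rank
dp_world_size = 8               # data-parallel group size
global_world_size = 32          # total ranks (dp * tp * pp)
elapsed = 1.6                   # seconds since start_time for this step

tk_per_gpu = round(num_tokens_in_batch * dp_world_size / global_world_size / elapsed, 2)
print(tk_per_gpu)               # 8192 * 8 / 32 / 1.6 = 1280.0 tokens/gpu/second
```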
@ -52,12 +52,12 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
def switch_sequence_parallel_mode():
|
def switch_sequence_parallel_mode():
|
||||||
prev_mode = gpc.config.model.sequence_parallel
|
prev_mode = gpc.config.parallel.sequence_parallel
|
||||||
try:
|
try:
|
||||||
gpc.config.model.sequence_parallel = False
|
gpc.config.parallel.sequence_parallel = False
|
||||||
yield
|
yield
|
||||||
finally:
|
finally:
|
||||||
gpc.config.model.sequence_parallel = prev_mode
|
gpc.config.parallel.sequence_parallel = prev_mode
|
||||||
|
|
||||||
|
|
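switch_sequence_parallel_mode is the usual save/override/restore context-manager pattern; the try/finally guarantees the flag is restored even if evaluation raises. A generic, self-contained version of the same pattern (the dict stands in for gpc.config.parallel):

```python
from contextlib import contextmanager

# Stash the current flag, override it for the duration of the block, and restore
# it in `finally` so an exception inside the block cannot leak the temporary value.
config = {"sequence_parallel": True}  # stand-in for gpc.config.parallel

@contextmanager
def override_flag(cfg, key, value):
    prev = cfg[key]
    try:
        cfg[key] = value
        yield
    finally:
        cfg[key] = prev

with override_flag(config, "sequence_parallel", False):
    assert config["sequence_parallel"] is False  # evaluation would run here
assert config["sequence_parallel"] is True       # restored afterwards
```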
||||||
def evaluate_on_val_dls(
|
def evaluate_on_val_dls(
|
||||||
|
@ -67,6 +67,7 @@ def evaluate_on_val_dls(
|
||||||
logger,
|
logger,
|
||||||
step_count,
|
step_count,
|
||||||
update_panel: bool = False,
|
update_panel: bool = False,
|
||||||
|
streaming: bool = False,
|
||||||
):
|
):
|
||||||
with switch_sequence_parallel_mode():
|
with switch_sequence_parallel_mode():
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
|
@ -75,7 +76,7 @@ def evaluate_on_val_dls(
|
||||||
data_cfg = gpc.config.data
|
data_cfg = gpc.config.data
|
||||||
|
|
||||||
for val_name, val_dl in val_dls.items():
|
for val_name, val_dl in val_dls.items():
|
||||||
if len(val_dl) == 0 and verbose:
|
if not streaming and len(val_dl) == 0 and verbose:
|
||||||
logger.info(f"Validation dataset: {val_name} is empty")
|
logger.info(f"Validation dataset: {val_name} is empty")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -91,7 +92,7 @@ def evaluate_on_val_dls(
|
||||||
for val_idx, batch in tqdm(
|
for val_idx, batch in tqdm(
|
||||||
enumerate(val_dl),
|
enumerate(val_dl),
|
||||||
desc="Val.",
|
desc="Val.",
|
||||||
total=len(val_dl),
|
total=len(val_dl) if not streaming else None,
|
||||||
position=1,
|
position=1,
|
||||||
disable=not verbose,
|
disable=not verbose,
|
||||||
leave=False,
|
leave=False,
|
||||||
|
@ -135,7 +136,7 @@ def evaluate_on_val_dls(
|
||||||
dist.barrier()
|
dist.barrier()
|
||||||
|
|
||||||
val_res = val_metric.get_metric()
|
val_res = val_metric.get_metric()
|
||||||
if verbose and len(val_dl) != 0:
|
if verbose and (streaming or len(val_dl) != 0):
|
||||||
val_loss = val_loss / (val_idx + 1 + 1e-6)
|
val_loss = val_loss / (val_idx + 1 + 1e-6)
|
||||||
infos = {
|
infos = {
|
||||||
"step": step_count,
|
"step": step_count,
|
||||||
|
|
|
@ -0,0 +1,163 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
|
import math
|
||||||
|
import socket
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.distributed as dist
|
||||||
|
from flash_attn.modules.mha import FlashSelfAttention, SelfAttention
|
||||||
|
from torch.utils import benchmark
|
||||||
|
|
||||||
|
from internlm.utils.logger import get_logger
|
||||||
|
|
||||||
|
try:
|
||||||
|
import GPUtil
|
||||||
|
import psutil
|
||||||
|
except ImportError:
|
||||||
|
GPUtil, psutil = None, None
|
||||||
|
|
||||||
|
from internlm.core.context import ParallelMode
|
||||||
|
from internlm.core.context import global_context as gpc
|
||||||
|
from internlm.utils.common import get_current_device
|
||||||
|
|
||||||
|
logger = get_logger(__file__)
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark_forward(
|
||||||
|
test_fn,
|
||||||
|
*inputs,
|
||||||
|
repeats=100,
|
||||||
|
amp=True,
|
||||||
|
amp_dtype=torch.float16,
|
||||||
|
**kwinputs,
|
||||||
|
):
|
||||||
|
"""Use Pytorch Benchmark on the forward pass of an arbitrary function."""
|
||||||
|
|
||||||
|
def amp_wrapper(*inputs, **kwinputs):
|
||||||
|
with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
|
||||||
|
test_fn(*inputs, **kwinputs)
|
||||||
|
|
||||||
|
bench_timer = benchmark.Timer(
|
||||||
|
stmt="test_fn_amp(*inputs, **kwinputs)",
|
||||||
|
globals={"test_fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
|
||||||
|
num_threads=torch.get_num_threads(),
|
||||||
|
)
|
||||||
|
used_time = bench_timer.timeit(repeats)
|
||||||
|
return used_time.mean
|
||||||
|
|
||||||
|
|
||||||
|
def flops(batch, seqlen, headdim, nheads, time_f):
|
||||||
|
"""Compute the flops value of a GPU with give flashattention function"""
|
||||||
|
|
||||||
|
flop = 4 * batch * seqlen**2 * nheads * headdim
|
||||||
|
return (flop / time_f / 10**12) if not math.isnan(time_f) else 0.0
|
||||||
|
|
||||||
|
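For reference, plugging the shapes bench_gpu uses below into this formula, together with an assumed forward time (the real value depends on the GPU), gives the kind of number the benchmark reports:

```python
import math

# Assumed shapes (matching bench_gpu) and a hypothetical 0.5 ms forward pass.
batch, seqlen, headdim = 2, 1024, 64
nheads = 2048 // headdim        # dim = 2048 -> 32 heads
time_f = 5e-4                   # hypothetical timing in seconds

flop = 4 * batch * seqlen**2 * nheads * headdim  # 17,179,869,184 FLOPs
tflops = (flop / time_f / 10**12) if not math.isnan(time_f) else 0.0
print(f"{tflops:.2f} TFLOPS")   # about 34.36 under these assumptions
```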
|
||||||
|
def get_gpu_temperature():
|
||||||
|
"""Get current GPU temperature."""
|
||||||
|
try:
|
||||||
|
gpu_id = torch.cuda.current_device()
|
||||||
|
except AssertionError:
|
||||||
|
gpu_id = -1
|
||||||
|
|
||||||
|
if GPUtil is not None and gpu_id >= 0:
|
||||||
|
gpus = GPUtil.getGPUs()
|
||||||
|
gpu_temperature = gpus[gpu_id].temperature
|
||||||
|
else:
|
||||||
|
gpu_temperature = -1
|
||||||
|
|
||||||
|
return gpu_temperature
|
||||||
|
|
||||||
|
|
||||||
|
def get_cpu_temperature():
|
||||||
|
"""Get current CPU temperature."""
|
||||||
|
|
||||||
|
if psutil is not None:
|
||||||
|
cpu_temperature = psutil.sensors_temperatures()["coretemp"][0].current
|
||||||
|
else:
|
||||||
|
cpu_temperature = -1
|
||||||
|
|
||||||
|
return cpu_temperature
|
||||||
|
|
||||||
|
|
||||||
|
def bench_net():
|
||||||
|
"""Benchmark nccl performance for slow node detection."""
|
||||||
|
|
||||||
|
if gpc.get_world_size(ParallelMode.GLOBAL) <= 1:
|
||||||
|
return
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info("benchmarking network speed ...")
|
||||||
|
|
||||||
|
repeats = 100
|
||||||
|
input_data = torch.randn(
|
||||||
|
8 * 1024 * 1024,
|
||||||
|
device=get_current_device(),
|
||||||
|
dtype=torch.bfloat16,
|
||||||
|
)
|
||||||
|
|
||||||
|
def allreduce_fn(inputs):
|
||||||
|
dist.all_reduce(inputs, op=torch.distributed.ReduceOp.AVG, group=gpc.get_group(ParallelMode.NETTEST))
|
||||||
|
|
||||||
|
bench_timer = benchmark.Timer(
|
||||||
|
stmt="test_fn_amp(inputs)",
|
||||||
|
globals={"test_fn_amp": allreduce_fn, "inputs": input_data},
|
||||||
|
num_threads=torch.get_num_threads(),
|
||||||
|
)
|
||||||
|
allreduce_time = bench_timer.timeit(repeats).mean
|
||||||
|
allreduce_time = allreduce_time * 10**3
|
||||||
|
allreduce_time_this = allreduce_time
|
||||||
|
allreduce_time = torch.Tensor([allreduce_time]).to(device=get_current_device())
|
||||||
|
dist.all_reduce(allreduce_time, group=gpc.get_group(ParallelMode.GLOBAL))
|
||||||
|
allreduce_time_avg = allreduce_time / gpc.get_world_size(ParallelMode.GLOBAL)
|
||||||
|
allreduce_time_avg = float(allreduce_time_avg.item())
|
||||||
|
|
||||||
|
if allreduce_time_this >= allreduce_time_avg * 1.05:
|
||||||
|
logger.warning(
|
||||||
|
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} NCCL test is slower than avg, "
|
||||||
|
f"Hostname {socket.gethostname()}, "
|
||||||
|
f"allreduce_time {allreduce_time_this:.2f}, avg {allreduce_time_avg:.2f}, "
|
||||||
|
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_gpu(use_flash_attn=True):
|
||||||
|
"""Benchmark single GPU performance for slow node detection."""
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info("benchmarking gpu speed ...")
|
||||||
|
|
||||||
|
headdim = 64
|
||||||
|
dim = 2048
|
||||||
|
batch_size, seqlen = 2, 1024
|
||||||
|
nheads = dim // headdim
|
||||||
|
|
||||||
|
inner_attn = FlashSelfAttention if use_flash_attn else SelfAttention
|
||||||
|
inner_attn = inner_attn(causal=True, softmax_scale=None, attention_dropout=0)
|
||||||
|
|
||||||
|
qkv = torch.randn(
|
||||||
|
batch_size,
|
||||||
|
seqlen,
|
||||||
|
3,
|
||||||
|
dim // headdim,
|
||||||
|
headdim,
|
||||||
|
device=get_current_device(),
|
||||||
|
dtype=torch.float16,
|
||||||
|
requires_grad=True,
|
||||||
|
)
|
||||||
|
time_f = benchmark_forward(inner_attn, qkv)
|
||||||
|
speed = flops(batch_size, seqlen, headdim, nheads, time_f)
|
||||||
|
speed_this = speed
|
||||||
|
speed = torch.Tensor([speed]).to(device=get_current_device())
|
||||||
|
dist.all_reduce(speed, group=gpc.get_group(ParallelMode.GLOBAL))
|
||||||
|
speed_avg = speed / gpc.get_world_size(ParallelMode.GLOBAL)
|
||||||
|
speed_avg = float(speed_avg.item())
|
||||||
|
|
||||||
|
if speed_this <= speed_avg * 0.95:
|
||||||
|
logger.warning(
|
||||||
|
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} GPU is slower than avg, "
|
||||||
|
f"Hostname {socket.gethostname()}, "
|
||||||
|
f"tflops {speed_this:.2f}, avg {speed_avg:.2f}, "
|
||||||
|
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
|
||||||
|
)
|
|
@ -14,18 +14,19 @@ class _Timer:
|
||||||
self.elapsed_ = 0.0
|
self.elapsed_ = 0.0
|
||||||
self.started_ = False
|
self.started_ = False
|
||||||
self.start_time = time.time()
|
self.start_time = time.time()
|
||||||
|
self.stream = torch.cuda.current_stream()
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
"""Start the timer."""
|
"""Start the timer."""
|
||||||
assert not self.started_, "timer has already been started"
|
assert not self.started_, "timer has already been started"
|
||||||
torch.cuda.synchronize()
|
self.stream.synchronize()
|
||||||
self.start_time = time.time()
|
self.start_time = time.time()
|
||||||
self.started_ = True
|
self.started_ = True
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
"""Stop the timer."""
|
"""Stop the timer."""
|
||||||
assert self.started_, "timer is not started"
|
assert self.started_, "timer is not started"
|
||||||
torch.cuda.synchronize()
|
self.stream.synchronize()
|
||||||
self.elapsed_ += time.time() - self.start_time
|
self.elapsed_ += time.time() - self.start_time
|
||||||
self.started_ = False
|
self.started_ = False
|
||||||
|
|
||||||
|
|
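The change above replaces the device-wide torch.cuda.synchronize() with a synchronize on the stream captured when the timer was constructed, so timing one stream no longer stalls the others. A condensed sketch of the resulting timer; the CPU-only fallback is an addition here for self-containment, not part of the original:

```python
import time

import torch

class StreamTimer:
    """Condensed sketch of the per-stream timer after the change above."""

    def __init__(self):
        self.elapsed_ = 0.0
        self.started_ = False
        self.start_time = time.time()
        # Capture the current CUDA stream once; only this stream is synchronized.
        self.stream = torch.cuda.current_stream() if torch.cuda.is_available() else None

    def _sync(self):
        if self.stream is not None:
            self.stream.synchronize()

    def start(self):
        assert not self.started_, "timer has already been started"
        self._sync()
        self.start_time = time.time()
        self.started_ = True

    def stop(self):
        assert self.started_, "timer is not started"
        self._sync()
        self.elapsed_ += time.time() - self.start_time
        self.started_ = False
```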
|
@ -2,7 +2,9 @@
|
||||||
# -*- encoding: utf-8 -*-
|
# -*- encoding: utf-8 -*-
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
import fcntl
|
||||||
import os
|
import os
|
||||||
|
import socket
|
||||||
import time
|
import time
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
@ -12,6 +14,7 @@ import torch
|
||||||
from internlm.core.context import ParallelMode
|
from internlm.core.context import ParallelMode
|
||||||
from internlm.core.context import global_context as gpc
|
from internlm.core.context import global_context as gpc
|
||||||
from internlm.core.trainer import TrainState
|
from internlm.core.trainer import TrainState
|
||||||
|
from internlm.monitor import send_alert_message
|
||||||
from internlm.solver.optimizer import HybridZeroOptimizer
|
from internlm.solver.optimizer import HybridZeroOptimizer
|
||||||
from internlm.utils.common import get_current_device
|
from internlm.utils.common import get_current_device
|
||||||
from internlm.utils.logger import get_logger
|
from internlm.utils.logger import get_logger
|
||||||
|
@ -25,8 +28,6 @@ from internlm.utils.storage_manager import (
|
||||||
|
|
||||||
logger = get_logger(__file__)
|
logger = get_logger(__file__)
|
||||||
|
|
||||||
quit_signal_handler = None
|
|
||||||
|
|
||||||
|
|
||||||
class CheckpointType(Enum):
|
class CheckpointType(Enum):
|
||||||
NORMAL_CHECKPOINT = 1
|
NORMAL_CHECKPOINT = 1
|
||||||
|
@ -167,44 +168,6 @@ def save_optimizer_checkpoint(optim, state_path):
|
||||||
llm_save(os.path.join(state_path, fp), states)
|
llm_save(os.path.join(state_path, fp), states)
|
||||||
|
|
||||||
|
|
||||||
def save_checkpoint(folder, model, optimizer, scheduler, train_state: TrainState, model_config: Dict = None):
|
|
||||||
"""
|
|
||||||
Save checkpoint to the given folder path.
|
|
||||||
"""
|
|
||||||
|
|
||||||
start = time.time()
|
|
||||||
torch.distributed.barrier()
|
|
||||||
folder = os.path.join(folder, str(train_state.step_count))
|
|
||||||
logger.info(
|
|
||||||
f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count} from rank:{gpc.get_global_rank()}..."
|
|
||||||
)
|
|
||||||
|
|
||||||
timer("save-model").start()
|
|
||||||
save_model_checkpoint(folder=folder, model=model)
|
|
||||||
timer("save-model").stop()
|
|
||||||
|
|
||||||
timer("save-optimizer").start()
|
|
||||||
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
|
|
||||||
timer("save-optimizer").stop()
|
|
||||||
|
|
||||||
if gpc.is_rank_for_log():
|
|
||||||
scheduler_states = scheduler.state_dict()
|
|
||||||
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
|
|
||||||
|
|
||||||
sampler_state = train_state.batch_sampler.state_dict()
|
|
||||||
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
|
|
||||||
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
|
|
||||||
|
|
||||||
if model_config is not None:
|
|
||||||
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
|
|
||||||
|
|
||||||
torch.distributed.barrier()
|
|
||||||
|
|
||||||
if gpc.is_rank_for_log():
|
|
||||||
timer.log(["save-model", "save-optimizer"], logger=logger)
|
|
||||||
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
|
|
||||||
|
|
||||||
|
|
||||||
def load_optimizer_checkpoint(folder, optim):
|
def load_optimizer_checkpoint(folder, optim):
|
||||||
"""Load the optimizer state from the local file system or remote
|
"""Load the optimizer state from the local file system or remote
|
||||||
object storage Service (OSS).
|
object storage Service (OSS).
|
||||||
|
@ -304,19 +267,12 @@ def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, learning_rate, train
|
||||||
logger.info(f"reload load_scheduler:{lr_scheduler}")
|
logger.info(f"reload load_scheduler:{lr_scheduler}")
|
||||||
|
|
||||||
|
|
||||||
class CheckpointSaveManager:
|
class CheckpointManager:
|
||||||
"""StorageManagerContext"""
|
"""StorageManagerContext"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, ckpt_config, model, model_config=None, model_config_file=None, feishu_address=None) -> None:
|
||||||
self,
|
|
||||||
ckpt_config,
|
|
||||||
model,
|
|
||||||
optimizer,
|
|
||||||
lr_scheduler,
|
|
||||||
model_config,
|
|
||||||
) -> None:
|
|
||||||
"""
|
"""
|
||||||
CheckpointSaveManager is used to decide when to store ckpt. If it is an asynchronous
|
CheckpointManager is used to decide when to store ckpt. If it is an asynchronous
|
||||||
upload mode, you must call wait_async_upload_finish at the end of the program to wait
|
upload mode, you must call wait_async_upload_finish at the end of the program to wait
|
||||||
for the asynchronous ckpt upload to complete.
|
for the asynchronous ckpt upload to complete.
|
||||||
|
|
||||||
|
@ -332,26 +288,96 @@ class CheckpointSaveManager:
|
||||||
self.save_ckpt_folder = ckpt_config.save_ckpt_folder
|
self.save_ckpt_folder = ckpt_config.save_ckpt_folder
|
||||||
self.snapshot_ckpt_folder = ckpt_config.snapshot_ckpt_folder
|
self.snapshot_ckpt_folder = ckpt_config.snapshot_ckpt_folder
|
||||||
self.oss_snapshot_freq: int = ckpt_config.oss_snapshot_freq
|
self.oss_snapshot_freq: int = ckpt_config.oss_snapshot_freq
|
||||||
|
self.stop_file_path = ckpt_config.stop_file_path
|
||||||
|
self.load_model_only_folder = ckpt_config.load_model_only_folder
|
||||||
|
self.feishu_address = feishu_address
|
||||||
self.storage_manager = get_storage_manager()
|
self.storage_manager = get_storage_manager()
|
||||||
self.snapshot_counter = 0
|
self.snapshot_counter = 0
|
||||||
|
self.load_optimizer = gpc.config.ckpt.load_optimizer
|
||||||
|
|
||||||
self.model = model
|
self.model = model
|
||||||
self.optimizer = optimizer
|
|
||||||
self.lr_scheduler = lr_scheduler
|
|
||||||
self.model_config = model_config
|
self.model_config = model_config
|
||||||
|
self.model_config_file = model_config_file
|
||||||
|
|
||||||
|
if self.stop_file_path and gpc.get_global_rank() == 0:
|
||||||
|
dir_path = os.path.dirname(self.stop_file_path)
|
||||||
|
if dir_path != "" and not os.path.exists(dir_path):
|
||||||
|
os.makedirs(dir_path)
|
||||||
|
with open(self.stop_file_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write("0")
|
||||||
|
|
||||||
|
if ckpt_config.load_given_ckpt is False:
|
||||||
|
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
|
||||||
|
latest_ckpt_path = self.query_lastest_ckpt()
|
||||||
|
if latest_ckpt_path:
|
||||||
|
self.load_ckpt_folder = latest_ckpt_path
|
||||||
|
else:
|
||||||
|
# At this time, we have to load model init weights and train from step 0.
|
||||||
|
self.load_ckpt_folder = self.load_model_only_folder
|
||||||
|
else:
|
||||||
|
self.load_ckpt_folder = ckpt_config.load_ckpt_folder
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"load_ckpt_folder will set to :'{self.load_ckpt_folder}'")
|
||||||
|
if self.stop_file_path is None:
|
||||||
|
logger.warning("no set stop_file_path, quit_signal_handler is disable")
|
||||||
|
|
||||||
|
def quit_signal_handler(self, train_state) -> bool:
|
||||||
|
"""
|
||||||
|
Exit signal detection function: if the exit step is written to the file at 'stop_file_path',
|
||||||
|
all ranks will save ckpt and exit.
|
||||||
|
Negative integer step means save ckpt.
|
||||||
|
Positive integer step means save ckpt and quit.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
train_state (TrainState): current training state.
|
||||||
|
Returns:
|
||||||
|
(now_break, now_save_ckpt, save_type): whether to quit, whether to save a ckpt now, and the checkpoint type.
|
||||||
|
"""
|
||||||
|
now_break, now_save_ckpt, save_type = False, False, CheckpointType.NORMAL_CHECKPOINT
|
||||||
|
|
||||||
|
if self.stop_file_path is None:
|
||||||
|
return now_break, now_save_ckpt, save_type
|
||||||
|
|
||||||
|
with open(self.stop_file_path, "a+", encoding="utf-8") as f:
|
||||||
|
fcntl.flock(f, fcntl.LOCK_EX)
|
||||||
|
f.seek(0)
|
||||||
|
msg = f.read()
|
||||||
|
fcntl.flock(f, fcntl.LOCK_UN)
|
||||||
|
action_step = int(msg)
|
||||||
|
|
||||||
|
if action_step < 0 and abs(action_step) == train_state.step_count:
|
||||||
|
now_save_ckpt = True
|
||||||
|
|
||||||
|
if action_step > 0 and action_step == train_state.step_count:
|
||||||
|
now_break, now_save_ckpt = True, True
|
||||||
|
|
||||||
|
if action_step != 0 and gpc.is_rank_for_log():
|
||||||
|
msg = "Stop" if action_step > 0 else "Save"
|
||||||
|
action_step = abs(action_step)
|
||||||
|
if train_state.step_count <= action_step:
|
||||||
|
if self.feishu_address:
|
||||||
|
send_alert_message(
|
||||||
|
address=self.feishu_address,
|
||||||
|
message=f"training will {msg} at step_count {action_step}!\
|
||||||
|
now step_count is {train_state.step_count}",
|
||||||
|
)
|
||||||
|
|
||||||
|
return now_break, now_save_ckpt, save_type
|
||||||
|
|
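The stop file is driven from outside the job: a watchdog or an operator writes a single integer into the file at stop_file_path, and every rank reads it under an exclusive lock as shown above (negative value: save at that step; positive value: save and quit at that step). A hedged sketch of the writer side; the path is a placeholder and the helper name is made up for illustration:

```python
import fcntl

STOP_FILE = "/path/to/stop_file"  # placeholder for ckpt_config.stop_file_path

def request_action(step: int, quit_after_save: bool) -> None:
    """Write the control integer: -step requests a save at `step`, +step requests save-and-quit."""
    value = step if quit_after_save else -step
    with open(STOP_FILE, "w", encoding="utf-8") as f:
        fcntl.flock(f, fcntl.LOCK_EX)  # POSIX-only, mirroring the reader's exclusive lock
        f.write(str(value))
        fcntl.flock(f, fcntl.LOCK_UN)

# e.g. ask every rank to checkpoint at step 1000 without stopping:
# request_action(1000, quit_after_save=False)
```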
||||||
def try_save_checkpoint(self, train_state):
|
def try_save_checkpoint(self, train_state):
|
||||||
if not self.enable_save_ckpt:
|
if not self.enable_save_ckpt:
|
||||||
return
|
return False
|
||||||
|
|
||||||
save_ckpts, save_type = False, CheckpointType.NORMAL_CHECKPOINT
|
save_ckpts, save_type = False, CheckpointType.NORMAL_CHECKPOINT
|
||||||
if self.oss_snapshot_freq > 1 and train_state.step_count % self.oss_snapshot_freq == 0:
|
if self.oss_snapshot_freq > 1 and train_state.step_count % self.oss_snapshot_freq == 0:
|
||||||
save_ckpts, save_type = True, CheckpointType.SNAPSHOT_CHECKPOINT
|
save_ckpts, save_type = True, CheckpointType.SNAPSHOT_CHECKPOINT
|
||||||
if train_state.step_count % self.checkpoint_every == 0:
|
if train_state.step_count % self.checkpoint_every == 0:
|
||||||
save_ckpts, save_type = True, CheckpointType.NORMAL_CHECKPOINT
|
save_ckpts, save_type = True, CheckpointType.NORMAL_CHECKPOINT
|
||||||
|
now_break, signal_save_ckpts, signal_save_type = self.quit_signal_handler(train_state)
|
||||||
if save_ckpts is False:
|
if save_ckpts is False:
|
||||||
if quit_signal_handler is not None:
|
save_ckpts = signal_save_ckpts
|
||||||
save_ckpts, save_type = quit_signal_handler(train_state)
|
save_type = signal_save_type
|
||||||
|
|
||||||
if save_ckpts:
|
if save_ckpts:
|
||||||
# Wait for the previous round of asynchronous upload storage to complete.
|
# Wait for the previous round of asynchronous upload storage to complete.
|
||||||
|
@ -361,18 +387,247 @@ class CheckpointSaveManager:
|
||||||
self.snapshot_counter = (self.snapshot_counter + 1) % 2
|
self.snapshot_counter = (self.snapshot_counter + 1) % 2
|
||||||
save_ckpt_folder = os.path.join(self.snapshot_ckpt_folder, f"{self.snapshot_counter}")
|
save_ckpt_folder = os.path.join(self.snapshot_ckpt_folder, f"{self.snapshot_counter}")
|
||||||
else:
|
else:
|
||||||
save_ckpt_folder = self.save_ckpt_folder
|
save_ckpt_folder = os.path.join(self.save_ckpt_folder, str(train_state.step_count))
|
||||||
|
|
||||||
save_checkpoint(
|
self.save_checkpoint(
|
||||||
folder=save_ckpt_folder,
|
folder=save_ckpt_folder,
|
||||||
model=self.model,
|
model=self.model,
|
||||||
optimizer=self.optimizer,
|
optimizer=self.optimizer,
|
||||||
scheduler=self.lr_scheduler,
|
scheduler=self.lr_scheduler,
|
||||||
train_state=train_state,
|
train_state=train_state,
|
||||||
model_config=self.model_config,
|
model_config=self.model_config,
|
||||||
|
model_config_file=self.model_config_file,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return now_break
|
||||||
|
|
||||||
def wait_async_upload_finish(self):
|
def wait_async_upload_finish(self):
|
||||||
"""wait for all checkpoint uploads to be completed"""
|
"""wait for all checkpoint uploads to be completed"""
|
||||||
self.storage_manager.wait()
|
self.storage_manager.wait()
|
||||||
torch.distributed.barrier()
|
torch.distributed.barrier()
|
||||||
|
|
||||||
|
def query_latest_snapshot_step_boto3(self):
|
||||||
|
"""query_latest_snapshot_step_boto3
|
||||||
|
Returns:
|
||||||
|
Tuple[str, int]: path of the latest ckpt and its step; if not found, (None, None) is returned.
|
||||||
|
"""
|
||||||
|
ckpt_list = self.storage_manager.get_fns(self.save_ckpt_folder)
|
||||||
|
if len(ckpt_list) == 0:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
max_normal_step = 0
|
||||||
|
ckpt_list = list(map(lambda a: int(a.strip("/")) if a.strip("/").isdigit() else 0, ckpt_list))
|
||||||
|
ckpt_list.sort(reverse=True)
|
||||||
|
for ckpt in ckpt_list:
|
||||||
|
fns_list = self.storage_manager.get_fns(os.path.join(self.save_ckpt_folder, str(ckpt)))
|
||||||
|
for fn in fns_list:
|
||||||
|
if fn.endswith(".step"):
|
||||||
|
max_normal_step = ckpt
|
||||||
|
break
|
||||||
|
if max_normal_step != 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
max_normal_step = ckpt_list[0]
|
||||||
|
load_normal_ckpt_path = os.path.join(self.save_ckpt_folder, str(max_normal_step))
|
||||||
|
|
||||||
|
snapshot_path_0 = os.path.join(self.save_ckpt_folder, "snapshot", "0")
|
||||||
|
snapshot_path_1 = os.path.join(self.save_ckpt_folder, "snapshot", "1")
|
||||||
|
ckpt_list_1 = self.storage_manager.get_fns(snapshot_path_0)
|
||||||
|
ckpt_list_2 = self.storage_manager.get_fns(snapshot_path_1)
|
||||||
|
max_step_0, max_step_1 = 0, 0
|
||||||
|
for ckpt in ckpt_list_1:
|
||||||
|
ckpt = ckpt.strip("/")
|
||||||
|
if ckpt.endswith(".step"):
|
||||||
|
max_step_0 = max(max_step_0, int(ckpt.split(".")[0]))
|
||||||
|
for ckpt in ckpt_list_2:
|
||||||
|
ckpt = ckpt.strip("/")
|
||||||
|
if ckpt.endswith(".step"):
|
||||||
|
max_step_1 = max(max_step_1, int(ckpt.split(".")[0]))
|
||||||
|
|
||||||
|
snap_load_path = snapshot_path_0 if max_step_0 > max_step_1 else snapshot_path_1
|
||||||
|
snap_step = max(max_step_0, max_step_1)
|
||||||
|
load_path = snap_load_path if snap_step > max_normal_step else load_normal_ckpt_path
|
||||||
|
load_step = max(snap_step, max_normal_step)
|
||||||
|
return load_path, load_step
|
||||||
|
|
||||||
|
def query_latest_snapshot_step_local(self):
|
||||||
|
max_step, max_step_path = 0, None
|
||||||
|
for root, _, files in os.walk(self.save_ckpt_folder, followlinks=True):
|
||||||
|
for fn in files:
|
||||||
|
fn = fn.strip("/")
|
||||||
|
if fn.endswith(".step"):
|
||||||
|
# We assume that both normal ckpt and snapshot ckpt will store the '.step' file
|
||||||
|
# as an integrity flag.
|
||||||
|
step = int(fn.rsplit(".", maxsplit=1)[0])
|
||||||
|
if max_step < step:
|
||||||
|
max_step = step
|
||||||
|
max_step_path = root
|
||||||
|
|
||||||
|
return max_step_path, max_step
|
||||||
|
|
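The local query above relies on every complete checkpoint directory containing an '<step>.step' marker file; the newest marker wins. A self-contained illustration with a hypothetical on-disk layout:

```python
import os
import tempfile

# Build a throwaway layout: a normal ckpt at step 40 and two snapshot slots.
root = tempfile.mkdtemp()
for sub, step in [("40", 40), ("snapshot/0", 60), ("snapshot/1", 50)]:
    path = os.path.join(root, sub)
    os.makedirs(path, exist_ok=True)
    with open(os.path.join(path, f"{step}.step"), "w", encoding="utf-8"):
        pass

# Same scan as query_latest_snapshot_step_local: walk, find '.step' files, keep the max.
max_step, max_step_path = 0, None
for walk_root, _, files in os.walk(root, followlinks=True):
    for fn in files:
        if fn.endswith(".step"):
            step = int(fn.rsplit(".", maxsplit=1)[0])
            if max_step < step:
                max_step, max_step_path = step, walk_root

print(max_step_path, max_step)  # .../snapshot/0 60
```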
||||||
|
def query_lastest_ckpt(self):
|
||||||
|
latest_checkpoint = None
|
||||||
|
# Training was automatically restarted by the process, forcing the latest snapshot to be read.
|
||||||
|
if self.save_ckpt_folder:
|
||||||
|
if self.save_ckpt_folder.startswith("boto3"):
|
||||||
|
latest_checkpoint, step = self.query_latest_snapshot_step_boto3()
|
||||||
|
elif self.save_ckpt_folder.startswith("local"):
|
||||||
|
latest_checkpoint, step = self.query_latest_snapshot_step_local()
|
||||||
|
else:
|
||||||
|
latest_checkpoint, step = None, 0
|
||||||
|
|
||||||
|
if latest_checkpoint is not None:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"Found latest ckpt : {latest_checkpoint}, step: {step}")
|
||||||
|
send_alert_message(
|
||||||
|
address=self.feishu_address,
|
||||||
|
message=f"Auto restart resume from ckpt-path: '{latest_checkpoint}', step : {step}",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
send_alert_message(
|
||||||
|
address=self.feishu_address,
|
||||||
|
message=f"Can't find snapshot checkpoint, use default load-ckpt path: {latest_checkpoint}",
|
||||||
|
)
|
||||||
|
|
||||||
|
return latest_checkpoint
|
||||||
|
|
||||||
|
def try_load_model(self, current_time=""):
|
||||||
|
model_load_path = None
|
||||||
|
|
||||||
|
if self.load_ckpt_folder and self.load_model_only_folder:
|
||||||
|
raise ValueError(
|
||||||
|
"Error, try to use both load_ckpt_folder and load_model_only_folder paths, \
|
||||||
|
if you only need to load model weights (for example starting an SFT task for the first time), \
|
||||||
|
set load_model_only_folder path, if you need to resume training from ckpt, \
|
||||||
|
set load_ckpt_folder or use default value \
|
||||||
|
(if left at the default value, internlm will try to load the latest ckpt from save_ckpt_folder)"
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.load_ckpt_folder:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"===========Resume training from `{self.load_ckpt_folder}` {current_time} on host:"
|
||||||
|
f"{socket.gethostname()}==========="
|
||||||
|
)
|
||||||
|
model_load_path = self.load_ckpt_folder
|
||||||
|
elif self.load_model_only_folder:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"===========Load Model from `{self.load_model_only_folder}` {current_time} on host:"
|
||||||
|
f"{socket.gethostname()}==========="
|
||||||
|
)
|
||||||
|
model_load_path = self.load_model_only_folder
|
||||||
|
else:
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(
|
||||||
|
f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
|
||||||
|
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
|
||||||
|
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
|
||||||
|
)
|
||||||
|
|
||||||
|
# Loading model weights must be done before zero is initialized.
|
||||||
|
if model_load_path is not None:
|
||||||
|
load_model_checkpoint(folder=model_load_path, model=self.model)
|
||||||
|
|
||||||
|
def try_resume_training(self, lr_scheduler, optimizer, lr, train_state, train_dl):
|
||||||
|
"""Attempt to restore the training state of the last ckpt.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lr_scheduler (_LRScheduler): lr_scheduler object.
|
||||||
|
optimizer (Optimizer): optimizer object.
|
||||||
|
lr (float): learning rate.
|
||||||
|
train_state (TrainState): training states.
|
||||||
|
train_dl (DataLoader): training dataloader object.
|
||||||
|
"""
|
||||||
|
if self.load_ckpt_folder is not None:
|
||||||
|
# load optimizer states.
|
||||||
|
if self.load_optimizer:
|
||||||
|
load_optimizer_checkpoint(self.load_ckpt_folder, optimizer)
|
||||||
|
# load lr scheduler states.
|
||||||
|
load_scheduler(self.load_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
|
||||||
|
# load training states.
|
||||||
|
load_context(self.load_ckpt_folder, train_dl, train_state)
|
||||||
|
# load dataloader sampler states.
|
||||||
|
if hasattr(train_state, "batch_sampler") and not isinstance(
|
||||||
|
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
|
||||||
|
):
|
||||||
|
load_sampler(self.load_ckpt_folder, train_dl.batch_sampler)
|
||||||
|
if hasattr(train_state, "data_state_dict"):
|
||||||
|
train_dl.dataset.load_state_dict(
|
||||||
|
llm_load(os.path.join(self.load_ckpt_folder, "sampler_0.pt")), ckpt_path=self.load_ckpt_folder
|
||||||
|
)
|
||||||
|
self.optimizer = optimizer
|
||||||
|
self.lr_scheduler = lr_scheduler
|
||||||
|
|
||||||
|
def save_checkpoint(
|
||||||
|
self,
|
||||||
|
folder,
|
||||||
|
model,
|
||||||
|
optimizer,
|
||||||
|
scheduler,
|
||||||
|
train_state: TrainState,
|
||||||
|
model_config: Dict = None,
|
||||||
|
model_config_file: str = None,
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
Save checkpoint to the given folder path.
|
||||||
|
"""
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
self.set_save_folder(folder, train_state.step_count)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
torch.distributed.barrier()
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
logger.info(f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count}...")
|
||||||
|
|
||||||
|
timer("save-model").start()
|
||||||
|
save_model_checkpoint(folder=folder, model=model)
|
||||||
|
timer("save-model").stop()
|
||||||
|
|
||||||
|
timer("save-optimizer").start()
|
||||||
|
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
|
||||||
|
timer("save-optimizer").stop()
|
||||||
|
|
||||||
|
if (
|
||||||
|
hasattr(train_state, "data_state_dict")
|
||||||
|
and gpc.get_local_rank(ParallelMode.TENSOR) == 0
|
||||||
|
and gpc.get_local_rank(ParallelMode.PIPELINE) == 0
|
||||||
|
):
|
||||||
|
llm_save(
|
||||||
|
os.path.join(folder, f"sampler_{gpc.get_local_rank(ParallelMode.DATA)}.pt"),
|
||||||
|
saved_obj=train_state.data_state_dict,
|
||||||
|
)
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
scheduler_states = scheduler.state_dict()
|
||||||
|
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
|
||||||
|
if hasattr(train_state, "batch_sampler") and not isinstance(
|
||||||
|
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
|
||||||
|
):
|
||||||
|
sampler_state = train_state.batch_sampler.state_dict()
|
||||||
|
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
|
||||||
|
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
|
||||||
|
|
||||||
|
if model_config is not None:
|
||||||
|
# Model configuration dictionary.
|
||||||
|
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
|
||||||
|
|
||||||
|
if model_config_file is not None:
|
||||||
|
# The complete training config file content, stored in binary format.
|
||||||
|
llm_save(os.path.join(folder, "config_file.pt"), saved_obj=model_config_file)
|
||||||
|
|
||||||
|
torch.distributed.barrier()
|
||||||
|
|
||||||
|
if gpc.is_rank_for_log():
|
||||||
|
timer.log(["save-model", "save-optimizer"], logger=logger)
|
||||||
|
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
|
||||||
|
if self.storage_manager.async_mode is False:
|
||||||
|
llm_save(
|
||||||
|
os.path.join(folder, f"{train_state.step_count}.step"),
|
||||||
|
saved_obj=dict({"step": train_state.step_count}),
|
||||||
|
)
|
||||||
|
|
||||||
|
def set_save_folder(self, folder, step):
|
||||||
|
self.storage_manager.latest_save_folder = folder
|
||||||
|
self.storage_manager.latest_save_step = step
|
||||||
|
|
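Pieced together from the methods above, the intended call order for the new CheckpointManager is roughly the following. Everything is passed in as a parameter and nothing is constructed here, so treat this as a guide to the API surface rather than a drop-in training loop.

```python
# Sketch only: the objects (ckpt_manager, train_state, ...) come from the usual
# InternLM training setup, which is not shown here.
def run_with_checkpointing(ckpt_manager, train_state, lr_scheduler, optimizer, lr, train_dl, train_steps):
    ckpt_manager.try_load_model()  # model weights only; must happen before zero init elsewhere
    ckpt_manager.try_resume_training(lr_scheduler, optimizer, lr, train_state, train_dl)

    for _ in range(train_steps):
        # ... forward / backward / optimizer step happens here ...
        now_break = ckpt_manager.try_save_checkpoint(train_state)  # normal, snapshot or signal-driven
        if now_break:  # a positive value in the stop file requests save-and-quit
            break

    ckpt_manager.wait_async_upload_finish()  # required when the async upload mode is enabled
```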
|
@ -1,15 +1,13 @@
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from collections import OrderedDict
|
from collections import OrderedDict
|
||||||
from functools import partial
|
from functools import partial, reduce
|
||||||
from typing import Any, Dict, List, Tuple
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
import pyecharts
|
import pyecharts
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from internlm.core.context import ParallelMode
|
from internlm.core.naive_amp import NaiveAMPModel
|
||||||
from internlm.core.context import global_context as gpc
|
|
||||||
from internlm.solver.pipeline_utils import partition_uniform
|
|
||||||
|
|
||||||
mb = 1024 * 1024
|
mb = 1024 * 1024
|
||||||
|
|
||||||
|
@ -107,6 +105,8 @@ class SimpleMemState:
|
||||||
"""
|
"""
|
||||||
Update the total memory usage of the model and sub-models.
|
Update the total memory usage of the model and sub-models.
|
||||||
"""
|
"""
|
||||||
|
self._total_mem = self._layer_mem
|
||||||
|
|
||||||
for stat in self.sub_model_stats.values():
|
for stat in self.sub_model_stats.values():
|
||||||
# Update sub-model status first.
|
# Update sub-model status first.
|
||||||
stat.update_total_memory()
|
stat.update_total_memory()
|
||||||
|
@ -169,6 +169,39 @@ class SimpleMemState:
|
||||||
return {"name": self.layer_name, "children": children}
|
return {"name": self.layer_name, "children": children}
|
||||||
|
|
||||||
|
|
||||||
|
class ActivationMemState:
|
||||||
|
"""
|
||||||
|
Activation Memory State
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, num_chunks: int) -> None:
|
||||||
|
self._num_chunks = num_chunks
|
||||||
|
|
||||||
|
self.inited: List[bool] = [False for _ in range(num_chunks)]
|
||||||
|
self.states: List[SimpleMemState] = [SimpleMemState(f"activations_{idx}") for idx in range(num_chunks)]
|
||||||
|
|
||||||
|
@property
|
||||||
|
def total_mem(self) -> int:
|
||||||
|
return sum(state.total_mem for state in self.states)
|
||||||
|
|
||||||
|
def dump(self, prefix: str = "") -> str:
|
||||||
|
return reduce(lambda x, y: x + y, [state.dump(prefix) for state in self.states])
|
||||||
|
|
||||||
|
def to_json(self, base: int = 1024 * 1024) -> List:
|
||||||
|
return [state.to_json(base) for state in self.states]
|
||||||
|
|
||||||
|
|
||||||
|
def _unpack_naive_wrapper(model: torch.nn.Module) -> Tuple[torch.nn.Module, int]:
|
||||||
|
num_chunks = len(model) if isinstance(model, torch.nn.ModuleList) else 1
|
||||||
|
|
||||||
|
if num_chunks > 1:
|
||||||
|
model = torch.nn.ModuleList([_model.model if isinstance(_model, NaiveAMPModel) else _model for _model in model])
|
||||||
|
else:
|
||||||
|
model = model.model if isinstance(model, NaiveAMPModel) else model
|
||||||
|
|
||||||
|
return model, num_chunks
|
||||||
|
|
||||||
|
|
||||||
class SimpleMemoryProfiler:
|
class SimpleMemoryProfiler:
|
||||||
"""
|
"""
|
||||||
A memory profiler for a llm model.
|
A memory profiler for a llm model.
|
||||||
|
@ -177,7 +210,7 @@ class SimpleMemoryProfiler:
|
||||||
model (torch.nn.Module): The model to profile.
|
model (torch.nn.Module): The model to profile.
|
||||||
optimizer (torch.optim.Optimizer): The optimizer used for training the model.
|
optimizer (torch.optim.Optimizer): The optimizer used for training the model.
|
||||||
log_file (str): The file to write the memory state information to.
|
log_file (str): The file to write the memory state information to.
|
||||||
activation_config (List[str], optional): The list of activation layers to track. Defaults to None.
|
total_steps: number of steps to trace.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
|
@ -186,9 +219,8 @@ class SimpleMemoryProfiler:
|
||||||
optimizer: torch.optim.Optimizer,
|
optimizer: torch.optim.Optimizer,
|
||||||
log_folder: str,
|
log_folder: str,
|
||||||
total_steps: int = 5,
|
total_steps: int = 5,
|
||||||
activation_config: List[str] = None,
|
|
||||||
):
|
):
|
||||||
self._model = model
|
self._model, self._num_model_chunks = _unpack_naive_wrapper(model)
|
||||||
self._optimizer = optimizer
|
self._optimizer = optimizer
|
||||||
self._log_folder = log_folder
|
self._log_folder = log_folder
|
||||||
self._remaining_steps = total_steps
|
self._remaining_steps = total_steps
|
||||||
|
@ -197,17 +229,20 @@ class SimpleMemoryProfiler:
|
||||||
self._record_start_time = time.time()
|
self._record_start_time = time.time()
|
||||||
|
|
||||||
# For activation memory state.
|
# For activation memory state.
|
||||||
self._activation_config = activation_config
|
|
||||||
self._activation_mem_inited: bool = False
|
|
||||||
self._activation_mem: int = 0
|
self._activation_mem: int = 0
|
||||||
self._activation_max_count = 0
|
self._activation_mem_max: int = 0
|
||||||
self._activation_base_mem: SimpleMemState = SimpleMemState("activations")
|
self._activation_base_mems = ActivationMemState(self._num_model_chunks)
|
||||||
|
|
||||||
# Check or create log folder
|
# Check or create log folder
|
||||||
os.makedirs(self._log_folder, exist_ok=True)
|
os.makedirs(self._log_folder, exist_ok=True)
|
||||||
|
|
||||||
# Register activation memory tracking hooks
|
# Register activation memory tracking hooks
|
||||||
self._register_activation_trace_hooks()
|
if self._num_model_chunks > 1:
|
||||||
|
for chunk_id in range(self._num_model_chunks):
|
||||||
|
self._register_activation_trace_hooks(chunk_id, self._model[chunk_id])
|
||||||
|
else:
|
||||||
|
self._register_activation_trace_hooks(0, self._model)
|
||||||
|
|
||||||
# Calculate static parameter cuda memory
|
# Calculate static parameter cuda memory
|
||||||
self._param_mem_state = SimpleMemState("param_mem")
|
self._param_mem_state = SimpleMemState("param_mem")
|
||||||
|
@ -221,7 +256,7 @@ class SimpleMemoryProfiler:
|
||||||
self._calc_tensor_group_memory(self._os_params_mem_state, list(enumerate(self._optimizer.param_groups)))
|
self._calc_tensor_group_memory(self._os_params_mem_state, list(enumerate(self._optimizer.param_groups)))
|
||||||
|
|
||||||
# Generate the first memory record
|
# Generate the first memory record
|
||||||
self.point(create=True)
|
self.point(with_options="params,grads,os_params", create=True)
|
||||||
|
|
||||||
def point(self, with_options: str = "", create: bool = False) -> None:
|
def point(self, with_options: str = "", create: bool = False) -> None:
|
||||||
"""
|
"""
|
||||||
|
@ -272,7 +307,7 @@ class SimpleMemoryProfiler:
|
||||||
if "os_state" in options:
|
if "os_state" in options:
|
||||||
layout_info += "os_state_layout:\n" + self._os_state_mem_state.dump()
|
layout_info += "os_state_layout:\n" + self._os_state_mem_state.dump()
|
||||||
if "activation_base" in options:
|
if "activation_base" in options:
|
||||||
layout_info += "activation_base_layout:\n" + self._activation_base_mem.dump()
|
layout_info += "activation_base_layout:\n" + self._activation_base_mems.dump()
|
||||||
|
|
||||||
# Write memory state information to log file
|
# Write memory state information to log file
|
||||||
file_mode = "w" if create else "a"
|
file_mode = "w" if create else "a"
|
||||||
|
@ -315,14 +350,14 @@ class SimpleMemoryProfiler:
|
||||||
[self._os_params_mem_state.to_json(), self._os_state_mem_state.to_json()],
|
[self._os_params_mem_state.to_json(), self._os_state_mem_state.to_json()],
|
||||||
"os_memory_sunburst",
|
"os_memory_sunburst",
|
||||||
)
|
)
|
||||||
self._render_sunburst_chart(self._activation_base_mem.to_json()["children"], "activation_memory_sunburst")
|
self._render_sunburst_chart(self._activation_base_mems.to_json(), "activation_memory_sunburst")
|
||||||
# Generate summary sunburst chart
|
# Generate summary sunburst chart
|
||||||
summary_sunburst_data = [
|
summary_sunburst_data = [
|
||||||
{"name": "params", "value": self._param_mem_state.total_mem // mb},
|
{"name": "params", "value": self._param_mem_state.total_mem // mb},
|
||||||
{"name": "grads", "value": self._grad_mem_state.total_mem // mb},
|
{"name": "grads", "value": self._grad_mem_state.total_mem // mb},
|
||||||
{"name": "os_params", "value": self._os_params_mem_state.total_mem // mb},
|
{"name": "os_params", "value": self._os_params_mem_state.total_mem // mb},
|
||||||
{"name": "os_state", "value": self._os_state_mem_state.total_mem // mb},
|
{"name": "os_state", "value": self._os_state_mem_state.total_mem // mb},
|
||||||
{"name": "activation", "value": self._activation_base_mem.total_mem // mb},
|
{"name": "activation", "value": self._activation_mem_max // mb},
|
||||||
]
|
]
|
||||||
|
|
||||||
self._render_sunburst_chart(summary_sunburst_data, "summary_sunburst")
|
self._render_sunburst_chart(summary_sunburst_data, "summary_sunburst")
|
||||||
|
@ -337,12 +372,13 @@ class SimpleMemoryProfiler:
|
||||||
{},
|
{},
|
||||||
{
|
{
|
||||||
"r0": "10%",
|
"r0": "10%",
|
||||||
"r": "40%",
|
"r": "35%",
|
||||||
"itemStyle": {"borderWidth": 3},
|
"itemStyle": {"borderWidth": 3},
|
||||||
"label": {"align": "left"},
|
"label": {"align": "left"},
|
||||||
},
|
},
|
||||||
{"r0": "40%", "r": "65%", "label": {"align": "left"}},
|
{"r0": "35%", "r": "55%", "label": {"align": "left"}},
|
||||||
{"r0": "65%", "r": "80%", "label": {"align": "left"}},
|
{"r0": "55%", "r": "70%", "label": {"align": "left"}},
|
||||||
|
{"r0": "70%", "r": "80%", "label": {"align": "left"}},
|
||||||
{"r0": "80%", "r": "90%", "label": {"align": "left"}},
|
{"r0": "80%", "r": "90%", "label": {"align": "left"}},
|
||||||
{
|
{
|
||||||
"r0": "90%",
|
"r0": "90%",
|
||||||
|
@ -357,7 +393,14 @@ class SimpleMemoryProfiler:
|
||||||
f"{self._log_folder}/{name}.html"
|
f"{self._log_folder}/{name}.html"
|
||||||
)
|
)
|
||||||
|
|
||||||
def _inner_activation_trace_hook(self, layer_name: str, model: Any, inputs: Any, output: torch.Tensor) -> None:
|
def _inner_activation_trace_hook(
|
||||||
|
self,
|
||||||
|
chunk_id: int,
|
||||||
|
layer_name: str,
|
||||||
|
model: Any,
|
||||||
|
inputs: Any,
|
||||||
|
output: torch.Tensor,
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Hook function to trace the activation memory usage for an inner layer.
|
Hook function to trace the activation memory usage for an inner layer.
|
||||||
|
|
||||||
|
@ -373,13 +416,15 @@ class SimpleMemoryProfiler:
|
||||||
del model, inputs
|
del model, inputs
|
||||||
assert isinstance(output, torch.Tensor), f"Invalid output type: {type(output)}"
|
assert isinstance(output, torch.Tensor), f"Invalid output type: {type(output)}"
|
||||||
|
|
||||||
if self._stoped or self._activation_mem_inited:
|
if self._stoped or self._activation_base_mems.inited[chunk_id]:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Delay updating the total_mem of activation_base_mem here, it will be handled in the forward ending hook.
|
# Delay updating the total_mem of activation_base_mem here, it will be handled in the forward ending hook.
|
||||||
self._activation_base_mem.add(layer_name, output.element_size() * output.nelement(), flush=False)
|
self._activation_base_mems.states[chunk_id].add(
|
||||||
|
layer_name, output.element_size() * output.nelement(), flush=False
|
||||||
|
)
|
||||||
|
|
||||||
def _activation_trace_hook_forward(self, model: Any, inputs: Any, output: torch.Tensor) -> None:
|
def _activation_trace_hook_forward(self, chunk_id: int, model: Any, inputs: Any, output: torch.Tensor) -> None:
|
||||||
"""
|
"""
|
||||||
Hook function to trace the activation memory usage for a forward pass.
|
Hook function to trace the activation memory usage for a forward pass.
|
||||||
|
|
||||||
|
@ -398,23 +443,24 @@ class SimpleMemoryProfiler:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Check if the activation memory has been initialized
|
# Check if the activation memory has been initialized
|
||||||
if self._activation_mem_inited is False:
|
if self._activation_base_mems.inited[chunk_id] is False:
|
||||||
|
self._activation_base_mems.inited[chunk_id] = True
|
||||||
# Update the total memory of the activation base memory state
|
# Update the total memory of the activation base memory state
|
||||||
self._activation_base_mem.update_total_memory()
|
self._activation_base_mems.states[chunk_id].update_total_memory()
|
||||||
# Set with_options to "activation_base" to include activation_base_layout in the memory dump
|
# Set with_options to "activation_base" to include activation_base_layout in the memory dump
|
||||||
self._activation_mem_inited = True
|
with_options = "activation_base"
|
||||||
|
else:
|
||||||
|
with_options = ""
|
||||||
|
|
||||||
# Accumulate activation memory usage for each forward pass
|
# Accumulate activation memory usage for each forward pass
|
||||||
self._activation_mem += self._activation_base_mem.total_mem
|
self._activation_mem += self._activation_base_mems.states[chunk_id].total_mem
|
||||||
|
if self._activation_mem > self._activation_mem_max:
|
||||||
# Update activation max count
|
self._activation_mem_max = self._activation_mem
|
||||||
if self._activation_mem // self._activation_base_mem.total_mem > self._activation_max_count:
|
|
||||||
self._activation_max_count = self._activation_mem // self._activation_base_mem.total_mem
|
|
||||||
|
|
||||||
# Trigger a memory record
|
# Trigger a memory record
|
||||||
self.point()
|
self.point(with_options)
|
||||||
|
|
||||||
def _activation_tarce_hook_backward(self, model: Any, inputs: Any, grad_outputs: Any) -> None:
|
def _activation_tarce_hook_backward(self, chunk_id: int, model: Any, inputs: Any, grad_outputs: Any) -> None:
|
||||||
"""
|
"""
|
||||||
Hook function to trace the activation memory usage for a backward pass.
|
Hook function to trace the activation memory usage for a backward pass.
|
||||||
|
|
||||||
|
@ -432,37 +478,28 @@ class SimpleMemoryProfiler:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Release activation memory usage for each backward pass
|
# Release activation memory usage for each backward pass
|
||||||
self._activation_mem -= self._activation_base_mem.total_mem
|
self._activation_mem -= self._activation_base_mems.states[chunk_id].total_mem
|
||||||
|
|
||||||
# Trigger a memory record
|
# Trigger a memory record
|
||||||
self.point()
|
self.point()
|
||||||
|
|
||||||
def _register_activation_trace_hooks(self) -> None:
|
def _register_activation_trace_hooks(self, chunk_id: int, model_chunk: torch.nn.Module) -> None:
|
||||||
"""
|
"""
|
||||||
Register activation trace hooks for the model and each submodule in the model.
|
Register activation trace hooks for the model and each submodule in the model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Register inner activation trace hooks for each submodule in the model
|
# Register inner activation trace hooks for each submodule in the model
|
||||||
for layer_name in self._activation_config:
|
for layer_name, sub_model in model_chunk.named_modules():
|
||||||
# Register a hook for every activation
|
|
||||||
model = self._model
|
|
||||||
sub_models = layer_name.split(".")
|
|
||||||
# Get the target sub-model
|
|
||||||
for sub_model_name in sub_models:
|
|
||||||
try:
|
|
||||||
model = model.get_submodule(sub_model_name)
|
|
||||||
except AttributeError:
|
|
||||||
model = None
|
|
||||||
break
|
|
||||||
|
|
||||||
# Register the hook
|
# Register the hook
|
||||||
if model is not None:
|
if len(sub_model._modules) != 0:
|
||||||
model.register_forward_hook(partial(self._inner_activation_trace_hook, layer_name))
|
continue # TODO: in some special cases, we may need some additional configuration to correct
|
||||||
|
|
||||||
|
sub_model.register_forward_hook(partial(self._inner_activation_trace_hook, chunk_id, layer_name))
|
||||||
|
|
||||||
# Register a forward hook for the main model to track activation memory usage
|
# Register a forward hook for the main model to track activation memory usage
|
||||||
self._model.register_forward_hook(self._activation_trace_hook_forward)
|
model_chunk.register_forward_hook(partial(self._activation_trace_hook_forward, chunk_id))
|
||||||
# Register a backward hook for the main model to release activation memory usage
|
# Register a backward hook for the main model to release activation memory usage
|
||||||
self._model.register_full_backward_hook(self._activation_tarce_hook_backward)
|
model_chunk.register_full_backward_hook(partial(self._activation_tarce_hook_backward, chunk_id))
|
||||||
|
|
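The rewrite above drops the hand-written activation_config list and instead walks named_modules(), hooking only leaf modules (those with no children). A self-contained sketch of that registration pattern on a toy model:

```python
from functools import partial

import torch

# Accumulate per-layer activation sizes by hooking every leaf module.
activation_bytes = {}

def record_activation(layer_name, module, inputs, output):
    if isinstance(output, torch.Tensor):
        activation_bytes[layer_name] = output.element_size() * output.nelement()

model = torch.nn.Sequential(
    torch.nn.Linear(16, 32),
    torch.nn.Sequential(torch.nn.ReLU(), torch.nn.Linear(32, 8)),
)

for layer_name, sub_model in model.named_modules():
    if len(sub_model._modules) != 0:
        continue  # modules with children are skipped; only leaves get a hook
    sub_model.register_forward_hook(partial(record_activation, layer_name))

model(torch.randn(4, 16))
print(activation_bytes)  # {'0': 512, '1.0': 512, '1.1': 128} with float32 tensors
```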
||||||
def _calc_tensor_memory(
|
def _calc_tensor_memory(
|
||||||
self, root_stat: SimpleMemState, named_tensors: Dict[str, torch.Tensor], require_grad: bool = False
|
self, root_stat: SimpleMemState, named_tensors: Dict[str, torch.Tensor], require_grad: bool = False
|
||||||
|
@ -554,48 +591,6 @@ class SimpleMemoryProfiler:
|
||||||
self._calc_tensor_memory(root_stat, named_tensors)
|
self._calc_tensor_memory(root_stat, named_tensors)
|
||||||
|
|
||||||
|
|
||||||
def build_activation_config(num_layers: int, num_chunks: int = 1) -> List[str]:
|
|
||||||
# TODO: support interleaved pipeline scheduling.
|
|
||||||
assert num_chunks == 1, "Only support num_chunks == 1"
|
|
||||||
|
|
||||||
if gpc.is_initialized(ParallelMode.PIPELINE):
|
|
||||||
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
|
|
||||||
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
|
|
||||||
else:
|
|
||||||
pipeline_size = 1
|
|
||||||
pipeline_rank = 0
|
|
||||||
|
|
||||||
all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
|
|
||||||
parts = all_parts[pipeline_rank]
|
|
||||||
start, end = parts[0]
|
|
||||||
num_blocks = end - start
|
|
||||||
|
|
||||||
block_conf_tmpl = [
|
|
||||||
"mixer.rotary_emb",
|
|
||||||
"mixer.Wqkv",
|
|
||||||
"mixer.inner_attn",
|
|
||||||
"mixer.inner_cross_attn",
|
|
||||||
"mixer.out_proj",
|
|
||||||
# "dropout1", # skip when dropout_selective_checkpoint is True
|
|
||||||
# "dropout2", # skip when dropout_selective_checkpoint is True
|
|
||||||
"norm1",
|
|
||||||
"norm2",
|
|
||||||
"mlp.w1",
|
|
||||||
"mlp.w2",
|
|
||||||
"mlp.w3",
|
|
||||||
]
|
|
||||||
|
|
||||||
block_conf = []
|
|
||||||
for block_id in range(num_blocks):
|
|
||||||
block_conf += [f"blocks.{block_id}.{layer}" for layer in block_conf_tmpl]
|
|
||||||
|
|
||||||
# We don't need to care about whether the embedding, norm, and head layers exist in the model after partitioning.
|
|
||||||
# If they don't exist, they will be automatically ignored when registering activation trace hooks.
|
|
||||||
activation_conf = ["embedding", "norm", "head"] + block_conf
|
|
||||||
|
|
||||||
return activation_conf
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
class SimpleModel(torch.nn.Module):
|
class SimpleModel(torch.nn.Module):
|
||||||
|
@ -635,32 +630,39 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
def _simple_schedule(_num_chunks, _model_chunks, _input) -> torch.Tensor:
|
||||||
|
if _num_chunks > 1:
|
||||||
|
_output = _input
|
||||||
|
for _model_chunk in _model_chunks:
|
||||||
|
_output = _model_chunk(_output)
|
||||||
|
else:
|
||||||
|
_output = _model_chunks(_input)
|
||||||
|
|
||||||
|
return _output
|
||||||
|
|
||||||
|
# num_chunks config
|
||||||
|
_num_chunks = 1
|
||||||
|
|
||||||
# init model and optimizer
|
# init model and optimizer
|
||||||
_model: torch.nn.Module = SimpleModel()
|
if _num_chunks > 1:
|
||||||
|
_chunks = [SimpleModel(skip_layer2=idx % 2 == 0) for idx in range(_num_chunks)]
|
||||||
|
_model = torch.nn.ModuleList(_chunks).cuda()
|
||||||
|
else:
|
||||||
|
_model: torch.nn.Module = SimpleModel().cuda()
|
||||||
_optimizer = torch.optim.Adam(_model.parameters())
|
_optimizer = torch.optim.Adam(_model.parameters())
|
||||||
|
|
||||||
# create activation config for simple model layer by layer.
|
|
||||||
activation_configs = [
|
|
||||||
# model level 0
|
|
||||||
"layer1",
|
|
||||||
"layer2",
|
|
||||||
"layer3",
|
|
||||||
# model level 1
|
|
||||||
"layer2.layer1",
|
|
||||||
"layer2.layer3",
|
|
||||||
]
|
|
||||||
|
|
||||||
_model.modules()
|
|
||||||
|
|
||||||
# init profiler
|
# init profiler
|
||||||
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler.log", activation_configs)
|
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler", total_steps=1)
|
||||||
|
|
||||||
_optimizer.zero_grad()
|
_optimizer.zero_grad()
|
||||||
|
|
||||||
x1 = torch.randn((128, 5120))
|
# inputs
|
||||||
x2 = torch.randn((128, 5120))
|
x1 = torch.randn((128, 5120)).cuda()
|
||||||
out1 = _model(x1)
|
x2 = torch.randn((128, 5120)).cuda()
|
||||||
out2 = _model(x2)
|
# forward
|
||||||
|
out1 = _simple_schedule(_num_chunks, _model, x1)
|
||||||
|
out2 = _simple_schedule(_num_chunks, _model, x2)
|
||||||
|
# backward
|
||||||
out1.mean().backward()
|
out1.mean().backward()
|
||||||
out2.mean().backward()
|
out2.mean().backward()
|
||||||
|
|
||||||
|
|
|
@ -15,8 +15,6 @@ from asyncio.tasks import ALL_COMPLETED
 from datetime import datetime
 from typing import Any, Awaitable, Callable, Dict, List, Union

-import boto3
-import botocore
 import torch
 import torch.distributed as dist

@ -24,6 +22,13 @@ from internlm.core.context import global_context as gpc
 from internlm.utils.common import SingletonMeta
 from internlm.utils.logger import get_logger

+try:
+    import boto3
+    import botocore
+except ImportError:
+    pass
+

 logger = get_logger(__file__)

 boto3_url_re = re.compile(r"([^\.]+)\.([\d\.]+)")

@ -234,13 +239,13 @@ class Boto3Client(StorageClient):
         """
         paginator = handler.client.get_paginator("list_objects_v2")
         pages = paginator.paginate(Bucket=bucket_name, Prefix=fp)

         folder_name_list = []
         for page in pages:
-            for obj in page["Contents"]:
-                fp: str = obj["Key"]
-                folder_name_list.append(fp.rsplit("/", maxsplit=1)[1])
-        return folder_name_list
+            if "Contents" in page:
+                for obj in page["Contents"]:
+                    pth: str = obj["Key"]
+                    folder_name_list.append(pth.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0])
+        return list(set(folder_name_list))

     @staticmethod
     def async_upload_fileobj(handler, bucket_name: str, fp: str, local_nvme_path: str):
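The new pagination loop above skips pages without a "Contents" key, which `list_objects_v2` omits when the prefix matches nothing, and dedupes the first path component under the prefix. A standalone sketch of the same idea, with a hypothetical bucket and prefix:

import boto3

def list_subfolders(bucket: str, prefix: str):
    # Collect the first path component under `prefix`, skipping pages that
    # carry no "Contents" (e.g. when the prefix matches no objects).
    client = boto3.client("s3")
    paginator = client.get_paginator("list_objects_v2")
    names = set()
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        if "Contents" not in page:
            continue
        for obj in page["Contents"]:
            rest = obj["Key"][len(prefix):].strip("/")
            if rest:
                names.add(rest.split("/", maxsplit=1)[0])
    return sorted(names)

# Example (hypothetical bucket/prefix):
# print(list_subfolders("my-ckpt-bucket", "llm_ckpts/"))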
@ -391,6 +396,11 @@ class StorageManager(metaclass=SingletonMeta):
         self.tmp_local_folder = tmp_local_folder
         self.async_mode = async_mode
         self.has_warning = False
+        self._async_loop = None
+        self._thread_pool = None
+        self.latest_save_folder = None
+        self.latest_save_step = 0
+        self.async_task_peeding = False

         if enable_save and self.async_mode:
             self._async_loop = asyncio.new_event_loop()

@ -485,6 +495,7 @@ class StorageManager(metaclass=SingletonMeta):
                 torch.save(saved_obj, f, pickle_protocol=pickle.HIGHEST_PROTOCOL)
             self.async_executor(meta.async_upload_fn, *unpack_meta(meta))
             os.chmod(tmp_step_file, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
+            self.async_task_peeding = True
         else:
             meta.client.sync_upload_fileobj(*unpack_meta(meta), *args, saved_obj=saved_obj, **kwargs)
             self.upload_count += 1
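The `async_task_peeding` flag added above is what later allows the flush step to return early when no upload was scheduled. A rough sketch of that flag-guarded pattern using a plain thread pool (not the repository's actual event-loop implementation):

from concurrent.futures import ThreadPoolExecutor, wait

class AsyncSaver:
    def __init__(self):
        self._pool = ThreadPoolExecutor(max_workers=4)
        self._futures = []
        self.task_pending = False  # plays the role of async_task_peeding

    def submit(self, fn, *args):
        # Queue the upload and remember that there is outstanding work.
        self._futures.append(self._pool.submit(fn, *args))
        self.task_pending = True

    def wait_finish(self):
        if not self.task_pending:
            return  # nothing was scheduled since the last flush
        wait(self._futures)
        self._futures.clear()
        self.task_pending = False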
@ -523,23 +534,22 @@ class StorageManager(metaclass=SingletonMeta):
         pass

     async def _sync_tasks(self) -> Awaitable[None]:
-        if not self._async_stack:
-            return
-
-        await asyncio.wait(self._async_stack, return_when=ALL_COMPLETED)
-
-        for task in self._async_stack:
-            try:
-                task.exception()
-            except InvalidStateError:
-                continue
-            except Exception as e:
-                file_id = len(self._exception_list)
-                self._exception_list.append((e, file_id))
-
-                logger.error(f"File: {self._to_be_del_files[file_id]}, " f"upload failed with {e}")
-
-        self._async_stack.clear()
+        if self._async_stack:
+            await asyncio.wait(self._async_stack, return_when=ALL_COMPLETED)
+            count = 0
+            while self._async_stack:
+                t = self._async_stack[0]
+                try:
+                    e = t.exception()
+                    if e:
+                        self._exception_list.append((e, count))
+                        logger.error(f"File:{self._to_be_del_files[count]}, upload failed for {e}")
+                        # raise e
+                    count += 1
+                    self._async_stack.pop(0)
+                except InvalidStateError:
+                    # Not finished. https://docs.python.org/3/library/asyncio-task.html#asyncio.Task.exception
+                    pass

     def async_executor(self, fn: Callable, *args, **kwargs) -> None:
         """

@ -559,11 +569,14 @@ class StorageManager(metaclass=SingletonMeta):
         if not self.async_mode:
             return

+        if not self.async_task_peeding:
+            return
+
         if self._async_loop:
             self._async_loop.run_until_complete(self._sync_tasks())

         if self._exception_list:
-            for file_id, error_msg in self._exception_list:
+            for error_msg, file_id in self._exception_list:
                 logger.error(
                     f"Node:{socket.gethostname()}, Error: Checkpoint {self._to_be_del_files[file_id]} "
                     f"failed on step {self.upload_count}: {error_msg}"

@ -577,10 +590,16 @@ class StorageManager(metaclass=SingletonMeta):
         self._del_tmp_folder()
         self._exception_list.clear()
         self._to_be_del_files.clear()
+        self.async_task_peeding = False

         if gpc.is_rank_for_log():
-            logger.info("all async uploads succeeded!")
             self.upload_count += 1
+            if self.async_mode:
+                self.save(
+                    os.path.join(self.latest_save_folder, f"{self.latest_save_step}.step"),
+                    saved_obj=dict({"step": self.latest_save_step}),
+                    async_upload=False,
+                )


 storage_manager: StorageManager = None
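The rewritten `_sync_tasks` drains the task stack one entry at a time and treats `InvalidStateError` as "not finished yet" rather than as a failure. A compact, self-contained sketch of that inspection pattern, assuming all tasks live on the same event loop:

import asyncio

async def drain(tasks, names):
    # Wait for everything, then inspect each task in submission order.
    if not tasks:
        return []
    await asyncio.wait(tasks, return_when=asyncio.ALL_COMPLETED)
    failures = []
    for idx, task in enumerate(tasks):
        try:
            exc = task.exception()  # raises InvalidStateError if still running
            if exc:
                failures.append((exc, idx))
                print(f"File:{names[idx]}, upload failed for {exc}")
        except asyncio.InvalidStateError:
            pass  # not finished; leave it for the next flush
    return failures

async def _demo():
    async def ok():
        return 1

    async def bad():
        raise RuntimeError("boom")

    tasks = [asyncio.create_task(ok()), asyncio.create_task(bad())]
    return await drain(tasks, ["a.pt", "b.pt"])

print(asyncio.run(_demo()))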
@ -11,10 +11,6 @@ from torch.utils.tensorboard import SummaryWriter
 from internlm.core.context import global_context as gpc


-def copy_ignore_folder(source_path, target_path):
-    os.system(f"cp -r {source_path}/* {target_path}/")
-
-
 def tb_save_run_info(writer, config_lines, global_step=0):
     writer.add_text(tag="cmd", text_string=" ".join(sys.argv[:]), global_step=global_step)
     lines = []

@ -42,9 +38,21 @@ def init_tb_writer(
         tb_folder = tensorboard_folder

     if gpc.get_global_rank() == 0:
+        # If we don't load ckpt, 'resume_tb_folder' is set as the tensorboard
+        # dir of the last task by 'make_launch_script.sh'.
+        # If we load ckpt, 'resume_tb_folder' will be overwritten as the
+        # reloaded 'train_state.resume_tb_folder'.s
         if resume_tb_folder is not None:
-            logger.info(f"Try mv tensorboard logs: {resume_tb_folder} to {tb_folder}...")
-            copy_ignore_folder(resume_tb_folder, tb_folder)
+            assert len(resume_tb_folder) > 0 and resume_tb_folder != "/"
+            if not os.path.exists(resume_tb_folder):
+                logger.error(
+                    f"Can't found resume_tb_folder{resume_tb_folder}, \
+                    please make sure this folder is located at local file system."
+                )
+            else:
+                logger.info(f"Try mv tensorboard logs: {resume_tb_folder} to {tb_folder}... ")
+                os.system(f"cp -r {resume_tb_folder}/* {tb_folder}/")
+                os.system(f"chmod -R +w {tb_folder}/")
         else:
             logger.info(f"Login tensorboard logs to: {tb_folder}")

@ -126,6 +134,14 @@ class Writer:
         except Exception:
             traceback.print_exc()

+    def add_scalars(self, key, value, step):
+        try:
+            assert isinstance(value, dict)
+            if self.enable_tb and self.tb_writer is not None:
+                self.tb_writer.add_scalars(main_tag=key, tag_scalar_dict=value, global_step=step)
+        except Exception:
+            traceback.print_exc()
+
     def add_text(self, key, value, step):
         try:
             if self.enable_tb and self.tb_writer is not None:
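The new `Writer.add_scalars` forwards a dict of named values to TensorBoard under one main tag and swallows logging failures so they cannot break training. A minimal standalone version of the same wrapper (the log directory name is arbitrary):

import traceback

from torch.utils.tensorboard import SummaryWriter

class SafeWriter:
    def __init__(self, log_dir="tb_logs", enable_tb=True):
        self.enable_tb = enable_tb
        self.tb_writer = SummaryWriter(log_dir=log_dir) if enable_tb else None

    def add_scalars(self, key, value, step):
        # Validate the payload, write if enabled, and never let a logging
        # failure propagate into the training loop.
        try:
            assert isinstance(value, dict)
            if self.enable_tb and self.tb_writer is not None:
                self.tb_writer.add_scalars(main_tag=key, tag_scalar_dict=value, global_step=step)
        except Exception:
            traceback.print_exc()

# Usage:
# SafeWriter().add_scalars("loss", {"train": 1.23, "val": 1.30}, step=10)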
@ -13,4 +13,4 @@ boto3
 botocore
 torch-scatter
 pyecharts
--f https://data.pyg.org/whl/torch-1.13.0+cu117.html
+-f https://data.pyg.org/whl/torch-1.13.1+cu117.html
train.py
@ -5,99 +5,48 @@ import socket
 import time
 import traceback
 from functools import partial
-from typing import Iterable

-import numpy as np
 import torch
 import torch.distributed as dist
-from torch import nn
-from torch.utils.data import DataLoader

 import internlm
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.core.naive_amp import NaiveAMPModel
 from internlm.core.scheduler import SchedulerMetricHook
 from internlm.core.trainer import TrainState
-from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
-from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
-from internlm.data.dataset import get_dataset_dict
-from internlm.data.dummy_dataset import RandomDataset
-from internlm.data.packed_dataset import (
-    PackedDataset,
-    PackedDatasetWithoutCuSeqlen,
-    get_packed_dataset_without_short_length,
-)
-from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
+from internlm.initialize import initialize_distributed_env
 from internlm.model.loss import FlashGPTLMLoss
 from internlm.model.metrics import AccPerplex
-from internlm.monitor import initialize_monitor_manager, send_alert_message, set_env_var
+from internlm.monitor import initialize_monitor_manager, send_alert_message
 from internlm.monitor.monitor import monitor_manager as mm
-from internlm.solver.beta2_scheduler import Beta2Scheduler
-from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
-from internlm.solver.optimizer import HybridZeroOptimizer
+from internlm.train import (
+    get_train_data_loader,
+    get_validation_data_loader,
+    initialize_llm_profile,
+    initialize_model,
+    initialize_optimizer,
+    load_new_batch,
+    record_current_batch_training_metrics,
+)
 from internlm.utils.common import (
     BatchSkipper,
-    DummyProfile,
-    get_master_node,
     get_megatron_flops,
     launch_time,
     parse_args,
 )
 from internlm.utils.evaluation import evaluate_on_val_dls
+from internlm.utils.gputest import bench_gpu, bench_net
 from internlm.utils.logger import get_logger, initialize_uniscale_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
-from internlm.utils.model_checkpoint import (
-    CheckpointSaveManager,
-    load_context,
-    load_model_checkpoint,
-    load_optimizer_checkpoint,
-    load_sampler,
-    load_scheduler,
-)
-from internlm.utils.parallel import (
-    get_parallel_log_file_name,
-    is_no_pp_or_last_stage,
-    sync_model_param,
-    sync_model_param_within_tp,
-)
-from internlm.utils.registry import MODEL_INITIALIZER
-from internlm.utils.simple_memory_profiler import (
-    SimpleMemoryProfiler,
-    build_activation_config,
-)
+from internlm.utils.model_checkpoint import CheckpointManager
+from internlm.utils.parallel import get_parallel_log_file_name
+from internlm.utils.simple_memory_profiler import SimpleMemoryProfiler
 from internlm.utils.writer import Writer

 # global llm logger
 logger = get_logger(__file__)


-def initialize_distributed_env(config: str, launcher: str = "slurm", master_port: int = 8888, seed: int = 1024):
-    """
-    Initialize distributed environment for distributed training.
-
-    Args:
-        config (str): Config file path.
-        launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
-        master_port (str): The master port for distributed training. 8888 by default.
-        seed (int, optional): Specified random seed for every process. 1024 by default.
-    """
-
-    torch.cuda.empty_cache()
-
-    if launcher == "torch":
-        internlm.launch_from_torch(config=config, seed=seed)
-    elif launcher == "slurm":
-        internlm.launch_from_slurm(
-            config=config,
-            host=get_master_node(),
-            port=master_port,
-            seed=seed,
-        )
-    else:
-        assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
-
-
 def initialize_llm_logger(start_time: str):
     """
     Initialize customed uniscale logger.

@ -118,357 +67,14 @@ def initialize_llm_logger(start_time: str):
     return uniscale_logger


-def initialize_model():
-    """
-    Initialize model.
-
-    Returns: The neural network model to be trained or evaluated.
-    """
-
-    model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
-    if isinstance(model, nn.ModuleList):
-        model = nn.ModuleList(
-            [
-                NaiveAMPModel(
-                    model=_m,
-                    output_to_fp32=False,  # manually controlled by interleaved pipleline scheduler
-                    dtype=gpc.config.model.get("dtype", torch.half),
-                    sync_buffer=False,
-                )
-                for _m in model
-            ]
-        )
-    else:
-        model = NaiveAMPModel(
-            model=model,
-            output_to_fp32=is_no_pp_or_last_stage(),
-            dtype=gpc.config.model.get("dtype", torch.half),
-            sync_buffer=False,
-        )
-
-    # This sync is very important, cause the model weights kept in optimizer are copied
-    # from the origin parameters in the memory, so we should make sure the dp sync
-    # does not influence the model weights in optimizer be different with the origin parameters.
-    sync_model_param(model, parallel_mode=ParallelMode.DATA)
-
-    # This function is needed to make sure parameters that are not splitted by tensor parallelism are
-    # the same across tensor parallelism.
-    sync_model_param_within_tp(model)
-
-    return model
-
-
-def get_train_data_loader(num_worker: int = 0):
-    """
-    Generate and return the training data loader.
-
-    Returns: A tuple of (train_dl, dataset_types).
-    """
-
-    # Get the dataset types
-    dataset_types = None
-    dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
-    data_cfg = gpc.config.data
-
-    # Get the sample weight dictionary
-    train_folder = data_cfg.train_folder
-
-    if not train_folder:
-        train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
-        if data_cfg.pack_sample_into_one:
-            train_ds = PackedDatasetWithoutCuSeqlen(
-                train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
-            )
-        else:
-            train_ds = PackedDataset(
-                train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
-            )
-    else:
-        train_ds = get_packed_dataset_without_short_length(
-            folder=data_cfg.train_folder,
-            packed_length=data_cfg.packed_length,
-            max_length_per_sample=data_cfg.seq_len,
-            show_progress=dist.get_rank() == 0,
-            min_length=data_cfg.min_length,
-            min_length_dict=data_cfg.get("min_length_dict", {}),
-            pack_into_one_sample=data_cfg.pack_sample_into_one,
-        )
-
-    # partition already completed
-    # assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen))
-    if isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen)):
-        datasets = [train_ds]
-    else:
-        datasets = train_ds.datasets
-
-    # Create the training dataset sampler
-    train_sampler = StaticBatchSampler(
-        datasets,
-        batch_size=data_cfg.micro_num,
-        rampup_batch_size=data_cfg.rampup_batch_size,
-        micro_bsz=data_cfg.micro_bsz,
-        seed=1024,
-        drop_last=True,
-        data_rank=gpc.get_local_rank(ParallelMode.DATA),
-        data_world_size=gpc.get_world_size(ParallelMode.DATA),
-    )
-
-    train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
-
-    # Create the training data loader
-    train_dl = DataLoader(
-        dataset=train_ds,
-        batch_sampler=train_sampler,
-        num_workers=num_worker,
-        pin_memory=True,
-        collate_fn=train_collate_fn,
-        persistent_workers=True,
-    )
-
-    return train_dl, dataset_types
-
-
-def get_validation_data_loader(num_worker: int = 0):
-    """Generate and return the validation data loader."""
-
-    data_cfg = gpc.config.data
-
-    if not data_cfg.valid_folder:
-        val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
-    else:
-        val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
-
-    if not isinstance(val_ds, dict):
-        val_ds = {"val": val_ds}
-
-    val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
-
-    val_dls = {}
-    for val_name, ds in val_ds.items():
-        # making the batch_size of validate larger can speed up the evaluation, but it should not be too large,
-        # otherwise too much data may be dropped
-        batch_size = min(
-            data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
-        )
-        batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
-
-        if batch_size == 0 and gpc.is_rank_for_log():
-            logger.info(f"skip validate {val_name}.")
-            continue
-
-        val_dls[val_name] = get_dpsampler_dataloader(
-            ds, shuffle=False, num_workers=num_worker, batch_size=batch_size, collate_fn=val_collate_fn, drop_last=True
-        )  # drop_last=True, otherwise it may cause problems in the last batch
-
-        if gpc.is_rank_for_log():
-            logger.info(
-                f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
-                f"samples {str(len(val_dls[val_name]))}."
-            )
-
-    return val_dls
-
-
-def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
-    """
-    Load and return the new batch data based on training data loader.
-
-    Args:
-        train_dl (torch.utils.data.DataLoader): Dataloader for training.
-        train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
-        train_state (TrainState): Current training state.
-
-    Returns: A batch data and the updated train_iter.
-    """
-
-    timer("batch-gen").start()
-    try:
-        batch = next(train_iter)  # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
-        next(train_state.batch_sampler_iter)
-    except StopIteration:
-        train_iter = iter(train_dl)
-        batch = next(train_iter)
-        train_state.batch_sampler_iter = iter(train_state.batch_sampler)
-        next(train_state.batch_sampler_iter)
-        train_state.num_consumed_samples_in_epoch = 0
-    timer("batch-gen").stop()
-
-    if batch[0].get("type_ids", None) is not None:
-        # if use_flash_attn is False, we need to unpack type_ids
-        if not gpc.config.model.use_flash_attn:
-            batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
-
-    return batch, train_iter
-
-
-def initialize_optimizer(model: nn.Module):
-    """
-    Initialize optimizer.
-
-    Args:
-        model (torch.nn.Module): Your model instance to be trained or evaluated.
-
-    Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
-    """
-    adam_cfg = gpc.config.adam
-    naive_optimizer = torch.optim.AdamW(
-        params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
-        lr=adam_cfg.lr,
-        betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
-        eps=adam_cfg.adam_eps,
-    )
-
-    optimizer = HybridZeroOptimizer(
-        naive_optimizer, grad_scal_cfg=gpc.config.grad_scaler, zero_cfg=gpc.config.hybrid_zero_optimizer
-    )
-
-    beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
-
-    lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
-
-    return optimizer, beta2_scheduler, lr_scheduler
-
-
-def initialize_llm_profile(profiling: bool = False, start_time: str = None):
-    """Initialize and return the profiler context manager instance."""
-
-    if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
-        llm_profile = torch.profiler.profile
-        logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
-    else:
-        llm_profile = DummyProfile
-
-    return llm_profile(
-        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
-        schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
-        on_trace_ready=torch.profiler.tensorboard_trace_handler(
-            f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
-            + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
-            + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
-            + f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
-        ),
-        with_stack=True,
-        with_modules=True,
-    )
-
-
-def record_current_batch_training_metrics(
-    get_tflops_func,
-    logger,
-    writer,
-    success_update,
-    batch_count,
-    batch,
-    train_state,
-    optimizer,
-    beta2_scheduler,
-    trainer,
-    start_time,
-    loss,
-    grad_norm,
-    metric,
-    update_panel,
-):
-    """
-    Print some training metrics of current batch.
-    """
-
-    set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
-
-    if success_update in (0, True):
-        train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
-    if is_no_pp_or_last_stage():
-        acc_perplex = metric.get_metric()
-
-    if success_update and gpc.is_rank_for_log():
-        lr = optimizer.param_groups[0]["lr"]
-        if hasattr(trainer.engine.optimizer, "grad_scaler"):
-            scaler = trainer.engine.optimizer.grad_scaler._scale.item()
-        elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
-            scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
-
-        num_tokens_in_batch = batch[1].nelement()
-        num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
-        max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
-        max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
-        min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
-
-        tk_per_gpu = 0
-        tk_per_gpu = round(
-            num_tokens_in_batch
-            * gpc.get_world_size(ParallelMode.DATA)
-            / gpc.get_world_size(ParallelMode.GLOBAL)
-            / (time.time() - start_time),
-            2,
-        )
-
-        tflops = get_tflops_func((time.time() - start_time))
-
-        infos = {
-            "tflops": tflops,
-            "step": batch_count,
-            "loss": loss.item(),
-            "tgs (tokens/gpu/second)": tk_per_gpu,
-            "lr": lr,
-            "loss_scale": scaler,
-            "grad_norm": grad_norm,
-        }
-
-        infos["micro_num"] = len(batch[1])
-        infos["num_consumed_tokens"] = train_state.num_consumed_tokens
-        infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
-        infos["num_samples_in_batch"] = num_samples_in_batch  # the number of batches which have the most samples
-        infos["largest_length"] = max_length_in_batch  # the longest input
-        infos["largest_batch"] = max_samples_in_batch  # the batch with the most samples
-        infos["smallest_batch"] = min_samples_in_batch
-        infos["adam_beta2"] = beta2_scheduler.get_beta2()
-
-        fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
-        infos["fwd_bwd_time"] = fwd_bwd_time
-
-        for key, value in acc_perplex.items():
-            infos[key] = value
-
-        line = ""
-        for key, value in infos.items():
-            line += f"{key}={value} "
-            writer.add_scalar(key=key, value=value, step=train_state.step_count)
-
-        if update_panel:
-            logger.info(
-                line,
-                extra={
-                    "step": batch_count,
-                    "lr": lr,
-                    "num_consumed_tokens": train_state.num_consumed_tokens,
-                    "grad_norm": grad_norm,
-                    "loss": loss.item(),
-                    "flops": tflops,
-                    "tgs": tk_per_gpu,
-                    "acc": acc_perplex["acc"],
-                    "perplexity": acc_perplex["perplexity"],
-                    "fwd_bwd_time": fwd_bwd_time,
-                },
-            )
-        else:
-            logger.info(line)
-
-        # if loss spike occurs, send alert info to feishu
-        mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
-
-
 def main(args):
     # init setting
     skip_batches = gpc.config.data.skip_batches
     total_steps = gpc.config.data.total_steps
     valid_every = gpc.config.data.valid_every
-    load_optimizer = gpc.config.ckpt.load_optimizer
     label_smoothing = gpc.config.loss.label_smoothing
     lr = gpc.config.adam.lr

-    load_model_only_folder = gpc.config.ckpt.get("load_model_only_folder", None)
-    load_resume_ckpt_folder = gpc.config.ckpt.get("load_ckpt_folder", None)
-
     get_tflops_func = partial(
         get_megatron_flops,
         checkpoint=gpc.config.model.checkpoint,

@ -490,46 +96,22 @@ def main(args):
     # initialize customed llm logger
     uniscale_logger = initialize_llm_logger(start_time=current_time)

-    # initialize customed llm writer
-    with open(args.config, "r") as f:
-        config_lines = f.readlines()
-    writer = Writer(
-        job_name=gpc.config.JOB_NAME,
-        launch_time=current_time,
-        file_name=get_parallel_log_file_name(),
-        tensorboard_folder=gpc.config.tensorboard_folder,
-        resume_tb_folder=gpc.config.resume_tb_folder,
-        config=config_lines,
-        logger=logger,
-        enable_tb=gpc.config.enable_tb,
-    )
-
-    model_load_path = None
-    if load_resume_ckpt_folder is not None:
-        logger.info(
-            f"===========Resume training from `{load_resume_ckpt_folder}` {current_time} on host:"
-            f"{socket.gethostname()}==========="
-        )
-        model_load_path = load_resume_ckpt_folder
-    elif load_model_only_folder is not None:
-        logger.info(
-            f"===========SFT training from `{load_model_only_folder}` {current_time} on host:"
-            f"{socket.gethostname()}==========="
-        )
-        model_load_path = load_model_only_folder
-    else:
-        logger.info(
-            f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
-            f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
-            f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
-        )
-
     # initialize and resume train state
     train_state = TrainState(gpc.config)

     # initialize model
     model = initialize_model()

+    with open(args.config, "r") as f:
+        config_lines = f.readlines()
+    ckpt_manager = CheckpointManager(
+        ckpt_config=gpc.config.ckpt,
+        model=model,
+        model_config=gpc.config.model,
+        model_config_file="".join(config_lines),
+        feishu_address=gpc.config.alert_address,
+    )
+
     # initialize loss function
     criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)

@ -539,29 +121,24 @@ def main(args):
     train_state.init_batch_sampler(train_dl)

     # Loading model weights must be done before zero is initialized.
-    if model_load_path is not None:
-        load_model_checkpoint(folder=model_load_path, model=model)
+    ckpt_manager.try_load_model(current_time)

     optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)

     # Loading other persistent training states.
-    if load_resume_ckpt_folder is not None:
-        # load lr scheduler states.
-        load_scheduler(load_resume_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
-        # load training states.
-        load_context(load_resume_ckpt_folder, train_dl, train_state)
-        # load dataloader sampler states.
-        load_sampler(load_resume_ckpt_folder, train_dl.batch_sampler)
-        # load optimzier states.
-        if load_optimizer:
-            load_optimizer_checkpoint(load_resume_ckpt_folder, optimizer)
-
-    ckpt_save_manager = CheckpointSaveManager(
-        ckpt_config=gpc.config.ckpt,
-        model=model,
-        optimizer=optimizer,
-        lr_scheduler=lr_scheduler,
-        model_config=gpc.config.model,
-    )
+    ckpt_manager.try_resume_training(lr_scheduler, optimizer, lr, train_state, train_dl)
+
+    # initialize customed llm writer
+    writer = Writer(
+        job_name=gpc.config.JOB_NAME,
+        launch_time=current_time,
+        file_name=get_parallel_log_file_name(),
+        tensorboard_folder=gpc.config.tensorboard_folder,
+        resume_tb_folder=train_state.resume_tb_folder,  # resume from ckpt.
+        step_count=train_state.step_count,  # resume from ckpt.
+        config=config_lines,
+        logger=logger,
+        enable_tb=gpc.config.enable_tb,
+    )

     # initialize metric for calculating accuracy and perplexity

@ -598,12 +175,11 @@ def main(args):
     # initialize simple memory profiler
     if args.profiling:
         memory_profiler = SimpleMemoryProfiler(
-            model.model,
+            model,
             optimizer.optim,
             log_folder=f"memory_trace/rank{gpc.get_global_rank()}_"
             + f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
             + f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
-            activation_config=build_activation_config(gpc.config.model.num_layers),
         )
     else:
         memory_profiler = None

@ -621,6 +197,8 @@ def main(args):
         for batch_count in range(train_state.batch_count, total_steps):
             if batch_count % 50 == 0:
                 torch.cuda.empty_cache()
+                bench_gpu()
+                bench_net()

             start_time = time.time()
             timer("one-batch").start()

@ -645,6 +223,7 @@ def main(args):

             # do forward and backward
             timer("fwd-bwd").start()
+
             _, _, loss = trainer.execute_schedule(
                 batch, forward_only=False, return_loss=True, return_output_label=False
             )

@ -659,7 +238,7 @@ def main(args):
                 train_state.step_count += 1
             else:
                 train_state.inf_nan_skip_batches += 1  # record the amount of updating parameters unsuccessfully.
-                if -99.0 in grad_norm_groups and gpc.is_rank_for_log():  # -99.0 encodes a specific failure case
+                if -1 in grad_norm_groups.values() and gpc.is_rank_for_log():  # -1 encodes a specific failure case
                     logger.warning(f"Warning: skip parameter update at step {batch_count}.")
                     send_alert_message(
                         address=gpc.config.alert_address,

@ -680,7 +259,7 @@ def main(args):
                 trainer=trainer,
                 start_time=start_time,
                 loss=loss,
-                grad_norm=np.array(grad_norm_groups),
+                grad_norm=grad_norm_groups,
                 metric=metric,
                 update_panel=uniscale_logger is not None,
             )

@ -700,14 +279,17 @@ def main(args):

             # checkpoint the training states in specific steps, which is determined by the args "checkpoint_every"
             # # save batch sampler that tracks the true consumed samples
-            ckpt_save_manager.try_save_checkpoint(train_state)
+            now_break = ckpt_manager.try_save_checkpoint(train_state)
+            if now_break:
+                break

             if memory_profiler is not None:
                 memory_profiler.step()

-            prof.step()
+            if batch_count % 2 == 0:
+                prof.step()

-    ckpt_save_manager.wait_async_upload_finish()
+    ckpt_manager.wait_async_upload_finish()


 if __name__ == "__main__":
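Taken together, the hunks above let the checkpoint manager stop the loop (`try_save_checkpoint` now returns a break signal), advance the profiler only every other batch, and flush pending async uploads after the loop. A schematic of that control flow with stand-in classes; nothing here is the real InternLM API:

class FakeCheckpointManager:
    # Stand-in for the checkpoint manager: returns True when training should stop
    # (for example, a quota or stop signal was observed while saving).
    def __init__(self, stop_at):
        self.stop_at = stop_at

    def try_save_checkpoint(self, step):
        print(f"maybe save at step {step}")
        return step >= self.stop_at

    def wait_async_upload_finish(self):
        print("all pending uploads flushed")

class FakeProfiler:
    def step(self):
        print("profiler step")

ckpt_manager = FakeCheckpointManager(stop_at=3)
prof = FakeProfiler()

for batch_count in range(10):
    # ... forward/backward/optimizer update would run here ...
    if ckpt_manager.try_save_checkpoint(batch_count):
        break  # the manager asked the loop to stop after saving
    if batch_count % 2 == 0:
        prof.step()  # advance the profiler schedule every other batch

ckpt_manager.wait_async_upload_finish()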