Merge branch 'main' into develop

pull/275/head^2
yingtongxiong 2023-09-05 17:45:26 +08:00
commit 0e62d41137
71 changed files with 3116 additions and 898 deletions


@ -66,9 +66,23 @@ jobs:
- uses: actions/checkout@v3
- name: slurm-train
id: basic_train
run: |
source activate internlm-env-test
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_new_ckpt
run: |
source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
- name: torchrun-train
@ -96,7 +110,6 @@ jobs:
srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
cd ..
rm -rf $GITHUB_WORKSPACE/hf_ckpt
load-chat-model-in-hf:
if: ${{ always() }}
needs: check-requirements

.readthedocs.yml (new file, 28 lines)

@ -0,0 +1,28 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.8"
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: doc/code-docs/source/conf.py
fail_on_warning: false
# Optionally build your docs in additional formats such as PDF
formats:
- pdf
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: doc/code-docs/requirements.txt


@ -40,6 +40,10 @@ InternLM has open-sourced a base model with 7 billion parameters and a practical
Additionally, a lightweight training framework is provided that supports model pre-training without extensive dependencies. With a single codebase, it supports pre-training on large clusters with thousands of GPUs and fine-tuning on a single GPU, while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency when training on 1024 GPUs.
## News
InternLM-7B-Chat v1.1 has been released with code interpreter and function calling capabilities. You can try it with [Lagent](https://github.com/InternLM/lagent).
## InternLM-7B
### Performance Evaluation
@ -80,8 +84,8 @@ To load the InternLM 7B Chat model using Transformers
```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "こんにちは", history=[])
>>> print(response)


@ -45,6 +45,10 @@ InternLM (书生·浦语) includes 7-billion-parameter models for practical scenarios
A lightweight training framework is provided to support model pre-training without installing a large number of dependencies. A single codebase supports pre-training on thousand-GPU clusters and human-preference alignment training on a single GPU, with extreme performance optimization achieving nearly 90% acceleration efficiency for thousand-GPU training.
## News
We have open-sourced InternLM-Chat-7B v1.1. The model can invoke a code interpreter and tool plugins. You can try these new features in [Lagent](https://github.com/InternLM/lagent).
## InternLM-7B
### Performance Evaluation
@ -74,6 +78,7 @@ InternLM (书生·浦语) includes 7-billion-parameter models for practical scenarios
| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------ |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |
@ -85,8 +90,8 @@ InternLM (书生·浦语) includes 7-billion-parameter models for practical scenarios
```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "你好", history=[])
>>> print(response)
@ -117,26 +122,44 @@ streamlit run web_demo.py
We use [LMDeploy](https://github.com/InternLM/LMDeploy) for one-click deployment of InternLM.
1. First, install LMDeploy:
```bash
python3 -m pip install lmdeploy
```
2. Use the following command for quick deployment:
Run the following commands to chat interactively with the `internlm-chat-7b` model in the terminal, or chat with it through the WebUI.
```bash
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-7b/model
```
# convert the weight layout
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
3. After exporting the model, you can start a service directly with the following commands and chat with the AI assistant on the client:
# interactive chat in the terminal
python3 -m lmdeploy.turbomind.chat ./workspace
# start the gradio service
python3 -m lmdeploy.serve.gradio.app ./workspace
```
In the steps above, LMDeploy uses FP16 computation precision.
Besides FP16, LMDeploy also supports inference with the 4-bit quantized `internlm-chat-7b` model. It not only reduces the model's GPU memory footprint to 6 GB, roughly 40% of FP16, but, more importantly, with heavily optimized kernels its inference performance reaches more than 2.4x that of FP16 on an A100-80G.
The deployment steps for the 4-bit `internlm-chat-7b` model are as follows. For inference speed benchmarks, please refer to [here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md#%E6%8E%A8%E7%90%86%E9%80%9F%E5%BA%A6).
```bash
bash workspace/service_docker_up.sh
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
```
# download the pre-quantized internlm-chat-7b model from huggingface
git-lfs install
git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
# Convert the model's layout and store it in the default path, ./workspace.
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
# inference lmdeploy's turbomind engine
python3 -m lmdeploy.turbomind.chat ./workspace
# serving with gradio
python3 -m lmdeploy.serve.gradio.app ./workspace
```
LMDeploy is a full toolkit for compressing, deploying, and serving LLMs. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
[LMDeploy](https://github.com/InternLM/LMDeploy) supports the complete InternLM deployment workflow. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
## Fine-tuning & Training


@ -45,6 +45,10 @@ InternLM has open-sourced a 7 billion parameter base model and a chat model tail
Additionally, a lightweight training framework is offered to support model pre-training without the need for extensive dependencies. With a single codebase, it supports pre-training on large-scale clusters with thousands of GPUs, and fine-tuning on a single GPU while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency during training on 1024 GPUs.
## News
InternLM-7B-Chat v1.1 is released with code interpreter and function calling capability. You can try it with [Lagent](https://github.com/InternLM/lagent).
## InternLM-7B
### Performance Evaluation
@ -74,6 +78,7 @@ InternLM 7B and InternLM 7B Chat, trained using InternLM, have been open-sourced
| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |
@ -85,8 +90,8 @@ To load the InternLM 7B Chat model using Transformers, use the following code:
```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "hello", history=[])
>>> print(response)
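>>> # Illustrative follow-up turn (the prompt text is hypothetical): the returned
>>> # `history` can be passed back in to continue the multi-turn dialogue.
>>> response, history = model.chat(tokenizer, "please give three tips for staying focused", history=history)
>>> print(response)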
@ -118,28 +123,45 @@ The effect is as follows
### Deployment
We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-click deployment of InternLM.
1. First, install LMDeploy:
We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the full workflow of InternLM deployment.
```bash
python3 -m pip install lmdeploy
```
2. Use the following command for quick deployment:
You can use the following commands to run `internlm-chat-7b` FP16 inference, serve it, and interact with the AI assistant via the WebUI:
```bash
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b/model
# convert weight layout
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
# inference lmdeploy's turbomind engine
python3 -m lmdeploy.turbomind.chat ./workspace
# serving with gradio
python3 -m lmdeploy.serve.gradio.app ./workspace
```
3. After exporting the model, you can start a server and have a conversation with the deployed model using the following command:
You can also deploy the 4-bit quantized `internlm-chat-7b` model via LMDeploy. It trims the model's memory overhead down to 6 GB, just 40% of what FP16 inference would take. More importantly, with extremely optimized kernels, its inference performance reaches more than 2.4x that of FP16 inference on an A100-80G.
Try the following to run 4-bit `internlm-chat-7b` on a GeForce RTX 30-series GPU. You can find the inference benchmark [here](https://github.com/InternLM/lmdeploy/blob/main/docs/en/w4a16.md#inference-performance).
```bash
bash workspace/service_docker_up.sh
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
# download the pre-quantized internlm-chat-7b model from huggingface
git-lfs install
git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
# Convert the model's layout and store it in the default path, ./workspace.
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
# inference lmdeploy's turbomind engine
python3 -m lmdeploy.turbomind.chat ./workspace
# serving with gradio
python3 -m lmdeploy.serve.gradio.app ./workspace
```
[LMDeploy](https://github.com/InternLM/LMDeploy) provides a complete workflow for deploying InternLM. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
LMDeploy is an efficient toolkit for compressing, deploying, and serving LLMs. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
## Fine-tuning & Training


@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
def merge_dicts(dict_a: dict, dict_b: dict):
for key in dict_b.keys():
if isinstance(dict_b[key], dict):
dict_b[key] = {**dict_a[key], **dict_b[key]}
merge_dicts(dict_a[key], dict_b[key])
dict_c = {**dict_a, **dict_b}
return dict_c
def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
result = ""
for key, value in data.items():
if isinstance(value, dict):
result += f"{' ' * indent}{key} = dict(\n"
result += format_dict_to_py_string(value, indent + 4, is_nested=True)
result += f"{' ' * indent})"
else:
result += f"{' ' * indent}{key} = {repr(value)}"
if is_nested:
result += ","
result += "\n"
result = f"""\
{result}
"""
return result
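Taken together, these helpers deep-merge a test-case override into a base config dict and render the result back as Python source. A minimal usage sketch (the dictionaries below are made up for illustration; the import assumes the repository root is on `PYTHONPATH`, as the CI workflow sets):
```python
from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

base = {"model": {"dtype": "torch.float16", "num_layers": 32}, "data": {"seq_len": 2048}}
override = {"model": {"num_layers": 16}}

merged = merge_dicts(base, override)
# Nested dicts are merged key by key instead of being replaced wholesale:
# merged == {"model": {"dtype": "torch.float16", "num_layers": 16}, "data": {"seq_len": 2048}}

# Rendered roughly as:
#   model = dict(
#       dtype = 'torch.float16',
#       num_layers = 16,
#   )
#   data = dict(
#       seq_len = 2048,
#   )
print(format_dict_to_py_string(merged))
```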


@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()


@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
# oss: 'boto3:s3://model_weights/XXX'
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# SAVE_CKPT_FOLDER = "local:llm_ckpts"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
@ -27,7 +27,7 @@ ckpt = dict(
load_optimizer=True,
)
TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro_batch contained in one gradient update
@ -120,8 +120,8 @@ zero1 parallel:
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel: pipeline parallel size.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
pipeline parallel: pipeline parallel size, only 1 is accepted currently.
tensor parallel: tensor parallel size, usually the number of GPUs per node, only 1 is accepted currently.
"""
parallel = dict(
zero1=8,


@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import json
import os
from ci_scripts.common import com_func
from internlm.core.context import Config
def generate_new_config(config_py_file, test_config_json, case_name):
# generate path of the new config py
config_path = os.path.split(config_py_file)
new_config_py_file = os.path.join(config_path[0], case_name + ".py")
# merge dict
origin_config = Config.from_file(config_py_file)
with open(test_config_json) as f:
test_config = json.load(f)
if test_config:
if case_name not in test_config.keys():
raise KeyError(f"the {case_name} doesn't exist.Please check {test_config} again!")
new_config = com_func.merge_dicts(origin_config, test_config[case_name])
print(f"new config is:\n{new_config}")
# write new config to py file
file_content = com_func.format_dict_to_py_string(new_config)
with open(new_config_py_file, "w") as f:
f.write(file_content)
print(f"The new test train config file is {new_config_py_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--origin_config",
type=str,
default="./ci_scripts/train/ci_7B_sft.py",
help="path to the origin train config file",
)
parser.add_argument(
"--test_config",
type=str,
default="./ci_scripts/train/test_config.json",
help="path to the test train config file",
)
parser.add_argument("--case_name", type=str, help="name of the case which will be runned ")
args = parser.parse_args()
generate_new_config(args.origin_config, args.test_config, args.case_name)
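For reference, the same generation can be driven directly from Python rather than via the CLI defaults above. A minimal sketch, assuming the repository root is on `PYTHONPATH` (as the CI workflow sets), that `ci_scripts/train` is importable as a package, and that the case name is one of the keys in `test_config.json`:
```python
from ci_scripts.train.generate_config import generate_new_config

# Writes ./ci_scripts/train/7B_load_new_ckpt.py next to the origin config file.
generate_new_config(
    config_py_file="./ci_scripts/train/ci_7B_sft.py",
    test_config_json="./ci_scripts/train/test_config.json",
    case_name="7B_load_new_ckpt",
)
```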


@ -0,0 +1,38 @@
#!/bin/bash
set -x
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
expected_num=22
exit_code=0
source ./ci_scripts/common/basic_func.sh
echo "start to test slurm training with loading checkpoint."
python ./ci_scripts/train/generate_config.py --case_name $1
file="./ci_scripts/train/$1.py"
if [[ ! -f ${file} ]]; then
echo "expect: ${file} exists, actual: not exist."
exit_code=$(($exit_code + 1))
fi
srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
num=$(num_files "${CKPTS40_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1))
fi
exit $exit_code


@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
expected_num=21
expected_num=22
exit_code=0
source ./ci_scripts/common/basic_func.sh
@ -28,10 +28,4 @@ if [[ ${num} -ne ${expected_num} ]]; then
exit_code=$(($exit_code + 1))
fi
# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1))
fi
exit $exit_code


@ -0,0 +1,45 @@
{
"7B_basic_train": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"ckpt": {
"checkpoint_every": 20
},
"data": {
"total_steps": 20
}
},
"7B_load_new_ckpt": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
"ckpt": {
"load_ckpt_folder": "local:llm_ckpts/20",
"checkpoint_every": 20
},
"data": {
"total_steps": 40
}
},
"7B_load_preset_ckpt": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
"ckpt": {
"load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
"checkpoint_every": 20
},
"data": {
"total_steps": 40
}
}
}


@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
expected_num=21
expected_num=22
exit_code=0
source ./ci_scripts/common/basic_func.sh


@ -75,7 +75,8 @@ grad_scaler = dict(
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
zero_overlap_communication=True,
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
@ -120,12 +121,11 @@ model = dict(
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.bfloat16",
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
sequence_parallel=False,
)
"""
zero1 parallel:
@ -142,6 +142,7 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=8,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False

doc/code-docs/Makefile (new file, 20 lines)

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

doc/code-docs/make.bat (new file, 35 lines)

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd


@ -0,0 +1,10 @@
Sphinx
sphinx-autobuild
recommonmark
sphinx_rtd_theme
sphinx_markdown_tables
autodoc_pydantic==1.9
enum_tools
numpy
torch
tqdm


@ -0,0 +1,2 @@
Model Checkpointing
===================


@ -0,0 +1,91 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
import os
import sys
project = "InternLM"
copyright = "2023, InternLM Team"
author = "InternLM Team"
release = "v0.2.0"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"recommonmark",
"sphinx_rtd_theme",
"sphinx.ext.viewcode",
"sphinx.ext.autodoc",
"sphinxcontrib.autodoc_pydantic",
"sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon",
]
pygments_style = "sphinx"
# autodoc_pydantic config
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_field_signature_prefix = " "
autodoc_pydantic_model_signature_prefix = "class"
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_config_member = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_summary_list_order = "bysource"
autodoc_pydantic_model_member_order = "bysource"
autodoc_pydantic_field_list_validators = False
# Napoleon settings
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = True
napoleon_use_admonition_for_examples = False
napoleon_use_admonition_for_notes = False
napoleon_use_admonition_for_references = False
napoleon_use_ivar = False
napoleon_use_param = True
napoleon_use_rtype = True
napoleon_preprocess_types = False
napoleon_type_aliases = None
napoleon_attr_annotations = True
templates_path = ["_templates"]
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
# GitHub integration
html_context = {
"display_github": True,
"github_user": "pjlab",
"github_repo": "InternLM",
"github_version": "master",
"conf_py_path": "/doc/code-docs/source/",
}
sys.path.insert(0, os.path.abspath("../../../"))
# Prepend module names to class descriptions
add_module_names = True
autoclass_content = "class"
autodoc_mock_imports = [
"apex",
"torch",
"numpy",
]


@ -0,0 +1,70 @@
.. InternLM documentation master file, created by
sphinx-quickstart on Mon Aug 28 17:33:28 2023.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
InternLM
========
Environment Setup
-------------------
.. toctree::
:maxdepth: 2
install
Model Setup
-------------------
.. toctree::
:maxdepth: 2
initialize
Training API
-------------------
.. toctree::
:maxdepth: 2
training
Parallel Training
-------------------
.. toctree::
:maxdepth: 2
parallel
Model Checkpointing
-------------------
.. toctree::
:maxdepth: 2
checkpoint
Profiler
-------------------
.. toctree::
:maxdepth: 2
profiler
Monitor
-------------------
.. toctree::
:maxdepth: 2
monitor
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


@ -0,0 +1,35 @@
Training Setup
==============
.. _InternLM-args:
Argument Parsing
----------------
InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply command-line
configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default
parser with some builtin arguments; users can add custom parameters to this parser.
.. code-block:: python
# Get InternLM default parser
parser = internlm.initialize.get_default_parser()
# Add new argument
parser.add_argument("--user_arg", type=int, default=-1, help="arguments add by user.")
cmd_args = parser.parse_args()
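The parsed namespace then carries both the builtin options and any custom ones; a minimal sketch continuing the snippet above (``--user_arg`` is just the illustrative argument added there):

.. code-block:: python

    # Read the custom argument back after parsing
    if cmd_args.user_arg >= 0:
        print(f"user_arg = {cmd_args.user_arg}")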
.. autofunction:: internlm.initialize.get_default_parser
.. _InternLM-init:
Model Initialization
-------------------------
Optimizer Initialization
-------------------------
Dataloader Initialization
-------------------------
Trainer Initialization
-------------------------


@ -0,0 +1,70 @@
## Installation
### Environment Preparation
The required packages and corresponding versions are shown as follows:
- Python == 3.10
- GCC == 10.2.0
- MPFR == 4.1.0
- CUDA >= 11.7
- Pytorch >= 1.13.1
- Transformers >= 4.28.0
- Flash-Attention >= v1.0.5
- Apex == 23.05
- GPU with Ampere or Hopper architecture (such as H100, A100)
- Linux OS
After installing the above dependencies, some system environment variables need to be updated:
```bash
export CUDA_PATH={path_of_cuda_11.7}
export GCC_HOME={path_of_gcc_10.2.0}
export MPFR_HOME={path_of_mpfr_4.1.0}
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
```
### Environment Installation
Clone the project `internlm` and its dependent submodules from the github repository, as follows:
```bash
git clone git@github.com:InternLM/InternLM.git --recurse-submodules
```
It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:
```bash
conda create --name internlm-env python=3.10 -y
conda activate internlm-env
cd internlm
pip install -r requirements/torch.txt
pip install -r requirements/runtime.txt
```
Install flash-attention (version v1.0.5):
```bash
cd ./third_party/flash-attention
python setup.py install
cd ./csrc
cd fused_dense_lib && pip install -v .
cd ../xentropy && pip install -v .
cd ../rotary && pip install -v .
cd ../layer_norm && pip install -v .
cd ../../../../
```
Install Apex (version 23.05):
```bash
cd ./third_party/apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
```


@ -0,0 +1,10 @@
Monitor and Alert
=================
Monitoring
-----------------
Alerting
-----------------


@ -0,0 +1,23 @@
Parallel Training
=================
.. Give an overall description of how the parallel configuration is used, then explain each module in detail
Tensor Parallel
-----------------
Pipeline Parallel
-----------------
Sequence Parallel
-----------------
Data Parallel
-----------------
ZeRO1.5
-----------------


@ -0,0 +1,11 @@
Profiler
========
.. The usage of the torch profiler and memory profiler can be introduced here
Torch Profiler
-----------------
Memory Profiler
-----------------


@ -0,0 +1,2 @@
Training API
============


@ -59,12 +59,28 @@ cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
#### Image Configuration and Build
The configuration and build of the Dockerfile are handled through docker.Makefile. To build the image, execute the following command in the root directory of InternLM:
``` bash
make -f docker.Makefile BASE_OS=centos7
```
In docker.Makefile, you can customize the base image, environment versions, and so on; the corresponding parameters can be passed directly on the command line. BASE_OS supports ubuntu20.04 and centos7.
#### Pull Standard Image
Standard images based on ubuntu and centos have already been built and can be pulled directly:
```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
# ubuntu20.04
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
# centos7
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
#### Run Container
For a local standard image, whether built with the dockerfile or pulled, use the following command to run and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
```
The default directory in the container is `/InternLM`. Please start training according to the [Usage](./usage.md).


@ -6,11 +6,14 @@ The system code file structure is shown below:
├── internlm # Main directory of the system code
│ ├── apis # Interface module, containing some interface functions related to inference, etc.
│ ├── core # Core module, managing parallel context and training scheduling engine for training and inference
│ │ ├── communication # Communication module, responsible for p2p communication in pipeline parallel scheduling
│ │ ├── context # Context module, mainly responsible for initializing parallel process groups and managing parallel context
│ │ │ ├── parallel_context.py
│ │ │ └── process_group_initializer.py
│ │ ├── scheduler # Scheduling module, which manages schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
│ │ │ ├── no_pipeline_scheduler.py
│ │ │ └── pipeline_scheduler.py
│ │ ├── engine.py # Responsible for managing the training and evaluation process of the model
│ │ ├── no_pipeline_scheduler.py # Scheduler for parallel training
│ │ └── trainer.py # Responsible for managing the training engine and scheduler
│ ├── data # Data module, responsible for managing dataset generation and processing
│ ├── initialize # Initialization module, responsible for managing distributed environment startup and trainer initialization


@ -165,8 +165,9 @@ Training parallel configuration example:
```python
parallel = dict(
zero1=8,
pipeline=1,
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
```
@ -174,8 +175,11 @@ parallel = dict(
- When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- pipeline: pipeline parallel size, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default is 1
- pipeline: pipeline parallel strategy
- size: pipeline parallel size, the default value is 1
- interleaved_overlap: bool type; when using interleaved scheduling, enables or disables communication optimization, the default value is False
- sequence_parallel: Whether to enable sequence parallelism, the default value is False
Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
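As a concrete illustration (the GPU count below is assumed for the example, not taken from the configuration above):

```python
# Hypothetical cluster of 32 GPUs with the parallel settings shown above.
total_gpus = 32
pipeline_size = 1   # parallel["pipeline"]["size"]
tensor_size = 1     # tensor parallel size, defaults to 1
zero1_size = 8      # parallel["zero1"]

data_parallel_size = total_gpus // pipeline_size // tensor_size   # 32
# Optimizer states are sharded across groups of 8 ranks inside the
# 32-way data-parallel group; zero1 must not exceed the dp world size.
assert 1 < zero1_size <= data_parallel_size
```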


@ -59,11 +59,28 @@ cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
Users can build their own images using the provided dockerfile together with docker.Makefile, or obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
#### Image Configuration and Build
The configuration and build of the dockerfile are handled through docker.Makefile. Run the following command in the InternLM root directory to build the image:
``` bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
make -f docker.Makefile BASE_OS=centos7
```
In docker.Makefile you can customize the base image, environment versions, and so on; the corresponding parameters can be passed directly on the command line. BASE_OS supports ubuntu20.04 and centos7.
#### Pull Standard Image
Standard images based on ubuntu and centos have already been built and can be pulled directly:
```bash
# ubuntu20.04
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
# centos7
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
#### Run Container
For a local standard image, whether built with the dockerfile or pulled, use the following command to start and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
```
The default directory in the container is `/InternLM`. Start training according to the [usage documentation](./usage.md).


@ -6,11 +6,14 @@
├── internlm # Main directory of the system code
│ ├── apis # Interface module, containing interface functions related to inference, etc.
│ ├── core # Core module, managing the parallel context and training scheduling engine for training and inference
│ │ ├── communication # Communication module, responsible for p2p communication in pipeline parallel scheduling
│ │ ├── context # Context module, mainly responsible for initializing parallel process groups and managing the parallel context
│ │ │ ├── parallel_context.py
│ │ │ └── process_group_initializer.py
│ │ ├── scheduler # Scheduling module, managing schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
│ │ │ ├── no_pipeline_scheduler.py
│ │ │ └── pipeline_scheduler.py
│ │ ├── engine.py # Responsible for managing the training and evaluation process of the model
│ │ ├── no_pipeline_scheduler.py # Scheduler for parallel training
│ │ └── trainer.py # Responsible for managing the training engine and scheduler
│ ├── data # Data module, responsible for managing dataset generation and processing
│ ├── initialize # Initialization module, responsible for managing distributed environment startup and trainer initialization


@ -151,16 +151,20 @@ model = dict(
```python
parallel = dict(
zero1=8,
pipeline=1,
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
```
- zero1: zero parallel strategy, with the following three cases, default value is -1
- When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group
- pipeline: pipeline parallel size, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
- pipeline: pipeline parallel strategy
- size: pipeline parallel size, the default value is 1
- interleaved_overlap: bool type; when using interleaved scheduling, enables or disables communication optimization, the default value is False
- sequence_parallel: whether to enable sequence parallelism, the default value is False
Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`

docker.Makefile (new file, 107 lines)

@ -0,0 +1,107 @@
DOCKER_REGISTRY ?= docker.io
DOCKER_ORG ?= my
DOCKER_IMAGE ?= internlm
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)
CUDA_VERSION = 11.7.1
GCC_VERSION = 10.2.0
CUDNN_VERSION = 8
BASE_RUNTIME =
# ubuntu20.04 centos7
BASE_OS = centos7
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-${BASE_OS}
# The conda channel to use to install cudatoolkit
CUDA_CHANNEL = nvidia
# The conda channel to use to install pytorch / torchvision
INSTALL_CHANNEL ?= pytorch
PYTHON_VERSION ?= 3.10
PYTORCH_VERSION ?= 1.13.1
TORCHVISION_VERSION ?= 0.14.1
TORCHAUDIO_VERSION ?= 0.13.1
BUILD_PROGRESS ?= auto
TRITON_VERSION ?=
GMP_VERSION ?= 6.2.1
MPFR_VERSION ?= 4.1.0
MPC_VERSION ?= 1.2.1
GCC_VERSION ?= 10.2.0
HTTPS_PROXY_I ?=
HTTP_PROXY_I ?=
FLASH_ATTEN_VERSION ?= 1.0.5
FLASH_ATTEN_TAG ?= v${FLASH_ATTEN_VERSION}
BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
--build-arg CUDA_VERSION=$(CUDA_VERSION) \
--build-arg CUDA_CHANNEL=$(CUDA_CHANNEL) \
--build-arg PYTORCH_VERSION=$(PYTORCH_VERSION) \
--build-arg TORCHVISION_VERSION=$(TORCHVISION_VERSION) \
--build-arg TORCHAUDIO_VERSION=$(TORCHAUDIO_VERSION) \
--build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) \
--build-arg TRITON_VERSION=$(TRITON_VERSION) \
--build-arg GMP_VERSION=$(GMP_VERSION) \
--build-arg MPFR_VERSION=$(MPFR_VERSION) \
--build-arg MPC_VERSION=$(MPC_VERSION) \
--build-arg GCC_VERSION=$(GCC_VERSION) \
--build-arg https_proxy=$(HTTPS_PROXY_I) \
--build-arg http_proxy=$(HTTP_PROXY_I) \
--build-arg FLASH_ATTEN_TAG=$(FLASH_ATTEN_TAG)
EXTRA_DOCKER_BUILD_FLAGS ?=
BUILD ?= build
# Intentionally left blank
PLATFORMS_FLAG ?=
PUSH_FLAG ?=
USE_BUILDX ?=1
BUILD_PLATFORMS ?=
WITH_PUSH ?= false
BUILD_TYPE ?= intrenlm-dev
# Setup buildx flags
ifneq ("$(USE_BUILDX)","")
BUILD = buildx build
ifneq ("$(BUILD_PLATFORMS)","")
PLATFORMS_FLAG = --platform="$(BUILD_PLATFORMS)"
endif
endif
# endif
# # Only set platforms flags if using buildx
# ifeq ("$(WITH_PUSH)","true")
# PUSH_FLAG = --push
# endif
# endif
ifeq ($(findstring centos,$(BASE_OS)),centos)
DOCKERFILE_PATH ?= ./docker/Dockerfile-centos
else
DOCKERFILE_PATH ?= ./docker/Dockerfile-ubuntu
endif
#use -f to specify dockerfile
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
docker $(BUILD) \
--progress=$(BUILD_PROGRESS) \
$(EXTRA_DOCKER_BUILD_FLAGS) \
$(PLATFORMS_FLAG) \
$(PUSH_FLAG) \
-f $(DOCKERFILE_PATH) \
-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
$(BUILD_ARGS) .
# --target $(BUILD_TYPE)
.PHONY: all
all: devel-image
.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}-flashatten${FLASH_ATTEN_VERSION}-${BASE_OS}
devel-image:
$(DOCKER_BUILD)
.PHONY: clean
clean:
-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))

docker/Dockerfile-centos (new file, 131 lines)

@ -0,0 +1,131 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on centos
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN yum install deltarpm -y && yum update -y \
&& yum install -y \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
bzip2 \
gcc \
gcc-c++ \
file \
texinfo \
which
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& git clone https://github.com/ninja-build/ninja.git \
&& cd ninja \
&& git checkout release \
&& ./configure.py --bootstrap \
&& mv ./ninja /usr/bin \
&& cd ..
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
RUN git submodule update --init --recursive \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
&& cd /InternLM/third_party/flash-attention \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip

docker/Dockerfile-ubuntu (new file, 112 lines)

@ -0,0 +1,112 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on ubuntu
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
ninja-build
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
RUN git submodule update --init --recursive \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
&& cd /InternLM/third_party/flash-attention \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip


@ -0,0 +1,161 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on centos
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN yum install deltarpm -y && yum update -y \
&& yum install -y \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
bzip2 \
gcc \
gcc-c++ \
file \
texinfo \
which
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& git clone https://github.com/ninja-build/ninja.git \
&& cd ninja \
&& git checkout release \
&& ./configure.py --bootstrap \
&& mv ./ninja /usr/bin \
&& cd ..
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG PYTORCH_VERSION
ARG TORCHVISION_VERSION
ARG TORCHAUDIO_VERSION
RUN /opt/conda/bin/pip --no-cache-dir install \
transformers==4.29.2 \
sentencepiece \
numpy \
tqdm \
psutil \
packaging \
pre-commit \
ninja \
gputil \
pytest \
packaging \
boto3 \
botocore \
torch-scatter \
pyecharts \
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
&& /opt/conda/bin/pip --no-cache-dir install \
--extra-index-url https://download.pytorch.org/whl/cu117 \
torch==${PYTORCH_VERSION}+cu117 \
torchvision==${TORCHVISION_VERSION}+cu117 \
torchaudio==${TORCHAUDIO_VERSION}
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
ARG FLASH_ATTEN_TAG
RUN git submodule update --init --recursive \
&& cd /InternLM/third_party/flash-attention \
&& git checkout ${FLASH_ATTEN_TAG} \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip


@ -0,0 +1,142 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on ubuntu
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
ninja-build
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG PYTORCH_VERSION
ARG TORCHVISION_VERSION
ARG TORCHAUDIO_VERSION
RUN /opt/conda/bin/pip --no-cache-dir install \
transformers==4.29.2 \
sentencepiece \
numpy \
tqdm \
psutil \
packaging \
pre-commit \
ninja \
gputil \
pytest \
packaging \
boto3 \
botocore \
torch-scatter \
pyecharts \
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
&& /opt/conda/bin/pip --no-cache-dir install \
--extra-index-url https://download.pytorch.org/whl/cu117 \
torch==${PYTORCH_VERSION}+cu117 \
torchvision==${TORCHVISION_VERSION}+cu117 \
torchaudio==${TORCHAUDIO_VERSION}
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
ARG FLASH_ATTEN_TAG
RUN git submodule update --init --recursive \
&& cd /InternLM/third_party/flash-attention \
&& git checkout ${FLASH_ATTEN_TAG} \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip

25
experiment/README-CN.md Normal file
View File

@ -0,0 +1,25 @@
## Experimental Environment Image
This module is used to test new environment versions; by default it tests torch=2.0.1 and flash-attention=2.1.0. The new environment may be unstable; for the standard environment installation, please refer to the [installation guide](../doc/install.md).
### Build and Pull Image
To build the image, run docker.Makefile from the InternLM root directory. The Makefile is shared with the standard environment image, while the Dockerfiles it uses are located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:
```bash
# Build the image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# Pull the image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```
### Run Container
For a local image built with the Dockerfile or pulled from the registry, use the following command to start and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```
The default working directory inside the container is `/InternLM`; start training according to the [usage guide](../doc/usage.md).

25
experiment/README-EN.md Normal file
View File

@ -0,0 +1,25 @@
## Experimental Environment Image
This module is used to test new environment versions; by default it tests torch=2.0.1 and flash-attention=2.1.0. The new environment may be unstable; for the standard environment installation, please refer to the [installation guide](../doc/en/install.md).
### Build and Pull Image
To build the image, run docker.Makefile from the InternLM root directory. The Makefile is shared with the standard environment image, while the Dockerfiles it uses are located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:
```bash
# Build Image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# Pull Image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```
### Run Container
For a local image built with the Dockerfile or pulled from the registry, use the following command to start and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```
The default working directory inside the container is `/InternLM`; start training according to the [Usage](../doc/en/usage.md).

View File

@ -7,6 +7,7 @@ from .parallel_context import (
from .process_group_initializer import (
Initializer_Data,
Initializer_Model,
Initializer_Nettest,
Initializer_Pipeline,
Initializer_Tensor,
Initializer_Zero1,
@ -34,6 +35,7 @@ __all__ = [
"Initializer_Pipeline",
"Initializer_Data",
"Initializer_Zero1",
"Initializer_Nettest",
"ProcessGroupInitializer",
"Initializer_Model",
"seed",

View File

@ -143,6 +143,7 @@ class ParallelContext(metaclass=SingletonMeta):
self.pipeline_parallel_size = 1
self.tensor_parallel_size = 1
self.zero1_parallel_size = -1
self.nettest_parallel_size = 1
self.num_processes_on_current_node = -1
self.virtual_pipeline_parallel_size = None
self.virtual_pipeline_parallel_rank = None
@ -442,6 +443,9 @@ class ParallelContext(metaclass=SingletonMeta):
# instead, it should be calculated based on other parallel config
self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
# the recommended nettest_parallel_size is 32 GPUs
self.nettest_parallel_size = 32
if self.zero1_parallel_size <= 0:
self.zero1_parallel_size = self.data_parallel_size
@ -454,6 +458,7 @@ class ParallelContext(metaclass=SingletonMeta):
self.pipeline_parallel_size,
self.tensor_parallel_size,
self.zero1_parallel_size,
self.nettest_parallel_size,
]
# run initialization of different process groups
@ -462,6 +467,7 @@ class ParallelContext(metaclass=SingletonMeta):
initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
if self.pipeline_parallel_size > 1:
initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
for initializer in initializers:

View File

@ -3,6 +3,7 @@
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
import math
from abc import ABC, abstractmethod
from enum import Enum
@ -31,6 +32,9 @@ class ParallelMode(Enum):
# zero1 parallel
ZERO1 = "zero1"
# runtime network test
NETTEST = "nettest"
class ProcessGroupInitializer(ABC):
"""An object, knowing the parallelism configuration, that initializes parallel groups.
@ -52,6 +56,7 @@ class ProcessGroupInitializer(ABC):
pipeline_parallel_size: int,
tensor_parallel_size: int,
zero1_parallel_size: int,
nettest_parallel_size: int,
):
self.rank = rank
self.world_size = world_size
@ -59,6 +64,7 @@ class ProcessGroupInitializer(ABC):
self.pipeline_parallel_size = pipeline_parallel_size
self.tensor_parallel_size = tensor_parallel_size
self.zero1_parallel_size = zero1_parallel_size
self.nettest_parallel_size = nettest_parallel_size
super().__init__()
@abstractmethod
@ -332,3 +338,52 @@ class Initializer_Zero1(ProcessGroupInitializer):
ranks_in_group = ranks
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
class Initializer_Nettest(ProcessGroupInitializer):
"""A ProcessGroupInitializer for network test, especailly for NCCL.
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
nettest_parallel_size (int): Size of a network test group.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
def init_dist_group(self, use_cpu: bool = False):
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A network test group's information tuple.
"""
local_rank = None
ranks_in_group = None
process_group = None
cpu_group = None
group_world_size = None
mode = ParallelMode.NETTEST
for i in range(self.num_nettest_group):
ranks = []
for j in range(self.nettest_parallel_size):
rank = i * self.nettest_parallel_size + j
if rank < self.world_size:
ranks.append(rank)
group = dist.new_group(ranks)
if use_cpu:
group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
else:
group_cpu = None
if self.rank in ranks:
local_rank = ranks.index(self.rank)
group_world_size = len(ranks)
process_group = group
cpu_group = group_cpu
ranks_in_group = ranks
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
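A minimal standalone sketch of how this initializer partitions ranks into network-test groups (the sizes here are purely illustrative; in training they come from gpc): each chunk of `nettest_parallel_size` consecutive ranks forms one group, and the last group simply holds the remainder.
```python
import math

# Illustrative sizes only.
world_size = 70
nettest_parallel_size = 32

num_nettest_group = math.ceil(world_size / nettest_parallel_size)  # -> 3
for i in range(num_nettest_group):
    ranks = [
        i * nettest_parallel_size + j
        for j in range(nettest_parallel_size)
        if i * nettest_parallel_size + j < world_size
    ]
    print(f"nettest group {i}: ranks {ranks[0]}..{ranks[-1]} ({len(ranks)} ranks)")
# nettest group 0: ranks 0..31 (32 ranks)
# nettest group 1: ranks 32..63 (32 ranks)
# nettest group 2: ranks 64..69 (6 ranks)
```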

View File

@ -30,7 +30,7 @@ def get_tensor_shape():
if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
if gpc.config.model.use_flash_attn:
if gpc.config.model.sequence_parallel:
if gpc.config.parallel.sequence_parallel:
sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
tensor_shape = (
gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
@ -140,7 +140,7 @@ class PipelineScheduler(BaseScheduler):
and gpc.get_world_size(ParallelMode.TENSOR) > 1
)
if gpc.config.model.sequence_parallel:
if gpc.config.parallel.sequence_parallel:
self.scatter_gather_tensors = False
# cache for the batch data

View File

@ -38,6 +38,11 @@ class TrainState:
# Total step count
self.total_steps: int = config.data.total_steps
# resume tensorboard folder, need load from checkpoint or set manually.
self.resume_tb_folder = config.resume_tb_folder
self.tensorboard_folder = config.tensorboard_folder
def init_batch_sampler(self, train_dl):
# Copy of the batch sampler from the DataLoader
self.batch_sampler = train_dl.batch_sampler.copy()
@ -73,9 +78,13 @@ class TrainState:
self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1
# track the actual updates of sampler when using weighted sampling
if hasattr(self, "batch_sampler"):
self.batch_sampler = train_dl.batch_sampler.copy()
self.batch_sampler_iter = iter(self.batch_sampler)
# resume tensorboard from older tensorboard_folder
self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
def state_dict(self):
return {
"batch_count": self.batch_count,
@ -83,6 +92,7 @@ class TrainState:
"num_consumed_tokens": self.num_consumed_tokens,
"inf_nan_skip_batches": self.inf_nan_skip_batches,
"step_count": self.step_count,
"tensorboard_folder": self.tensorboard_folder,
}
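A small mock of the round trip added above (a plain dict stands in for the full `TrainState`, whose constructor needs the training config): the folder used by the current run is written out by `state_dict()` and, on resume, read back as the folder to continue logging into.
```python
# Hypothetical saved state; only the key relevant to tensorboard resume is shown.
saved_state = {"tensorboard_folder": "tb_logs/run_2023_09_05"}

# Mirrors the load path above: the previously used folder becomes resume_tb_folder,
# so the resumed run appends events to the same location.
resume_tb_folder = saved_state.get("tensorboard_folder", None)
print(resume_tb_folder)  # tb_logs/run_2023_09_05
```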

View File

@ -5,7 +5,7 @@ import torch
from internlm.core.context import global_context as gpc
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1, "code": 2}
def get_dataset_type_id(path):

View File

@ -1,9 +1,15 @@
from .initialize_trainer import initialize_trainer
from .launch import get_default_parser, launch_from_slurm, launch_from_torch
from .launch import (
get_default_parser,
initialize_distributed_env,
launch_from_slurm,
launch_from_torch,
)
__all__ = [
"get_default_parser",
"initialize_trainer",
"launch_from_slurm",
"launch_from_torch",
"initialize_distributed_env",
]

View File

@ -3,16 +3,15 @@
import math
import torch
from torch import Tensor, nn
def scaled_init_method_normal(sigma, num_layers):
def scaled_init_method_normal(sigma: float = 1.0, num_layers: int = 1):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
return nn.init.normal_(tensor, mean=0.0, std=std)
return init_
@ -32,3 +31,33 @@ def normal_(mean: float = 0.0, std: float = 1.0):
return nn.init.normal_(tensor, mean, std)
return initializer
def scaled_init_method_uniform(sigma: float = 1.0, num_layers: int = 1):
"""Init method based on p(x)=Uniform(-a, a) where std(x)=sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
a = math.sqrt(3.0) * std
def init_(tensor):
return nn.init.uniform_(tensor, -a, a)
return init_
def uniform_(mean: float = 0.0, std: float = 1.0):
r"""Return the initializer filling the input Tensor with values drawn from the uniform distribution
.. math::
\mathcal{U}(mean-a, mean+a), where a satisfies \mathcal{U}_{std}=std.
Args:
mean (float): the mean of the uniform distribution. Defaults 0.0.
std (float): the standard deviation of the uniform distribution. Defaults 1.0.
"""
a = math.sqrt(3.0) * std
def initializer(tensor: Tensor):
return nn.init.uniform_(tensor, mean - a, mean + a)
return initializer
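For reference, the bound used in both uniform initializers follows from the standard variance of a uniform distribution (a textbook identity, not code from this repo):
```latex
\operatorname{Var}\!\left[\,\mathcal{U}(-a,\,a)\,\right] = \frac{(2a)^2}{12} = \frac{a^2}{3}
\;\;\Longrightarrow\;\;
\operatorname{std} = \frac{a}{\sqrt{3}}
\;\;\Longrightarrow\;\;
a = \sqrt{3}\cdot\operatorname{std}.
```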

View File

@ -10,6 +10,7 @@ import torch
from internlm.core.context import Config
from internlm.core.context import global_context as gpc
from internlm.utils.common import get_master_node
from internlm.utils.logger import get_logger
from internlm.utils.storage_manager import init_storage_manager
@ -108,67 +109,100 @@ def args_sanity_check():
logger.info(f"valid_every: {data.valid_every}")
# processing the checkpoint config
if "enable_save_ckpt" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("enable_save_ckpt", False)
ckpt = gpc.config.ckpt
if "enable_save_ckpt" not in ckpt:
ckpt._add_item("enable_save_ckpt", False)
if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0:
gpc.config.ckpt._add_item("checkpoint_every", float("inf"))
# Saving checkpoint args.
if ckpt.enable_save_ckpt:
assert "checkpoint_every" in ckpt, "If enable save checkpoint, must give checkpoint_every in config.data!"
assert ckpt.checkpoint_every > 0
assert "save_ckpt_folder" in ckpt, "If enable save checkpoint, must give save_ckpt_folder in config.data!"
if "load_optimizer" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_optimizer", True)
if "save_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("save_ckpt_folder", None)
if "load_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_ckpt_folder", None)
if "load_model_only_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_model_only_folder", None)
if "async_upload" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload", False)
if "async_upload_tmp_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
if gpc.config.ckpt.async_upload:
assert "save_ckpt_folder" in gpc.config.ckpt
if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
if "async_upload" not in ckpt:
ckpt._add_item("async_upload", False) # async defalut is False.
else:
if ckpt.async_upload:
assert "save_ckpt_folder" in ckpt
if "boto3:" not in ckpt.save_ckpt_folder:
if gpc.is_rank_for_log():
logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!")
gpc.config.ckpt.async_upload = False
logger.warning(
"Storing ckpt on file system does not support asynchronous storage, will use sync save!"
)
ckpt.async_upload = False
else:
if "async_upload_tmp_folder" not in ckpt:
ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
if "snapshot_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot"))
if not ckpt.async_upload:
ckpt._add_item("async_upload_tmp_folder", None)
if "oss_snapshot_freq" not in gpc.config.ckpt and gpc.config.ckpt.checkpoint_every != float("inf"):
gpc.config.ckpt._add_item("oss_snapshot_freq", gpc.config.ckpt.checkpoint_every / 2)
assert gpc.config.ckpt.oss_snapshot_freq > 0
if "snapshot_ckpt_folder" not in ckpt:
ckpt._add_item("snapshot_ckpt_folder", os.path.join(ckpt.save_ckpt_folder, "snapshot"))
assert not (
gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None
), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time."
if "oss_snapshot_freq" not in ckpt:
ckpt._add_item("oss_snapshot_freq", float("inf")) # if oss_snapshot_freq not given, we disable.
else:
ckpt._add_item("checkpoint_every", float("inf"))
ckpt._add_item("oss_snapshot_freq", float("inf"))
ckpt._add_item("save_ckpt_folder", None)
ckpt._add_item("async_upload", False)
ckpt._add_item("async_upload_tmp_folder", None)
ckpt._add_item("snapshot_ckpt_folder", None)
ckpt._add_item("snapshot_ckpt_folder", None)
# Loading checkpoint args.
if "load_model_only_folder" not in ckpt:
ckpt._add_item("load_model_only_folder", None)
if "load_ckpt_folder" not in ckpt:
ckpt._add_item("load_ckpt_folder", None)
if "load_optimizer" not in ckpt:
ckpt._add_item("load_optimizer", True)
if "stop_file_path" not in ckpt:
ckpt._add_item("stop_file_path", None)
if "load_given_ckpt" not in ckpt:
# If 'load_given_ckpt' is not given, we set it to False, so internlm has the opportunity
# to auto-load the latest checkpoint.
ckpt._add_item("load_given_ckpt", False)
if ckpt.load_given_ckpt:
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
if ckpt.load_ckpt_folder and ckpt.load_model_only_folder:
logger.warning(
"Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, \
and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
)
ckpt.load_model_only_folder = None
if gpc.is_rank_for_log():
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}")
logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}")
logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}")
logger.info(f"async_upload: {gpc.config.ckpt.async_upload}")
if gpc.config.ckpt.async_upload:
logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}")
logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
logger.info(f"load_given_ckpt: {ckpt.load_given_ckpt}")
# initialization storage manager
init_storage_manager(gpc.config.ckpt)
init_storage_manager(ckpt)
# tensorboard writer config
if "enable_tb" not in gpc.config:
gpc.config._add_item("enable_tb", True)
if "tensorboard_folder" not in gpc.config:
gpc.config._add_item("tensorboard_folder", None)
gpc.config._add_item(
"tensorboard_folder", os.environ["tensorboard_folder"] if "tensorboard_folder" in os.environ else None
)
if "resume_tb_folder" not in gpc.config:
gpc.config._add_item("resume_tb_folder", None)
gpc.config._add_item(
"resume_tb_folder", os.environ["resume_tb_folder"] if "resume_tb_folder" in os.environ else None
)
if gpc.is_rank_for_log():
logger.info(f"tensorboard_folder: {gpc.config.tensorboard_folder}")
logger.info(f"resume_tb_folder: {gpc.config.resume_tb_folder}")
# cudnn
torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
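A hedged example of a `ckpt` config block consistent with the defaults filled in above (field names are taken from the checks in this function; the folder value and frequencies are illustrative only):
```python
# Illustrative ckpt section of a training config; any omitted field falls back
# to the defaults added by args_sanity_check().
ckpt = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",   # hypothetical save location
    checkpoint_every=50,
    oss_snapshot_freq=float("inf"),        # snapshots disabled unless explicitly set
    async_upload=False,                    # file-system saves fall back to sync anyway
    load_given_ckpt=False,                 # allow auto-loading the latest checkpoint
    load_ckpt_folder=None,
    load_model_only_folder=None,
    load_optimizer=True,
    stop_file_path=None,
)
```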
@ -191,10 +225,8 @@ def args_sanity_check():
elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
gpc.config.model.dtype = torch.float16
elif gpc.config.model.dtype == "torch.float32":
assert gpc.config.model.use_flash_attn is False, "when using float32, the use_flash_attn must be False"
gpc.config.model.dtype = torch.float32
elif gpc.config.model.dtype == "torch.tf32":
assert gpc.config.model.use_flash_attn is False, "when using tf32, the use_flash_attn must be False"
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
gpc.config.model.dtype = torch.float32
@ -236,17 +268,32 @@ def args_sanity_check():
# process the model config
if "use_flash_attn" not in gpc.config.model:
gpc.config.model._add_item("use_flash_attn", True)
if "sequence_parallel" not in gpc.config.model:
gpc.config.model._add_item("sequence_parallel", False)
# process the parallel config
if "sequence_parallel" not in gpc.config.parallel:
gpc.config.parallel._add_item("sequence_parallel", False)
else:
assert not (
gpc.config.model.sequence_parallel is True and gpc.config.model.use_flash_attn is False
gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
), "sequence parallel does not support use_flash_attn=False"
# feishu webhook address for alerting
if "alert_address" not in gpc.config:
gpc.config._add_item("alert_address", None)
optim_ckpt = gpc.config.hybrid_zero_optimizer
if "zero_overlap_communication" in optim_ckpt:
# Compatible with the old interfaces.
optim_ckpt._add_item("overlap_sync_grad", optim_ckpt.zero_overlap_communication)
if "overlap_sync_grad" not in optim_ckpt:
optim_ckpt._add_item("overlap_sync_grad", False)
if "overlap_sync_param" not in optim_ckpt:
optim_ckpt._add_item("overlap_sync_param", False)
if gpc.is_rank_for_log():
logger.info(
f"overlap_sync_grad:{optim_ckpt.overlap_sync_grad}, overlap_sync_param:{optim_ckpt.overlap_sync_param}"
)
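A small sketch of the compatibility shim above: an old-style config that only sets `zero_overlap_communication` ends up with the new `overlap_sync_grad`/`overlap_sync_param` keys (a plain dict stands in for the Config object here).
```python
# Old-style optimizer config using the deprecated key.
optim_ckpt = {"zero_overlap_communication": True}

# Equivalent of the compatibility branch in args_sanity_check().
if "zero_overlap_communication" in optim_ckpt:
    optim_ckpt.setdefault("overlap_sync_grad", optim_ckpt["zero_overlap_communication"])
optim_ckpt.setdefault("overlap_sync_grad", False)
optim_ckpt.setdefault("overlap_sync_param", False)

print(optim_ckpt)
# {'zero_overlap_communication': True, 'overlap_sync_grad': True, 'overlap_sync_param': False}
```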
def launch(
config: Union[str, Path, Config, Dict],
@ -293,8 +340,6 @@ def launch(
# init process groups for different parallel modes from config
gpc.init_parallel_groups()
args_sanity_check()
# set cuda device
if torch.cuda.is_available():
# if local rank is not given, calculate automatically
@ -347,7 +392,11 @@ def launch_from_slurm(
)
def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024):
def launch_from_torch(
config: Union[str, Path, Config, Dict],
backend: str = "nccl",
seed: int = 1024,
):
"""A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
@ -375,3 +424,38 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nc
backend=backend,
seed=seed,
)
def initialize_distributed_env(
config: str,
launcher: str = "slurm",
master_port: int = 8888,
seed: int = 1024,
args_check=True,
):
"""
Initialize distributed environment for distributed training.
Args:
config (str): Config file path.
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
master_port (int): The master port for distributed training. 8888 by default.
seed (int, optional): Specified random seed for every process. 1024 by default.
args_check (bool, optional): Whether to run args_sanity_check after initialization. True by default.
"""
torch.cuda.empty_cache()
if launcher == "torch":
launch_from_torch(config=config, seed=seed)
elif launcher == "slurm":
launch_from_slurm(
config=config,
host=get_master_node(),
port=master_port,
seed=seed,
)
else:
assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
if args_check:
args_sanity_check()
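A hedged usage sketch of the new entry point in a training script; the config path is illustrative, and under `launcher="torch"` the rank/world-size environment variables are expected to be set by torchrun.
```python
# Hypothetical train.py entry point.
from internlm.initialize import initialize_distributed_env

if __name__ == "__main__":
    # e.g. torchrun --nproc_per_node=8 train.py        (launcher="torch")
    # or a slurm submission with launcher="slurm" (the default).
    initialize_distributed_env(config="./configs/7B_sft.py", launcher="torch", seed=1024)
```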

View File

@ -7,6 +7,7 @@ import rotary_emb
import torch
import torch.nn.functional as F
from einops import rearrange
from flash_attn.layers.rotary import ApplyRotaryEmb as LegacyApplyRotaryEmb
from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
from torch import Tensor, nn
@ -56,7 +57,7 @@ class Embedding1D(nn.Module):
output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
if gpc.config.model.sequence_parallel:
if gpc.config.parallel.sequence_parallel:
output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
return output
@ -111,6 +112,7 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
legacy_apply_rotary_embed = LegacyApplyRotaryEmb.apply
class RotaryEmbedding(torch.nn.Module):
@ -135,15 +137,13 @@ class RotaryEmbedding(torch.nn.Module):
""" """
super().__init__()
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
self.register_buffer("inv_freq", inv_freq)
self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
self.scale_base = scale_base
scale = (
self.scale = (
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
if scale_base > 0
else None
)
self.register_buffer("scale", scale)
self._seq_len_cached = 0
self._cos_cached = None
@ -218,3 +218,15 @@ class RotaryEmbedding(torch.nn.Module):
self._cos_k_cached[seqlen_offset:],
self._sin_k_cached[seqlen_offset:],
)
def _single_forward(self, x, indexes=0):
assert self.scale is None
self._update_cos_sin_cache(x, indexes)
x = x[None, ...]
ret = legacy_apply_rotary_embed(x, self._cos_cached[indexes], self._sin_cached[indexes]).squeeze(0)
return ret
def _single_eval_forward(self, x, seqlen_offset=0):
assert self.scale is None
self._update_cos_sin_cache(x, seqlen_offset + x.shape[1])
return legacy_apply_rotary_embed(x, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:])

View File

@ -62,7 +62,7 @@ class ScaleColumnParallelLinear(nn.Linear):
weight,
self.bias,
process_group=self.process_group,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
)
@ -111,7 +111,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
weight,
self.bias,
process_group=self.process_group,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
)
@ -173,7 +173,7 @@ class FeedForward(nn.Module):
hidden_features,
process_group,
bias,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)
@ -182,7 +182,7 @@ class FeedForward(nn.Module):
hidden_features,
process_group,
bias,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)
@ -191,7 +191,7 @@ class FeedForward(nn.Module):
out_features,
process_group,
bias=bias,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)

View File

@ -176,7 +176,7 @@ class AccPerplex:
res.update(ds_acc)
res.update(ds_tokens)
loss_res = self.loss_with_type_id.get_metric()
loss_res = self.loss_with_type_id.get_metric(reset)
res.update(loss_res)
return res

View File

@ -121,7 +121,7 @@ class PackedFlashBaseLayer1D(nn.Module):
process_group=gpc.get_group(ParallelMode.TENSOR),
bias1=False,
bias2=False,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
checkpoint_lvl=0,
heuristic="auto",
device=device,
@ -294,7 +294,7 @@ class PackedFlashInternLm1D(nn.Module):
max_position_embeddings=-1,
process_group=gpc.get_group(ParallelMode.TENSOR),
padding_idx=None,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)

View File

@ -82,7 +82,7 @@ class MHA(nn.Module):
3 * embed_dim,
process_group,
bias=True,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
**factory_kwargs,
) # according to https://spaces.ac.cn/archives/9577
@ -95,7 +95,11 @@ class MHA(nn.Module):
# output projection always have the bias (for now)
self.out_proj = RowParallelLinearTorch(
embed_dim, embed_dim, process_group, sequence_parallel=gpc.config.model.sequence_parallel, **factory_kwargs
embed_dim,
embed_dim,
process_group,
sequence_parallel=gpc.config.parallel.sequence_parallel,
**factory_kwargs,
)
# need to assign tp attribute so that internlm know it is tensor parallel module
if gpc.get_world_size(ParallelMode.TENSOR) > 1:
@ -128,6 +132,12 @@ class MHA(nn.Module):
qkv = self.rotary_emb(qkv, **kwargs)
if inference_params is None:
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
if qkv.dtype not in [torch.float16, torch.bfloat16]:
qkv = qkv.to(torch.bfloat16)
context = self.inner_attn(qkv).to(x.dtype)
else:
context = self.inner_attn(qkv)
else:
q = qkv[:, :, 0]
@ -160,7 +170,14 @@ class MHA(nn.Module):
kwargs.pop("indexes")
if inference_params is None:
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
if qkv.dtype not in [torch.float16, torch.bfloat16]:
qkv = qkv.to(torch.bfloat16)
context = self.inner_attn(qkv, **kwargs).to(x.dtype)
else:
context = self.inner_attn(qkv, **kwargs)
else:
raise RuntimeError("Not support this right now")
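The pattern added in both branches above, shown in isolation: flash-attention kernels do not run in fp32, so when the model dtype is float32 the packed qkv is temporarily cast to bfloat16 under autocast and the attention output is cast back to the input dtype. A generic sketch with no InternLM dependencies (on a machine without CUDA the autocast context is simply a no-op):
```python
import torch


def attn_in_bf16(inner_attn, qkv: torch.Tensor) -> torch.Tensor:
    """Run an attention callable in bfloat16 and return the result in qkv's original dtype."""
    orig_dtype = qkv.dtype
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        if qkv.dtype not in (torch.float16, torch.bfloat16):
            qkv = qkv.to(torch.bfloat16)
        context = inner_attn(qkv)
    return context.to(orig_dtype)


# Toy usage with an identity "attention": fp32 in, fp32 out.
out = attn_in_bf16(lambda t: t, torch.randn(2, 8, 3, 4, 16))
print(out.dtype)  # torch.float32
```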

View File

@ -3,6 +3,7 @@
import math
from functools import partial
from itertools import product
import torch
import torch.distributed as dist
@ -19,6 +20,7 @@ from internlm.solver.optimizer.store import (
)
from internlm.solver.optimizer.utils import (
DynamicGradScaler,
ParamBcastSyncHandler,
flatten,
get_grad_accumulate_object,
has_inf_or_nan,
@ -87,9 +89,9 @@ class HybridZeroOptimizer(BaseOptimizer):
self,
optimizer: Optimizer,
cpu_offload=False,
overlap_broadcast=False,
grad_scal_cfg: Config = None,
zero_cfg: Config = None,
param_bcast_sync_handler: ParamBcastSyncHandler = None,
):
# DynamicGradScaler related args
if gpc.config.model.dtype is torch.float32:
@ -104,9 +106,10 @@ class HybridZeroOptimizer(BaseOptimizer):
max_scale = grad_scal_cfg.max_scale
# Zero related args
overlap_communication = zero_cfg.zero_overlap_communication
reduce_bucket_size = zero_cfg.reduce_bucket_size
clip_grad_norm = zero_cfg.clip_grad_norm
self._overlap_sync_grad = zero_cfg.overlap_sync_grad
self._overlap_sync_param = zero_cfg.overlap_sync_param
super().__init__(optim=optimizer)
@ -127,7 +130,7 @@ class HybridZeroOptimizer(BaseOptimizer):
self._fp32_flat_param_groups_of_current_rank = dict()
# communication params
self._overlap_communication = overlap_communication
# self._overlap_communication = overlap_communication
self._reduce_bucket_size = reduce_bucket_size
# gradient scaler
@ -158,7 +161,12 @@ class HybridZeroOptimizer(BaseOptimizer):
+ f"zo-{self._zero_local_rank}.pt"
)
self.params_per_rank_id_dict = []
self.overlap_broadcast = overlap_broadcast
self._param_bcast_sync_handler = param_bcast_sync_handler
if self._overlap_sync_param:
assert self._param_bcast_sync_handler is not None
self._broadcast_comm_stream = torch.cuda.Stream()
else:
self._broadcast_comm_stream = torch.cuda.current_stream()
# iterate over the param group in the optimizer
# partition these param groups for data parallel training
@ -228,12 +236,14 @@ class HybridZeroOptimizer(BaseOptimizer):
# initialize communication stream for
# communication-computation overlapping
if self._overlap_communication:
if self._overlap_sync_grad:
self._comm_stream = torch.cuda.Stream()
else:
self._comm_stream = torch.cuda.current_stream()
# reduction hook is only used if overlapping communication
# if it is stage 1 without overlapping, no hook will be attached
if self._overlap_communication:
if self._overlap_sync_grad:
self._attach_reduction_hook()
@property
@ -267,7 +277,9 @@ class HybridZeroOptimizer(BaseOptimizer):
global_id = str(i)
for j in range(len(param.size())):
global_id = "_".join([global_id, str(param.size()[j])])
if self._overlap_sync_param:
rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param)
else:
rank_to_go = numel_per_rank.index(min(numel_per_rank))
params_per_rank[rank_to_go].append(param)
self.params_per_rank_id_dict[-1][rank_to_go].append(global_id)
@ -299,7 +311,9 @@ class HybridZeroOptimizer(BaseOptimizer):
self._grad_store.add_accumulate_grad_object(accum_grad_obj)
reduction_func = partial(
self._store_and_try_reduce_grads_by_bucket, param=param, reduce_rank=reduce_rank
self._store_and_try_reduce_grads_by_bucket,
param=param,
reduce_rank=reduce_rank,
)
# define hook
@ -384,17 +398,17 @@ class HybridZeroOptimizer(BaseOptimizer):
self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
if self._overlap_communication:
stream = self._comm_stream
stream.synchronize()
if self._overlap_sync_grad:
self._comm_stream.synchronize()
self._param_store.clear_grads_of_previous_reduced_params()
else:
stream = torch.cuda.current_stream()
with torch.cuda.stream(stream):
with torch.cuda.stream(self._comm_stream):
flat = bucket.flatten()
reduced_flat = reduce_tensor(
tensor=flat, dtype=self.dtype, dst_rank=reduce_rank, parallel_mode=ParallelMode.DATA
tensor=flat,
dtype=self.dtype,
dst_rank=reduce_rank,
parallel_mode=ParallelMode.DATA,
)
# update the reduced tensor
@ -483,6 +497,7 @@ class HybridZeroOptimizer(BaseOptimizer):
grads = [self.padding_grad]
params = [self.padding_tensor]
norm = 0
if self._clip_grad_norm > 0:
# this norm is before scaling, it will be very large
norm = compute_norm(
@ -507,7 +522,7 @@ class HybridZeroOptimizer(BaseOptimizer):
# if not overlapping communication (no reduction hook is attached)
# we need to manually reduce these gradients
if not self._overlap_communication:
if not self._overlap_sync_grad:
for group_id in range(len(self._fp16_param_groups)):
for param in self._fp16_param_groups[group_id]:
if param.grad is not None:
@ -522,18 +537,21 @@ class HybridZeroOptimizer(BaseOptimizer):
groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
# clear reduced grads
if self._overlap_communication:
if self._overlap_sync_grad:
# grads in the last bucket is reduced
self._comm_stream.synchronize()
self._param_store.clear_grads_of_previous_reduced_params()
# compute norm for gradients in the last bucket
total_norms = []
total_norms = {}
for group_id in range(self.num_param_groups):
total_norms.append(
self._compute_norm_with_stage(
group_id=group_id, last_bucket=True, last_stage=True, previous_norm=groups_norms[group_id]
)
group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
group_name = f"{group_id}_{group_name}"
total_norms[group_name] = self._compute_norm_with_stage(
group_id=group_id,
last_bucket=True,
last_stage=True,
previous_norm=groups_norms[group_id],
)
timer("sync_grad").start()
@ -552,7 +570,7 @@ class HybridZeroOptimizer(BaseOptimizer):
# found_inf = self._check_overflow()
# Because you may encounter inf when computing norm
if -1 in norms:
if -1 in norms.values():
found_inf = True
loss_scale = float(self.loss_scale.item()) # backup
@ -562,10 +580,13 @@ class HybridZeroOptimizer(BaseOptimizer):
if found_inf:
if gpc.is_rank_for_log():
logger.warning("Overflow occurs, please check it.")
send_alert_message(address=gpc.config.alert_address, message="Overflow occurs, please check it.")
send_alert_message(
address=gpc.config.alert_address,
message="Overflow occurs, please check it.",
)
self._grad_store._averaged_gradients = dict()
self.zero_grad()
return False, None
return False, norms
# copy the grad of fp16 param to fp32 param
single_grad_partition_groups = []
@ -597,15 +618,17 @@ class HybridZeroOptimizer(BaseOptimizer):
# unscale and clip grads
# get the global norm
global_norm_groups = []
global_norm_groups = {}
if self._clip_grad_norm > 0:
for norm in norms:
global_norm_groups.append(norm**0.5)
for group_name, norm in norms.items():
global_norm_groups[group_name] = norm**0.5
# the following operations are performed only on the rank to which parameters are assigned.
if gpc.config.model.dtype is not torch.float32:
if len(single_grad_partition_groups) != 0:
self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale)
if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
self._unscale_and_clip_grads(
single_grad_partition_groups, list(global_norm_groups.values()), loss_scale
)
# update the parameters
timer("step").start()
@ -625,35 +648,42 @@ class HybridZeroOptimizer(BaseOptimizer):
fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
fp16_param.data.copy_(fp32_param)
# TODO: support broadcast overlap
self.broadcast_params(overlap=False)
with torch.cuda.stream(self._broadcast_comm_stream):
self.broadcast_params()
timer("step").stop()
# updating gradients may not be needed here, because the sync_params function is used during initialization,
# so synchronization is already maintained
return True, [global_norm / loss_scale for global_norm in global_norm_groups]
for group_name, global_norm in global_norm_groups.items():
global_norm_groups[group_name] = global_norm / loss_scale
return True, global_norm_groups
def broadcast_params(self, overlap=False):
def broadcast_params(self):
handles = []
for group_id in range(self.num_param_groups):
for rank in range(self._zero_world_size):
for rank, group_id in product(range(self._zero_world_size), range(self.num_param_groups)):
# The following operations are performed only on the rank to which parameters are assigned.
if rank not in self.param_group_no_params_ranks[group_id]:
if rank in self.param_group_no_params_ranks[group_id]:
continue
fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
# grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank
# assert grank == rank, f"{grank} == {rank}"
g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode)[rank]
handle = dist.broadcast(
fp16_param, src=g_rank, group=gpc.get_group(ParallelMode.ZERO1), async_op=True
fp16_param,
src=g_rank,
group=gpc.get_group(ParallelMode.ZERO1),
async_op=True,
)
if self._overlap_sync_param:
self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
else:
handles.append(handle)
if not overlap:
for handle in handles:
handle.wait()
else:
return handles
##################
# FP16 Utilities #
@ -671,7 +701,11 @@ class HybridZeroOptimizer(BaseOptimizer):
if avg_grad is not None and has_inf_or_nan(avg_grad):
self._found_overflow.fill_(1.0)
break
dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.GLOBAL))
dist.all_reduce(
self._found_overflow,
op=dist.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.GLOBAL),
)
return self._found_overflow.item() > 0

View File

@ -3,15 +3,18 @@
import math
from abc import ABC, abstractmethod
from typing import Dict, Optional
from collections import OrderedDict
from functools import partial
from typing import Any, Dict, Optional, Union
import torch
import torch.distributed as dist
from torch import Tensor
from torch import Tensor, nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.naive_amp import NaiveAMPModel
from internlm.utils.common import get_tensor_norm, move_norm_to_cuda
from internlm.utils.logger import get_logger
from internlm.utils.parallel import is_model_parallel_parameter
@ -60,12 +63,19 @@ def get_grad_accumulate_object(tensor):
def split_half_float_double(tensor_list):
dtypes = ["torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor", "torch.cuda.BFloat16Tensor"]
buckets = []
for _, dtype in enumerate(dtypes):
bucket = [t for t in tensor_list if t.type() == dtype]
if bucket:
buckets.append(bucket)
dtype_buckets = {
"torch.cuda.HalfTensor": [],
"torch.cuda.FloatTensor": [],
"torch.cuda.DoubleTensor": [],
"torch.cuda.BFloat16Tensor": [],
}
for t in tensor_list:
dtype = t.type()
if dtype in dtype_buckets:
dtype_buckets[dtype].append(t)
buckets = [bucket for bucket in dtype_buckets.values() if bucket]
return buckets
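A quick illustration of the rewritten bucketing (the bucket order now follows the fixed dtype list rather than order of first appearance). This needs a CUDA device, since `Tensor.type()` must yield the `torch.cuda.*Tensor` strings used as keys above; the import path is assumed to match this diff.
```python
import torch

from internlm.solver.optimizer.utils import split_half_float_double

grads = [
    torch.zeros(2, dtype=torch.float16, device="cuda"),
    torch.zeros(2, dtype=torch.float32, device="cuda"),
    torch.zeros(2, dtype=torch.float16, device="cuda"),
]
buckets = split_half_float_double(grads)
# -> [[fp16 grad, fp16 grad], [fp32 grad]]: two buckets, empty dtypes are skipped.
```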
@ -184,7 +194,10 @@ def calc_l2_norm(grads):
if APEX_AVAILABLE:
dummy_overflow_buf = torch.cuda.IntTensor([0])
norm, _ = multi_tensor_applier(
amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads], False # no per-parameter norm
amp_C.multi_tensor_l2norm,
dummy_overflow_buf,
[grads],
False, # no per-parameter norm
)
else:
norm, _ = multi_tensor_l2norm_torch(grads, False)
@ -228,7 +241,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
# Take max across all model-parallel GPUs.
if gpc.get_world_size(ParallelMode.MODEL) > 1:
dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.MODEL))
dist.all_reduce(
total_norm_cuda,
op=dist.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.MODEL),
)
total_norm = total_norm_cuda[0].item()
else:
tensor_parallel_grads = []
@ -280,7 +297,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
# Sum across all model-parallel GPUs.
if gpc.is_initialized(ParallelMode.MODEL):
dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.MODEL))
dist.all_reduce(
total_norm,
op=dist.ReduceOp.SUM,
group=gpc.get_group(ParallelMode.MODEL),
)
# This is because we use zero1, so we need to use this reduction.
# TODO: Check zero group to be a subset of dp group.
@ -459,3 +480,90 @@ class DynamicGradScaler(BaseGradScaler):
self._scale = self._scale.fill_(state_dict["_scale"])
self._growth_step = state_dict["_growth_step"]
self._hysteresis_step = state_dict["_hysteresis_step"]
class ParamBcastSyncHandler:
"""
Model partition handler that overlaps parameter broadcast with the forward pass.
"""
def __init__(self, model: Union[nn.Module, nn.ModuleList]) -> None:
self._block_to_param = OrderedDict() # <key: nn.Module> <value: list(param)>
self._param_to_rank = dict() # <key: param> <value: rank)>
self._block_to_rank = dict() # <key: nn.Module> <value: rank)>
self._bcast_handles = dict() # <key: rank> <value: list(bcast handles))>
zero1_size = gpc.get_world_size(ParallelMode.ZERO1)
total_param_num = sum(p.numel() for p in model.parameters())
avg_param_num = total_param_num * 1.0 // zero1_size
# just want to share same for loop for ModuleList and Module
if not isinstance(model, nn.ModuleList):
model = [model]
# record the parameters of each transformer/embedding/head/norm block
for _chunk in model:
if isinstance(_chunk, NaiveAMPModel):
_chunk = _chunk.model
for _, children in _chunk.named_children():
# should be the transformer block definition in modeling_xxx.py
if isinstance(children, nn.ModuleList):
# record the block that a parameter belongs to
for _, block in enumerate(children):
# self._block_to_param[f"{name}.{idx}"] = list(block.parameters())
self._block_to_param[block] = list(block.parameters())
else:
# record the block that a parameter belongs to
# self._block_to_param[name] = list(children.parameters())
self._block_to_param[children] = list(children.parameters())
alloc_num = 0
rank_to_go = 0
# process the parameters in block_to_param sequentially,
# allocate each parameter to a local rank of ParallelMode.ZERO1,
# NOTE that we do NOT consider the following scenarios:
# 1) whether a parameter is trainable;
# 2) parameters may belong to different optimizer groups
for block, params in self._block_to_param.items():
# allocate a model block to a local rank of ParallelMode.ZERO1
self._block_to_rank[block] = [rank_to_go]
for p in params:
alloc_num = alloc_num + p.numel()
# in this case, allocate the param to next rank if possible
if alloc_num > avg_param_num * 1.01 and rank_to_go < zero1_size - 1:
rank_to_go = rank_to_go + 1
alloc_num = 0
self._block_to_rank[block].append(rank_to_go)
# allocate a parameter to a local rank of ParallelMode.ZERO1
self._param_to_rank[p] = rank_to_go
# initialize an empty list for _bcast_handles of each rank
for rank in range(gpc.get_world_size(ParallelMode.ZERO1)):
self._bcast_handles[rank] = []
# register_forward_pre_hook for transformer/embedding/norm/xxx block
self._register_sync_parameters_hook()
def _register_sync_parameters_hook(self) -> None:
def _pre_forward_hook(model: nn.Module, inputs: Any): # pylint: disable=W0613
bcast_handles = []
# gather all required broadcast handles into a list
for rank in self._block_to_rank[model]:
bcast_handles.extend(self._bcast_handles[rank])
# need to clear _bcast_handles since they would be processed later
self._bcast_handles[rank] = []
# wait all required broadcast handles to be completed
for handle in bcast_handles:
handle.wait()
# register_forward_pre_hook for transformer/embedding/norm/xxx block
for block, _ in self._block_to_rank.items():
block.register_forward_pre_hook(partial(_pre_forward_hook))
def get_rank_by_param(self, param) -> int:
return self._param_to_rank[param]
def add_bcast_handle(self, rank, handle) -> None:
self._bcast_handles[rank].append(handle)
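A toy, self-contained version of the pre-forward-hook mechanism this handler relies on: each block waits on "its" pending broadcast handles right before it runs, so parameter broadcasts launched after the optimizer step can overlap with earlier blocks' forward computation. Real handles are `torch.distributed` async work objects; here the pending lists are simply left empty.
```python
from functools import partial

from torch import nn

pending_handles = {0: [], 1: []}  # rank -> list of async broadcast work handles


def _pre_forward_hook(rank: int, module: nn.Module, inputs):  # pylint: disable=W0613
    for handle in pending_handles[rank]:
        handle.wait()  # block until this rank's broadcasts have landed
    pending_handles[rank].clear()


blocks = nn.ModuleList([nn.Linear(4, 4), nn.Linear(4, 4)])
for rank, block in enumerate(blocks):
    block.register_forward_pre_hook(partial(_pre_forward_hook, rank))
```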

View File

@ -0,0 +1,19 @@
from .training_internlm import (
get_train_data_loader,
get_validation_data_loader,
initialize_llm_profile,
initialize_model,
initialize_optimizer,
load_new_batch,
record_current_batch_training_metrics,
)
__all__ = [
"get_train_data_loader",
"get_validation_data_loader",
"initialize_llm_profile",
"initialize_model",
"initialize_optimizer",
"load_new_batch",
"record_current_batch_training_metrics",
]

View File

@ -0,0 +1,422 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import time
from functools import partial
from typing import Callable, Iterable, Union
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import ConcatDataset, DataLoader
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.naive_amp import NaiveAMPModel
from internlm.core.trainer import TrainState
from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
from internlm.data.dataset import get_dataset_dict
from internlm.data.dummy_dataset import RandomDataset
from internlm.data.packed_dataset import (
PackedDataset,
PackedDatasetWithoutCuSeqlen,
get_packed_dataset_without_short_length,
)
from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
from internlm.monitor import set_env_var
from internlm.monitor.monitor import monitor_manager as mm
from internlm.solver.beta2_scheduler import Beta2Scheduler
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.solver.optimizer.utils import ParamBcastSyncHandler
from internlm.utils.common import DummyProfile
from internlm.utils.logger import get_logger
from internlm.utils.megatron_timers import megatron_timer as timer
from internlm.utils.parallel import (
is_no_pp_or_last_stage,
sync_model_param,
sync_model_param_within_tp,
)
from internlm.utils.registry import MODEL_INITIALIZER
logger = get_logger(__file__)
def initialize_model():
"""
Initialize model.
Returns: The neural network model to be trained or evaluated.
"""
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
if isinstance(model, nn.ModuleList):
model = nn.ModuleList(
[
NaiveAMPModel(
model=_m,
output_to_fp32=False,  # manually controlled by the interleaved pipeline scheduler
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
for _m in model
]
)
else:
model = NaiveAMPModel(
model=model,
output_to_fp32=is_no_pp_or_last_stage(),
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
# This sync is very important because the model weights kept by the optimizer are copied
# from the original parameters in memory, so we must make sure the data-parallel sync
# does not make the optimizer's model weights differ from the original parameters.
sync_model_param(model, parallel_mode=ParallelMode.DATA)
# This function is needed to make sure that parameters not split by tensor parallelism
# stay identical across all tensor-parallel ranks.
sync_model_param_within_tp(model)
return model
def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
"""
Initialize optimizer.
Args:
model (torch.nn.Module): Your model instance to be trained or evaluated.
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
"""
if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
param_bcast_sync_handler = ParamBcastSyncHandler(model)
else:
param_bcast_sync_handler = None
adam_cfg = gpc.config.adam
naive_optimizer = torch.optim.AdamW(
params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
lr=adam_cfg.lr,
betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
eps=adam_cfg.adam_eps,
)
optimizer = HybridZeroOptimizer(
naive_optimizer,
grad_scal_cfg=gpc.config.grad_scaler,
zero_cfg=gpc.config.hybrid_zero_optimizer,
param_bcast_sync_handler=param_bcast_sync_handler,
)
beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
return optimizer, beta2_scheduler, lr_scheduler
def get_train_data_loader(
num_worker: int = 0, dataset_generate_func: Callable = None, train_sampler=None, train_collate_fn=None
):
"""
Generate and return the training data loader.
Returns: A tuple of (train_dl, dataset_types).
"""
# Get the dataset types
dataset_types = None
dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
data_cfg = gpc.config.data
# Get the sample weight dictionary
train_folder = data_cfg.train_folder
if not train_folder:
train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
if data_cfg.pack_sample_into_one:
train_ds = PackedDatasetWithoutCuSeqlen(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
train_ds = PackedDataset(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
if dataset_generate_func is not None:
train_ds = dataset_generate_func()
else:
train_ds = get_packed_dataset_without_short_length(
folder=data_cfg.train_folder,
packed_length=data_cfg.packed_length,
max_length_per_sample=data_cfg.seq_len,
show_progress=dist.get_rank() == 0,
min_length=data_cfg.min_length,
min_length_dict=data_cfg.get("min_length_dict", {}),
pack_into_one_sample=data_cfg.pack_sample_into_one,
)
if dataset_generate_func is None or not train_folder:
# partition already completed
assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen, ConcatDataset))
# Create the training dataset sampler
train_sampler = StaticBatchSampler(
train_ds.datasets if isinstance(train_ds, ConcatDataset) else [train_ds],
batch_size=data_cfg.micro_num,
rampup_batch_size=data_cfg.rampup_batch_size,
micro_bsz=data_cfg.micro_bsz,
seed=1024,
drop_last=True,
data_rank=gpc.get_local_rank(ParallelMode.DATA),
data_world_size=gpc.get_world_size(ParallelMode.DATA),
)
if dataset_generate_func is None or not train_folder:
train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
# Create the training data loader
train_dl = DataLoader(
dataset=train_ds,
batch_sampler=train_sampler,
num_workers=num_worker,
pin_memory=True,
collate_fn=train_collate_fn,
persistent_workers=num_worker > 0,
)
return train_dl, dataset_types
def get_validation_data_loader(
num_worker: int = 0, dataset_generate_func: Callable = None, val_collate_fn=None, dataloader_func=None
):
"""Generate and return the validation data loader."""
data_cfg = gpc.config.data
if not data_cfg.valid_folder:
val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
else:
if dataset_generate_func is not None:
assert val_collate_fn and dataloader_func is not None
val_ds = dataset_generate_func()
else:
val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
if not isinstance(val_ds, dict):
val_ds = {"val": val_ds}
if val_collate_fn is None or not data_cfg.valid_folder:
val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
val_dls = {}
for val_name, ds in val_ds.items():
if dataloader_func and data_cfg.valid_folder is not None:
val_dls[val_name] = dataloader_func(dataset=ds, collate_fn=val_collate_fn)
if gpc.is_rank_for_log():
logger.info(
f"load validation dataset {val_name} with valid batch size {str(data_cfg.valid_micro_num)} and "
f"{ds.size} Byte samples."
)
else:
# using a larger validation batch size can speed up the evaluation, but it should not be too large,
# otherwise too much data may be dropped
batch_size = min(
data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
)
batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
if batch_size == 0 and gpc.is_rank_for_log():
logger.info(f"skip validate {val_name}.")
continue
val_dls[val_name] = get_dpsampler_dataloader(
ds,
shuffle=False,
num_workers=num_worker,
batch_size=batch_size,
collate_fn=val_collate_fn,
drop_last=True,
) # drop_last=True, otherwise it may cause problems in the last batch
if gpc.is_rank_for_log():
logger.info(
f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
f"samples {str(len(val_dls[val_name]))}."
)
return val_dls
def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
"""
Load and return the new batch data based on training data loader.
Args:
train_dl (torch.utils.data.DataLoader): Dataloader for training.
train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
train_state (TrainState): Current training state.
Returns: A batch data and the updated train_iter.
"""
timer("batch-gen").start()
try:
batch = next(train_iter) # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
if hasattr(train_state, "batch_sampler_iter"):
next(train_state.batch_sampler_iter)
except StopIteration:
train_iter = iter(train_dl)
batch = next(train_iter)
train_state.num_consumed_samples_in_epoch = 0
if hasattr(train_state, "batch_sampler"):
train_state.batch_sampler_iter = iter(train_state.batch_sampler)
next(train_state.batch_sampler_iter)
timer("batch-gen").stop()
if batch[0].get("type_ids", None) is not None:
# if use_flash_attn is False, we need to unpack type_ids
if not gpc.config.model.use_flash_attn:
batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
return batch, train_iter
def initialize_llm_profile(profiling: bool = False, start_time: str = None):
"""Initialize and return the profiler context manager instance."""
if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
llm_profile = torch.profiler.profile
logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
else:
llm_profile = DummyProfile
return llm_profile(
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler(
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
),
with_stack=True,
with_modules=True,
)
def record_current_batch_training_metrics(
get_tflops_func,
logger,
writer,
success_update,
batch_count,
batch,
train_state,
optimizer,
beta2_scheduler,
trainer,
start_time,
loss,
grad_norm,
metric,
update_panel,
):
"""
Print some training metrics of current batch.
"""
set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
if success_update in (0, True):
train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
if is_no_pp_or_last_stage():
acc_perplex = metric.get_metric()
if success_update and gpc.is_rank_for_log():
lr = optimizer.param_groups[0]["lr"]
if hasattr(trainer.engine.optimizer, "grad_scaler"):
scaler = trainer.engine.optimizer.grad_scaler._scale.item()
elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
num_tokens_in_batch = batch[1].nelement()
num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
tk_per_gpu = 0
tk_per_gpu = round(
num_tokens_in_batch
* gpc.get_world_size(ParallelMode.DATA)
/ gpc.get_world_size(ParallelMode.GLOBAL)
/ (time.time() - start_time),
2,
)
tflops = get_tflops_func((time.time() - start_time))
infos = {
"tflops": tflops,
"step": batch_count,
"loss": loss.item(),
"tgs (tokens/gpu/second)": tk_per_gpu,
"lr": lr,
"loss_scale": scaler,
"grad_norm": grad_norm,
}
infos["micro_num"] = len(batch[1])
infos["num_consumed_tokens"] = train_state.num_consumed_tokens
infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
infos["num_samples_in_batch"] = num_samples_in_batch # the number of batches which have the most samples
infos["largest_length"] = max_length_in_batch # the longest input
infos["largest_batch"] = max_samples_in_batch # the batch with the most samples
infos["smallest_batch"] = min_samples_in_batch
infos["adam_beta2"] = beta2_scheduler.get_beta2()
fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
infos["fwd_bwd_time"] = fwd_bwd_time
for key, value in acc_perplex.items():
infos[key] = value
line = ""
for key, value in infos.items():
line += f"{key}={value} "
if isinstance(value, dict):
writer.add_scalars(key=key, value=value, step=train_state.step_count)
else:
writer.add_scalar(key=key, value=value, step=train_state.step_count)
if update_panel:
# metrics shown with dashboard panels
panel_metrics = {
"step": batch_count,
"lr": lr,
"num_consumed_tokens": train_state.num_consumed_tokens,
"loss": loss.item(),
"flops": tflops,
"tgs": tk_per_gpu,
"acc": acc_perplex["acc"],
"perplexity": acc_perplex["perplexity"],
"fwd_bwd_time": fwd_bwd_time,
}
for norm_key, norm_value in grad_norm.items():
panel_metrics[norm_key] = norm_value
logger.info(
"{line}",
line=line,
extra=panel_metrics,
)
else:
logger.info(line)
# if loss spike occurs, send alert info to feishu
mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
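As a quick, illustrative sanity check of the tgs arithmetic above (made-up numbers, not measured values):

# Illustrative only: 8 data-parallel ranks, 64 GPUs in total, 2 s per step.
num_tokens_in_batch = 16 * 4 * 2048          # micro_num * micro_bsz * seq_len = 131072 tokens
dp_world_size, global_world_size = 8, 64
step_time = 2.0
tgs = round(num_tokens_in_batch * dp_world_size / global_world_size / step_time, 2)
print(tgs)                                   # 8192.0 tokens per GPU per second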

View File

@ -52,12 +52,12 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape
@contextmanager
def switch_sequence_parallel_mode():
prev_mode = gpc.config.model.sequence_parallel
prev_mode = gpc.config.parallel.sequence_parallel
try:
gpc.config.model.sequence_parallel = False
gpc.config.parallel.sequence_parallel = False
yield
finally:
gpc.config.model.sequence_parallel = prev_mode
gpc.config.parallel.sequence_parallel = prev_mode
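The context manager above temporarily disables sequence parallelism for evaluation and restores the previous value afterwards, even if evaluation raises. A self-contained sketch of the same save-toggle-restore idiom, using a plain dict in place of gpc.config.parallel:

from contextlib import contextmanager

config = {"sequence_parallel": True}  # stand-in for gpc.config.parallel

@contextmanager
def switch_off(flag_store, key):
    prev = flag_store[key]
    flag_store[key] = False
    try:
        yield
    finally:
        flag_store[key] = prev  # restored on both normal exit and exceptions

with switch_off(config, "sequence_parallel"):
    assert config["sequence_parallel"] is False
assert config["sequence_parallel"] is True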
def evaluate_on_val_dls(
@ -67,6 +67,7 @@ def evaluate_on_val_dls(
logger,
step_count,
update_panel: bool = False,
streaming: bool = False,
):
with switch_sequence_parallel_mode():
torch.cuda.empty_cache()
@ -75,7 +76,7 @@ def evaluate_on_val_dls(
data_cfg = gpc.config.data
for val_name, val_dl in val_dls.items():
if len(val_dl) == 0 and verbose:
if not streaming and len(val_dl) == 0 and verbose:
logger.info(f"Validation dataset: {val_name} is empty")
continue
@ -91,7 +92,7 @@ def evaluate_on_val_dls(
for val_idx, batch in tqdm(
enumerate(val_dl),
desc="Val.",
total=len(val_dl),
total=len(val_dl) if not streaming else None,
position=1,
disable=not verbose,
leave=False,
@ -135,7 +136,7 @@ def evaluate_on_val_dls(
dist.barrier()
val_res = val_metric.get_metric()
if verbose and len(val_dl) != 0:
if verbose and (streaming or len(val_dl) != 0):
val_loss = val_loss / (val_idx + 1 + 1e-6)
infos = {
"step": step_count,

163
internlm/utils/gputest.py Normal file
View File

@ -0,0 +1,163 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import math
import socket
import torch
import torch.distributed as dist
from flash_attn.modules.mha import FlashSelfAttention, SelfAttention
from torch.utils import benchmark
from internlm.utils.logger import get_logger
try:
import GPUtil
import psutil
except ImportError:
GPUtil, psutil = None, None
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.utils.common import get_current_device
logger = get_logger(__file__)
def benchmark_forward(
test_fn,
*inputs,
repeats=100,
amp=True,
amp_dtype=torch.float16,
**kwinputs,
):
"""Use Pytorch Benchmark on the forward pass of an arbitrary function."""
def amp_wrapper(*inputs, **kwinputs):
with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
test_fn(*inputs, **kwinputs)
bench_timer = benchmark.Timer(
stmt="test_fn_amp(*inputs, **kwinputs)",
globals={"test_fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
num_threads=torch.get_num_threads(),
)
used_time = bench_timer.timeit(repeats)
return used_time.mean
def flops(batch, seqlen, headdim, nheads, time_f):
"""Compute the flops value of a GPU with give flashattention function"""
flop = 4 * batch * seqlen**2 * nheads * headdim
return (flop / time_f / 10**12) if not math.isnan(time_f) else 0.0
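A hedged back-of-the-envelope check of the attention-FLOPs formula above, with shapes matching the bench_gpu defaults below and a made-up forward time:

batch, seqlen, nheads, headdim = 2, 1024, 32, 64
time_f = 1e-4                                     # hypothetical 0.1 ms forward pass
flop = 4 * batch * seqlen**2 * nheads * headdim   # ~1.72e10 FLOPs
print(round(flop / time_f / 10**12, 1))           # ~171.8 TFLOPS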
def get_gpu_temperature():
"""Get current GPU temperature."""
try:
gpu_id = torch.cuda.current_device()
except AssertionError:
gpu_id = -1
if GPUtil is not None and gpu_id >= 0:
gpus = GPUtil.getGPUs()
gpu_temperature = gpus[gpu_id].temperature
else:
gpu_temperature = -1
return gpu_temperature
def get_cpu_temperature():
"""Get current CPU temperature."""
if psutil is not None:
cpu_temperature = psutil.sensors_temperatures()["coretemp"][0].current
else:
cpu_temperature = -1
return cpu_temperature
def bench_net():
"""Benchmark nccl performance for slow node detection."""
if gpc.get_world_size(ParallelMode.GLOBAL) <= 1:
return
if gpc.is_rank_for_log():
logger.info("benchmarking network speed ...")
repeats = 100
input_data = torch.randn(
8 * 1024 * 1024,
device=get_current_device(),
dtype=torch.bfloat16,
)
def allreduce_fn(inputs):
dist.all_reduce(inputs, op=torch.distributed.ReduceOp.AVG, group=gpc.get_group(ParallelMode.NETTEST))
bench_timer = benchmark.Timer(
stmt="test_fn_amp(inputs)",
globals={"test_fn_amp": allreduce_fn, "inputs": input_data},
num_threads=torch.get_num_threads(),
)
allreduce_time = bench_timer.timeit(repeats).mean
allreduce_time = allreduce_time * 10**3
allreduce_time_this = allreduce_time
allreduce_time = torch.Tensor([allreduce_time]).to(device=get_current_device())
dist.all_reduce(allreduce_time, group=gpc.get_group(ParallelMode.GLOBAL))
allreduce_time_avg = allreduce_time / gpc.get_world_size(ParallelMode.GLOBAL)
allreduce_time_avg = float(allreduce_time_avg.item())
if allreduce_time_this >= allreduce_time_avg * 1.05:
logger.warning(
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} NCCL test is slower than avg, "
f"Hostname {socket.gethostname()}, "
f"allreduce_time {allreduce_time_this:.2f}, avg {allreduce_time_avg:.2f}, "
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
)
def bench_gpu(use_flash_attn=True):
"""Benchmark single GPU performance for slow node detection."""
if gpc.is_rank_for_log():
logger.info("benchmarking gpu speed ...")
headdim = 64
dim = 2048
batch_size, seqlen = 2, 1024
nheads = dim // headdim
inner_attn = FlashSelfAttention if use_flash_attn else SelfAttention
inner_attn = inner_attn(causal=True, softmax_scale=None, attention_dropout=0)
qkv = torch.randn(
batch_size,
seqlen,
3,
dim // headdim,
headdim,
device=get_current_device(),
dtype=torch.float16,
requires_grad=True,
)
time_f = benchmark_forward(inner_attn, qkv)
speed = flops(batch_size, seqlen, headdim, nheads, time_f)
speed_this = speed
speed = torch.Tensor([speed]).to(device=get_current_device())
dist.all_reduce(speed, group=gpc.get_group(ParallelMode.GLOBAL))
speed_avg = speed / gpc.get_world_size(ParallelMode.GLOBAL)
speed_avg = float(speed_avg.item())
if speed_this <= speed_avg * 0.95:
logger.warning(
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} GPU is slower than avg, "
f"Hostname {socket.gethostname()}, "
f"tflops {speed_this:.2f}, avg {speed_avg:.2f}, "
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
)

View File

@ -14,18 +14,19 @@ class _Timer:
self.elapsed_ = 0.0
self.started_ = False
self.start_time = time.time()
self.stream = torch.cuda.current_stream()
def start(self):
"""Start the timer."""
assert not self.started_, "timer has already been started"
torch.cuda.synchronize()
self.stream.synchronize()
self.start_time = time.time()
self.started_ = True
def stop(self):
"""Stop the timer."""
assert self.started_, "timer is not started"
torch.cuda.synchronize()
self.stream.synchronize()
self.elapsed_ += time.time() - self.start_time
self.started_ = False
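Synchronizing only the stream captured at construction (instead of the whole device) keeps the timer from blocking on unrelated CUDA streams. A hedged usage sketch of the shared megatron-style timer, assuming a CUDA-capable environment:

from internlm.utils.megatron_timers import megatron_timer as timer

timer("fwd-bwd").start()
# ... run forward and backward on the current CUDA stream ...
timer("fwd-bwd").stop()
elapsed_seconds = timer("fwd-bwd").elapsed()  # wall-clock time measured by this timer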

View File

@ -2,7 +2,9 @@
# -*- encoding: utf-8 -*-
import copy
import fcntl
import os
import socket
import time
from enum import Enum
from typing import Dict
@ -12,6 +14,7 @@ import torch
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.trainer import TrainState
from internlm.monitor import send_alert_message
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.utils.common import get_current_device
from internlm.utils.logger import get_logger
@ -25,8 +28,6 @@ from internlm.utils.storage_manager import (
logger = get_logger(__file__)
quit_signal_handler = None
class CheckpointType(Enum):
NORMAL_CHECKPOINT = 1
@ -167,44 +168,6 @@ def save_optimizer_checkpoint(optim, state_path):
llm_save(os.path.join(state_path, fp), states)
def save_checkpoint(folder, model, optimizer, scheduler, train_state: TrainState, model_config: Dict = None):
"""
Save checkpoint to the given folder path.
"""
start = time.time()
torch.distributed.barrier()
folder = os.path.join(folder, str(train_state.step_count))
logger.info(
f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count} from rank:{gpc.get_global_rank()}..."
)
timer("save-model").start()
save_model_checkpoint(folder=folder, model=model)
timer("save-model").stop()
timer("save-optimizer").start()
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
timer("save-optimizer").stop()
if gpc.is_rank_for_log():
scheduler_states = scheduler.state_dict()
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
sampler_state = train_state.batch_sampler.state_dict()
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
if model_config is not None:
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
torch.distributed.barrier()
if gpc.is_rank_for_log():
timer.log(["save-model", "save-optimizer"], logger=logger)
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
def load_optimizer_checkpoint(folder, optim):
"""Load the optimizer state from the local file system or remote
object storage Service (OSS).
@ -304,19 +267,12 @@ def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, learning_rate, train
logger.info(f"reload load_scheduler:{lr_scheduler}")
class CheckpointSaveManager:
class CheckpointManager:
"""StorageManagerContext"""
def __init__(
self,
ckpt_config,
model,
optimizer,
lr_scheduler,
model_config,
) -> None:
def __init__(self, ckpt_config, model, model_config=None, model_config_file=None, feishu_address=None) -> None:
"""
CheckpointSaveManager is used to decide when to store ckpt. If it is an asynchronous
CheckpointManager is used to decide when to store ckpt. If it is an asynchronous
upload mode, you must call wait_async_upload_finish at the end of the program to wait
for the asynchronous ckpt upload to complete.
@ -332,26 +288,96 @@ class CheckpointSaveManager:
self.save_ckpt_folder = ckpt_config.save_ckpt_folder
self.snapshot_ckpt_folder = ckpt_config.snapshot_ckpt_folder
self.oss_snapshot_freq: int = ckpt_config.oss_snapshot_freq
self.stop_file_path = ckpt_config.stop_file_path
self.load_model_only_folder = ckpt_config.load_model_only_folder
self.feishu_address = feishu_address
self.storage_manager = get_storage_manager()
self.snapshot_counter = 0
self.load_optimizer = gpc.config.ckpt.load_optimizer
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
self.model_config = model_config
self.model_config_file = model_config_file
if self.stop_file_path and gpc.get_global_rank() == 0:
dir_path = os.path.dirname(self.stop_file_path)
if dir_path != "" and not os.path.exists(dir_path):
os.makedirs(dir_path)
with open(self.stop_file_path, "w", encoding="utf-8") as f:
f.write("0")
if ckpt_config.load_given_ckpt is False:
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
latest_ckpt_path = self.query_lastest_ckpt()
if latest_ckpt_path:
self.load_ckpt_folder = latest_ckpt_path
else:
# At this time, we have to load model init weights and train from step 0.
self.load_ckpt_folder = self.load_model_only_folder
else:
self.load_ckpt_folder = ckpt_config.load_ckpt_folder
if gpc.is_rank_for_log():
logger.info(f"load_ckpt_folder will set to :'{self.load_ckpt_folder}'")
if self.stop_file_path is None:
logger.warning("no set stop_file_path, quit_signal_handler is disable")
def quit_signal_handler(self, train_state) -> bool:
"""
Exit signal detection function. If we write the exit step into the 'QUIT_FILE_PATH' file,
all ranks will save ckpt and exit.
A negative integer step means save ckpt only.
A positive integer step means save ckpt and quit.
Args:
train_state (TrainState):
Returns:
bool: whether to quit.
"""
now_break, now_save_ckpt, save_type = False, False, CheckpointType.NORMAL_CHECKPOINT
if self.stop_file_path is None:
return now_break, now_save_ckpt, save_type
with open(self.stop_file_path, "a+", encoding="utf-8") as f:
fcntl.flock(f, fcntl.LOCK_EX)
f.seek(0)
msg = f.read()
fcntl.flock(f, fcntl.LOCK_UN)
action_step = int(msg)
if action_step < 0 and abs(action_step) == train_state.step_count:
now_save_ckpt = True
if action_step > 0 and action_step == train_state.step_count:
now_break, now_save_ckpt = True, True
if action_step != 0 and gpc.is_rank_for_log():
msg = "Stop" if action_step > 0 else "Save"
action_step = abs(action_step)
if train_state.step_count <= action_step:
if self.feishu_address:
send_alert_message(
address=self.feishu_address,
message=f"training will {msg} at step_count {action_step}!\
now step_count is {train_state.step_count}",
)
return now_break, now_save_ckpt, save_type
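A minimal sketch of driving this stop-file protocol from outside the job (hypothetical path, following the sign convention documented above):

# Ask all ranks to save a checkpoint at step 1000 and keep training:
with open("/path/to/llm_quit_file", "w", encoding="utf-8") as f:
    f.write("-1000")

# Ask all ranks to save a checkpoint at step 2000 and then exit:
with open("/path/to/llm_quit_file", "w", encoding="utf-8") as f:
    f.write("2000")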
def try_save_checkpoint(self, train_state):
if not self.enable_save_ckpt:
return
return False
save_ckpts, save_type = False, CheckpointType.NORMAL_CHECKPOINT
if self.oss_snapshot_freq > 1 and train_state.step_count % self.oss_snapshot_freq == 0:
save_ckpts, save_type = True, CheckpointType.SNAPSHOT_CHECKPOINT
if train_state.step_count % self.checkpoint_every == 0:
save_ckpts, save_type = True, CheckpointType.NORMAL_CHECKPOINT
now_break, singal_save_ckpts, singal_save_type = self.quit_signal_handler(train_state)
if save_ckpts is False:
if quit_signal_handler is not None:
save_ckpts, save_type = quit_signal_handler(train_state)
save_ckpts = singal_save_ckpts
save_type = singal_save_type
if save_ckpts:
# Wait for the previous round of asynchronous upload storage to complete.
@ -361,18 +387,247 @@ class CheckpointSaveManager:
self.snapshot_counter = (self.snapshot_counter + 1) % 2
save_ckpt_folder = os.path.join(self.snapshot_ckpt_folder, f"{self.snapshot_counter}")
else:
save_ckpt_folder = self.save_ckpt_folder
save_ckpt_folder = os.path.join(self.save_ckpt_folder, str(train_state.step_count))
save_checkpoint(
self.save_checkpoint(
folder=save_ckpt_folder,
model=self.model,
optimizer=self.optimizer,
scheduler=self.lr_scheduler,
train_state=train_state,
model_config=self.model_config,
model_config_file=self.model_config_file,
)
return now_break
def wait_async_upload_finish(self):
"""wait for all checkpoint uploads to be completed"""
self.storage_manager.wait()
torch.distributed.barrier()
def query_latest_snapshot_step_boto3(self):
"""query_latest_snapshot_step_boto3
Returns:
Tuple(str, int): path and step of the latest ckpt; if none is found, (None, None) is returned.
"""
ckpt_list = self.storage_manager.get_fns(self.save_ckpt_folder)
if len(ckpt_list) == 0:
return None, None
max_normal_step = 0
ckpt_list = list(map(lambda a: int(a.strip("/")) if a.strip("/").isdigit() else 0, ckpt_list))
ckpt_list.sort(reverse=True)
for ckpt in ckpt_list:
fns_list = self.storage_manager.get_fns(os.path.join(self.save_ckpt_folder, str(ckpt)))
for fn in fns_list:
if fn.endswith(".step"):
max_normal_step = ckpt
break
if max_normal_step != 0:
break
max_normal_step = ckpt_list[0]
load_normal_ckpt_path = os.path.join(self.save_ckpt_folder, str(max_normal_step))
snapshot_path_0 = os.path.join(self.save_ckpt_folder, "snapshot", "0")
snapshot_path_1 = os.path.join(self.save_ckpt_folder, "snapshot", "1")
ckpt_list_1 = self.storage_manager.get_fns(snapshot_path_0)
ckpt_list_2 = self.storage_manager.get_fns(snapshot_path_1)
max_step_0, max_step_1 = 0, 0
for ckpt in ckpt_list_1:
ckpt = ckpt.strip("/")
if ckpt.endswith(".step"):
max_step_0 = max(max_step_0, int(ckpt.split(".")[0]))
for ckpt in ckpt_list_2:
ckpt = ckpt.strip("/")
if ckpt.endswith(".step"):
max_step_1 = max(max_step_1, int(ckpt.split(".")[0]))
snap_load_path = snapshot_path_0 if max_step_0 > max_step_1 else snapshot_path_1
snap_step = max(max_step_0, max_step_1)
load_path = snap_load_path if snap_step > max_normal_step else load_normal_ckpt_path
load_step = max(snap_step, max_normal_step)
return load_path, load_step
def query_latest_snapshot_step_local(self):
max_step, max_step_path = 0, None
for root, _, files in os.walk(self.save_ckpt_folder, followlinks=True):
for fn in files:
fn = fn.strip("/")
if fn.endswith(".step"):
# We assume that both normal ckpt and snapshot ckpt will store the '.step' file
# as an integrity flag.
step = int(fn.rsplit(".", maxsplit=1)[0])
if max_step < step:
max_step = step
max_step_path = root
return max_step_path, max_step
def query_lastest_ckpt(self):
latest_checkpoint = None
# Training was automatically restarted by the process, forcing the latest snapshot to be read.
if self.save_ckpt_folder:
if self.save_ckpt_folder.startswith("boto3"):
latest_checkpoint, step = self.query_latest_snapshot_step_boto3()
elif self.save_ckpt_folder.startswith("local"):
latest_checkpoint, step = self.query_latest_snapshot_step_local()
else:
latest_checkpoint, step = None, 0
if latest_checkpoint is not None:
if gpc.is_rank_for_log():
logger.info(f"Found latest ckpt : {latest_checkpoint}, step: {step}")
send_alert_message(
address=self.feishu_address,
message=f"Auto restart resume from ckpt-path: '{latest_checkpoint}', step : {step}",
)
else:
if gpc.is_rank_for_log():
send_alert_message(
address=self.feishu_address,
message=f"Can't find snapshot checkpoint, use default load-ckpt path: {latest_checkpoint}",
)
return latest_checkpoint
def try_load_model(self, current_time=""):
model_load_path = None
if self.load_ckpt_folder and self.load_model_only_folder:
raise ValueError(
"Error, try to use both load_ckpt_folder and load_model_only_folder paths, \
if you only need to load model weights (for example starting an SFT task for the first time), \
set load_model_only_folder path, if you need to resume training from ckpt, \
set load_ckpt_folder or use default value \
(if is the default value, internlm will try to load the latest ckpt from save_ckpt_folder)"
)
if self.load_ckpt_folder:
if gpc.is_rank_for_log():
logger.info(
f"===========Resume training from `{self.load_ckpt_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = self.load_ckpt_folder
elif self.load_model_only_folder:
if gpc.is_rank_for_log():
logger.info(
f"===========Load Model from `{self.load_model_only_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = self.load_model_only_folder
else:
if gpc.is_rank_for_log():
logger.info(
f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
)
# Loading model weights must be done before zero is initialized.
if model_load_path is not None:
load_model_checkpoint(folder=model_load_path, model=self.model)
def try_resume_training(self, lr_scheduler, optimizer, lr, train_state, train_dl):
"""Attempt to restore the training state of the last ckpt.
Args:
lr_scheduler (_LRScheduler): lr_scheduler object.
optimizer (Optimizer): optimizer object.
lr (float): learning rate.
train_state (dict): training states.
train_dl (DataLoader): training dataloader object.
"""
if self.load_ckpt_folder is not None:
# load optimizer states.
if self.load_optimizer:
load_optimizer_checkpoint(self.load_ckpt_folder, optimizer)
# load lr scheduler states.
load_scheduler(self.load_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
# load training states.
load_context(self.load_ckpt_folder, train_dl, train_state)
# load dataloader sampler states.
if hasattr(train_state, "batch_sampler") and not isinstance(
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
):
load_sampler(self.load_ckpt_folder, train_dl.batch_sampler)
if hasattr(train_state, "data_state_dict"):
train_dl.dataset.load_state_dict(
llm_load(os.path.join(self.load_ckpt_folder, "sampler_0.pt")), ckpt_path=self.load_ckpt_folder
)
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
def save_checkpoint(
self,
folder,
model,
optimizer,
scheduler,
train_state: TrainState,
model_config: Dict = None,
model_config_file: str = None,
):
"""
Save checkpoint to the given folder path.
"""
start = time.time()
self.set_save_folder(folder, train_state.step_count)
torch.cuda.synchronize()
torch.distributed.barrier()
if gpc.is_rank_for_log():
logger.info(f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count}...")
timer("save-model").start()
save_model_checkpoint(folder=folder, model=model)
timer("save-model").stop()
timer("save-optimizer").start()
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
timer("save-optimizer").stop()
if (
hasattr(train_state, "data_state_dict")
and gpc.get_local_rank(ParallelMode.TENSOR) == 0
and gpc.get_local_rank(ParallelMode.PIPELINE) == 0
):
llm_save(
os.path.join(folder, f"sampler_{gpc.get_local_rank(ParallelMode.DATA)}.pt"),
saved_obj=train_state.data_state_dict,
)
if gpc.is_rank_for_log():
scheduler_states = scheduler.state_dict()
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
if hasattr(train_state, "batch_sampler") and not isinstance(
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
):
sampler_state = train_state.batch_sampler.state_dict()
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
if model_config is not None:
# Model configuration dictionary.
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
if model_config_file is not None:
# The complete training config file content, stored in binary format.
llm_save(os.path.join(folder, "config_file.pt"), saved_obj=model_config_file)
torch.distributed.barrier()
if gpc.is_rank_for_log():
timer.log(["save-model", "save-optimizer"], logger=logger)
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
if self.storage_manager.async_mode is False:
llm_save(
os.path.join(folder, f"{train_state.step_count}.step"),
saved_obj=dict({"step": train_state.step_count}),
)
def set_save_folder(self, folder, step):
self.storage_manager.latest_save_folder = folder
self.storage_manager.latest_save_step = step
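Both save paths finish by writing a '{step_count}.step' marker: synchronously at the end of save_checkpoint when async upload is off, or from wait_async_upload_finish once all uploads have completed. A hedged sketch (not part of the codebase) of treating that marker as the integrity flag for a local checkpoint directory:

import os

def is_complete_local_ckpt(folder: str, step: int) -> bool:
    # The '.step' file is written last, so its presence implies the model and
    # optimizer shards for this step were fully saved or uploaded.
    return os.path.exists(os.path.join(folder, f"{step}.step"))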

View File

@ -1,15 +1,13 @@
import os
import time
from collections import OrderedDict
from functools import partial
from functools import partial, reduce
from typing import Any, Dict, List, Tuple
import pyecharts
import torch
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.solver.pipeline_utils import partition_uniform
from internlm.core.naive_amp import NaiveAMPModel
mb = 1024 * 1024
@ -107,6 +105,8 @@ class SimpleMemState:
"""
Update the total memory usage of the model and sub-models.
"""
self._total_mem = self._layer_mem
for stat in self.sub_model_stats.values():
# Update sub-model status first.
stat.update_total_memory()
@ -169,6 +169,39 @@ class SimpleMemState:
return {"name": self.layer_name, "children": children}
class ActivationMemState:
"""
Activation Memory State
"""
def __init__(self, num_chunks: int) -> None:
self._num_chunks = num_chunks
self.inited: List[bool] = [False for _ in range(num_chunks)]
self.states: List[SimpleMemState] = [SimpleMemState(f"activations_{idx}") for idx in range(num_chunks)]
@property
def total_mem(self) -> int:
return sum(state.total_mem for state in self.states)
def dump(self, prefix: str = "") -> str:
return reduce(lambda x, y: x + y, [state.dump(prefix) for state in self.states])
def to_json(self, base: int = 1024 * 1024) -> List:
return [state.to_json(base) for state in self.states]
def _unpack_naive_wrapper(model: torch.nn.Module) -> Tuple[torch.nn.Module, int]:
num_chunks = len(model) if isinstance(model, torch.nn.ModuleList) else 1
if num_chunks > 1:
model = torch.nn.ModuleList([_model.model if isinstance(_model, NaiveAMPModel) else _model for _model in model])
else:
model = model.model if isinstance(model, NaiveAMPModel) else model
return model, num_chunks
class SimpleMemoryProfiler:
"""
A memory profiler for an LLM model.
@ -177,7 +210,7 @@ class SimpleMemoryProfiler:
model (torch.nn.Module): The model to profile.
optimizer (torch.optim.Optimizer): The optimizer used for training the model.
log_file (str): The file to write the memory state information to.
activation_config (List[str], optional): The list of activation layers to track. Defaults to None.
total_steps: number of steps to trace.
"""
def __init__(
@ -186,9 +219,8 @@ class SimpleMemoryProfiler:
optimizer: torch.optim.Optimizer,
log_folder: str,
total_steps: int = 5,
activation_config: List[str] = None,
):
self._model = model
self._model, self._num_model_chunks = _unpack_naive_wrapper(model)
self._optimizer = optimizer
self._log_folder = log_folder
self._remaining_steps = total_steps
@ -197,17 +229,20 @@ class SimpleMemoryProfiler:
self._record_start_time = time.time()
# For activation memory state.
self._activation_config = activation_config
self._activation_mem_inited: bool = False
self._activation_mem: int = 0
self._activation_max_count = 0
self._activation_base_mem: SimpleMemState = SimpleMemState("activations")
self._activation_mem_max: int = 0
self._activation_base_mems = ActivationMemState(self._num_model_chunks)
# Check or create log folder
os.makedirs(self._log_folder, exist_ok=True)
# Register activation memory tracking hooks
self._register_activation_trace_hooks()
if self._num_model_chunks > 1:
for chunk_id in range(self._num_model_chunks):
self._register_activation_trace_hooks(chunk_id, self._model[chunk_id])
else:
self._register_activation_trace_hooks(0, self._model)
# Calculate static parameter cuda memory
self._param_mem_state = SimpleMemState("param_mem")
@ -221,7 +256,7 @@ class SimpleMemoryProfiler:
self._calc_tensor_group_memory(self._os_params_mem_state, list(enumerate(self._optimizer.param_groups)))
# Generate the first memory record
self.point(create=True)
self.point(with_options="params,grads,os_params", create=True)
def point(self, with_options: str = "", create: bool = False) -> None:
"""
@ -272,7 +307,7 @@ class SimpleMemoryProfiler:
if "os_state" in options:
layout_info += "os_state_layout:\n" + self._os_state_mem_state.dump()
if "activation_base" in options:
layout_info += "activation_base_layout:\n" + self._activation_base_mem.dump()
layout_info += "activation_base_layout:\n" + self._activation_base_mems.dump()
# Write memory state information to log file
file_mode = "w" if create else "a"
@ -315,14 +350,14 @@ class SimpleMemoryProfiler:
[self._os_params_mem_state.to_json(), self._os_state_mem_state.to_json()],
"os_memory_sunburst",
)
self._render_sunburst_chart(self._activation_base_mem.to_json()["children"], "activation_memory_sunburst")
self._render_sunburst_chart(self._activation_base_mems.to_json(), "activation_memory_sunburst")
# Generate summary sunburst chart
summary_sunburst_data = [
{"name": "params", "value": self._param_mem_state.total_mem // mb},
{"name": "grads", "value": self._grad_mem_state.total_mem // mb},
{"name": "os_params", "value": self._os_params_mem_state.total_mem // mb},
{"name": "os_state", "value": self._os_state_mem_state.total_mem // mb},
{"name": "activation", "value": self._activation_base_mem.total_mem // mb},
{"name": "activation", "value": self._activation_mem_max // mb},
]
self._render_sunburst_chart(summary_sunburst_data, "summary_sunburst")
@ -337,12 +372,13 @@ class SimpleMemoryProfiler:
{},
{
"r0": "10%",
"r": "40%",
"r": "35%",
"itemStyle": {"borderWidth": 3},
"label": {"align": "left"},
},
{"r0": "40%", "r": "65%", "label": {"align": "left"}},
{"r0": "65%", "r": "80%", "label": {"align": "left"}},
{"r0": "35%", "r": "55%", "label": {"align": "left"}},
{"r0": "55%", "r": "70%", "label": {"align": "left"}},
{"r0": "70%", "r": "80%", "label": {"align": "left"}},
{"r0": "80%", "r": "90%", "label": {"align": "left"}},
{
"r0": "90%",
@ -357,7 +393,14 @@ class SimpleMemoryProfiler:
f"{self._log_folder}/{name}.html"
)
def _inner_activation_trace_hook(self, layer_name: str, model: Any, inputs: Any, output: torch.Tensor) -> None:
def _inner_activation_trace_hook(
self,
chunk_id: int,
layer_name: str,
model: Any,
inputs: Any,
output: torch.Tensor,
) -> None:
"""
Hook function to trace the activation memory usage for an inner layer.
@ -373,13 +416,15 @@ class SimpleMemoryProfiler:
del model, inputs
assert isinstance(output, torch.Tensor), f"Invalid output type: {type(output)}"
if self._stoped or self._activation_mem_inited:
if self._stoped or self._activation_base_mems.inited[chunk_id]:
return
# Delay updating the total_mem of activation_base_mem here, it will be handled in the forward ending hook.
self._activation_base_mem.add(layer_name, output.element_size() * output.nelement(), flush=False)
self._activation_base_mems.states[chunk_id].add(
layer_name, output.element_size() * output.nelement(), flush=False
)
def _activation_trace_hook_forward(self, model: Any, inputs: Any, output: torch.Tensor) -> None:
def _activation_trace_hook_forward(self, chunk_id: int, model: Any, inputs: Any, output: torch.Tensor) -> None:
"""
Hook function to trace the activation memory usage for a forward pass.
@ -398,23 +443,24 @@ class SimpleMemoryProfiler:
return
# Check if the activation memory has been initialized
if self._activation_mem_inited is False:
if self._activation_base_mems.inited[chunk_id] is False:
self._activation_base_mems.inited[chunk_id] = True
# Update the total memory of the activation base memory state
self._activation_base_mem.update_total_memory()
self._activation_base_mems.states[chunk_id].update_total_memory()
# Set with_options to "activation_base" to include activation_base_layout in the memory dump
self._activation_mem_inited = True
with_options = "activation_base"
else:
with_options = ""
# Accumulate activation memory usage for each forward pass
self._activation_mem += self._activation_base_mem.total_mem
# Update activation max count
if self._activation_mem // self._activation_base_mem.total_mem > self._activation_max_count:
self._activation_max_count = self._activation_mem // self._activation_base_mem.total_mem
self._activation_mem += self._activation_base_mems.states[chunk_id].total_mem
if self._activation_mem > self._activation_mem_max:
self._activation_mem_max = self._activation_mem
# Trigger a memory record
self.point()
self.point(with_options)
def _activation_tarce_hook_backward(self, model: Any, inputs: Any, grad_outputs: Any) -> None:
def _activation_tarce_hook_backward(self, chunk_id: int, model: Any, inputs: Any, grad_outputs: Any) -> None:
"""
Hook function to trace the activation memory usage for a backward pass.
@ -432,37 +478,28 @@ class SimpleMemoryProfiler:
return
# Release activation memory usage for each backward pass
self._activation_mem -= self._activation_base_mem.total_mem
self._activation_mem -= self._activation_base_mems.states[chunk_id].total_mem
# Trigger a memory record
self.point()
def _register_activation_trace_hooks(self) -> None:
def _register_activation_trace_hooks(self, chunk_id: int, model_chunk: torch.nn.Module) -> None:
"""
Register activation trace hooks for the model and each submodule in the model.
"""
# Register inner activation trace hooks for each submodule in the model
for layer_name in self._activation_config:
# Register a hook for every activation
model = self._model
sub_models = layer_name.split(".")
# Get the target sub-model
for sub_model_name in sub_models:
try:
model = model.get_submodule(sub_model_name)
except AttributeError:
model = None
break
for layer_name, sub_model in model_chunk.named_modules():
# Register the hook
if model is not None:
model.register_forward_hook(partial(self._inner_activation_trace_hook, layer_name))
if len(sub_model._modules) != 0:
continue # TODO: in some special cases, we may need some additional configuration to correct
sub_model.register_forward_hook(partial(self._inner_activation_trace_hook, chunk_id, layer_name))
# Register a forward hook for the main model to track activation memory usage
self._model.register_forward_hook(self._activation_trace_hook_forward)
model_chunk.register_forward_hook(partial(self._activation_trace_hook_forward, chunk_id))
# Register a backward hook for the main model to release activation memory usage
self._model.register_full_backward_hook(self._activation_tarce_hook_backward)
model_chunk.register_full_backward_hook(partial(self._activation_tarce_hook_backward, chunk_id))
def _calc_tensor_memory(
self, root_stat: SimpleMemState, named_tensors: Dict[str, torch.Tensor], require_grad: bool = False
@ -554,48 +591,6 @@ class SimpleMemoryProfiler:
self._calc_tensor_memory(root_stat, named_tensors)
def build_activation_config(num_layers: int, num_chunks: int = 1) -> List[str]:
# TODO: support interleaved pipeline scheduling.
assert num_chunks == 1, "Only support num_chunks == 1"
if gpc.is_initialized(ParallelMode.PIPELINE):
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
else:
pipeline_size = 1
pipeline_rank = 0
all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
parts = all_parts[pipeline_rank]
start, end = parts[0]
num_blocks = end - start
block_conf_tmpl = [
"mixer.rotary_emb",
"mixer.Wqkv",
"mixer.inner_attn",
"mixer.inner_cross_attn",
"mixer.out_proj",
# "dropout1", # skip when dropout_selective_checkpoint is True
# "dropout2", # skip when dropout_selective_checkpoint is True
"norm1",
"norm2",
"mlp.w1",
"mlp.w2",
"mlp.w3",
]
block_conf = []
for block_id in range(num_blocks):
block_conf += [f"blocks.{block_id}.{layer}" for layer in block_conf_tmpl]
# We don't need to care about whether the embedding, norm, and head layers exist in the model after partitioning.
# If they don't exist, they will be automatically ignored when registering activation trace hooks.
activation_conf = ["embedding", "norm", "head"] + block_conf
return activation_conf
if __name__ == "__main__":
class SimpleModel(torch.nn.Module):
@ -635,32 +630,39 @@ if __name__ == "__main__":
return output
def _simple_schedule(_num_chunks, _model_chunks, _input) -> torch.Tensor:
if _num_chunks > 1:
_output = _input
for _model_chunk in _model_chunks:
_output = _model_chunk(_output)
else:
_output = _model_chunks(_input)
return _output
# num_chunks config
_num_chunks = 1
# init model and optimizer
_model: torch.nn.Module = SimpleModel()
if _num_chunks > 1:
_chunks = [SimpleModel(skip_layer2=idx % 2 == 0) for idx in range(_num_chunks)]
_model = torch.nn.ModuleList(_chunks).cuda()
else:
_model: torch.nn.Module = SimpleModel().cuda()
_optimizer = torch.optim.Adam(_model.parameters())
# create activation config for simple model layer by layer.
activation_configs = [
# model level 0
"layer1",
"layer2",
"layer3",
# model level 1
"layer2.layer1",
"layer2.layer3",
]
_model.modules()
# init profiler
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler.log", activation_configs)
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler", total_steps=1)
_optimizer.zero_grad()
x1 = torch.randn((128, 5120))
x2 = torch.randn((128, 5120))
out1 = _model(x1)
out2 = _model(x2)
# inputs
x1 = torch.randn((128, 5120)).cuda()
x2 = torch.randn((128, 5120)).cuda()
# forward
out1 = _simple_schedule(_num_chunks, _model, x1)
out2 = _simple_schedule(_num_chunks, _model, x2)
# backward
out1.mean().backward()
out2.mean().backward()

View File

@ -15,8 +15,6 @@ from asyncio.tasks import ALL_COMPLETED
from datetime import datetime
from typing import Any, Awaitable, Callable, Dict, List, Union
import boto3
import botocore
import torch
import torch.distributed as dist
@ -24,6 +22,13 @@ from internlm.core.context import global_context as gpc
from internlm.utils.common import SingletonMeta
from internlm.utils.logger import get_logger
try:
import boto3
import botocore
except ImportError:
pass
logger = get_logger(__file__)
boto3_url_re = re.compile(r"([^\.]+)\.([\d\.]+)")
@ -234,13 +239,13 @@ class Boto3Client(StorageClient):
"""
paginator = handler.client.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=bucket_name, Prefix=fp)
folder_name_list = []
for page in pages:
if "Contents" in page:
for obj in page["Contents"]:
fp: str = obj["Key"]
folder_name_list.append(fp.rsplit("/", maxsplit=1)[1])
return folder_name_list
pth: str = obj["Key"]
folder_name_list.append(pth.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0])
return list(set(folder_name_list))
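The reworked key handling above returns the set of immediate children under the queried prefix instead of bare file names, which is what the snapshot-query logic in CheckpointManager expects. A small illustrative check of that expression with made-up object keys:

fp = "model_ckpt/7B"  # hypothetical queried prefix
keys = [
    "model_ckpt/7B/20/model_tp0_pp0.pt",
    "model_ckpt/7B/20/optimizer_tp0_pp0_zo0.pt",
    "model_ckpt/7B/snapshot/0/20.step",
]
children = {k.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0] for k in keys}
print(sorted(children))  # ['20', 'snapshot']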
@staticmethod
def async_upload_fileobj(handler, bucket_name: str, fp: str, local_nvme_path: str):
@ -391,6 +396,11 @@ class StorageManager(metaclass=SingletonMeta):
self.tmp_local_folder = tmp_local_folder
self.async_mode = async_mode
self.has_warning = False
self._async_loop = None
self._thread_pool = None
self.latest_save_folder = None
self.latest_save_step = 0
self.async_task_peeding = False
if enable_save and self.async_mode:
self._async_loop = asyncio.new_event_loop()
@ -485,6 +495,7 @@ class StorageManager(metaclass=SingletonMeta):
torch.save(saved_obj, f, pickle_protocol=pickle.HIGHEST_PROTOCOL)
self.async_executor(meta.async_upload_fn, *unpack_meta(meta))
os.chmod(tmp_step_file, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
self.async_task_peeding = True
else:
meta.client.sync_upload_fileobj(*unpack_meta(meta), *args, saved_obj=saved_obj, **kwargs)
self.upload_count += 1
@ -523,23 +534,22 @@ class StorageManager(metaclass=SingletonMeta):
pass
async def _sync_tasks(self) -> Awaitable[None]:
if not self._async_stack:
return
if self._async_stack:
await asyncio.wait(self._async_stack, return_when=ALL_COMPLETED)
for task in self._async_stack:
count = 0
while self._async_stack:
t = self._async_stack[0]
try:
task.exception()
e = t.exception()
if e:
self._exception_list.append((e, count))
logger.error(f"File:{self._to_be_del_files[count]}, upload failed for {e}")
# raise e
count += 1
self._async_stack.pop(0)
except InvalidStateError:
continue
except Exception as e:
file_id = len(self._exception_list)
self._exception_list.append((e, file_id))
logger.error(f"File: {self._to_be_del_files[file_id]}, " f"upload failed with {e}")
self._async_stack.clear()
# Not finished. https://docs.python.org/3/library/asyncio-task.html#asyncio.Task.exception
pass
def async_executor(self, fn: Callable, *args, **kwargs) -> None:
"""
@ -559,11 +569,14 @@ class StorageManager(metaclass=SingletonMeta):
if not self.async_mode:
return
if not self.async_task_peeding:
return
if self._async_loop:
self._async_loop.run_until_complete(self._sync_tasks())
if self._exception_list:
for file_id, error_msg in self._exception_list:
for error_msg, file_id in self._exception_list:
logger.error(
f"Node:{socket.gethostname()}, Error: Checkpoint {self._to_be_del_files[file_id]} "
f"failed on step {self.upload_count}: {error_msg}"
@ -577,10 +590,16 @@ class StorageManager(metaclass=SingletonMeta):
self._del_tmp_folder()
self._exception_list.clear()
self._to_be_del_files.clear()
self.async_task_peeding = False
if gpc.is_rank_for_log():
logger.info("all async uploads succeeded!")
self.upload_count += 1
if self.async_mode:
self.save(
os.path.join(self.latest_save_folder, f"{self.latest_save_step}.step"),
saved_obj=dict({"step": self.latest_save_step}),
async_upload=False,
)
storage_manager: StorageManager = None

View File

@ -11,10 +11,6 @@ from torch.utils.tensorboard import SummaryWriter
from internlm.core.context import global_context as gpc
def copy_ignore_folder(source_path, target_path):
os.system(f"cp -r {source_path}/* {target_path}/")
def tb_save_run_info(writer, config_lines, global_step=0):
writer.add_text(tag="cmd", text_string=" ".join(sys.argv[:]), global_step=global_step)
lines = []
@ -42,9 +38,21 @@ def init_tb_writer(
tb_folder = tensorboard_folder
if gpc.get_global_rank() == 0:
# If we don't load ckpt, 'resume_tb_folder' is set as the tensorboard
# dir of the last task by 'make_launch_script.sh'.
# If we load ckpt, 'resume_tb_folder' will be overwritten as the
# reloaded 'train_state.resume_tb_folder'.
if resume_tb_folder is not None:
assert len(resume_tb_folder) > 0 and resume_tb_folder != "/"
if not os.path.exists(resume_tb_folder):
logger.error(
f"Can't found resume_tb_folder{resume_tb_folder}, \
please make sure this folder is located at local file system."
)
else:
logger.info(f"Try mv tensorboard logs: {resume_tb_folder} to {tb_folder}... ")
copy_ignore_folder(resume_tb_folder, tb_folder)
os.system(f"cp -r {resume_tb_folder}/* {tb_folder}/")
os.system(f"chmod -R +w {tb_folder}/")
else:
logger.info(f"Login tensorboard logs to: {tb_folder}")
@ -126,6 +134,14 @@ class Writer:
except Exception:
traceback.print_exc()
def add_scalars(self, key, value, step):
try:
assert isinstance(value, dict)
if self.enable_tb and self.tb_writer is not None:
self.tb_writer.add_scalars(main_tag=key, tag_scalar_dict=value, global_step=step)
except Exception:
traceback.print_exc()
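This mirrors SummaryWriter.add_scalars and lets the training loop log dict-valued metrics (such as per-group grad norms) under a single tag. A hedged usage sketch, assuming an already-initialized Writer instance named writer:

# Hypothetical values; the dict is rendered as one multi-line chart in TensorBoard.
writer.add_scalar(key="loss", value=2.31, step=100)
writer.add_scalars(key="grad_norm", value={"default": 1.7, "fp32": 0.9}, step=100)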
def add_text(self, key, value, step):
try:
if self.enable_tb and self.tb_writer is not None:

View File

@ -13,4 +13,4 @@ boto3
botocore
torch-scatter
pyecharts
-f https://data.pyg.org/whl/torch-1.13.0+cu117.html
-f https://data.pyg.org/whl/torch-1.13.1+cu117.html

516
train.py
View File

@ -5,99 +5,48 @@ import socket
import time
import traceback
from functools import partial
from typing import Iterable
import numpy as np
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import DataLoader
import internlm
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.naive_amp import NaiveAMPModel
from internlm.core.scheduler import SchedulerMetricHook
from internlm.core.trainer import TrainState
from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
from internlm.data.dataset import get_dataset_dict
from internlm.data.dummy_dataset import RandomDataset
from internlm.data.packed_dataset import (
PackedDataset,
PackedDatasetWithoutCuSeqlen,
get_packed_dataset_without_short_length,
)
from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
from internlm.initialize import initialize_distributed_env
from internlm.model.loss import FlashGPTLMLoss
from internlm.model.metrics import AccPerplex
from internlm.monitor import initialize_monitor_manager, send_alert_message, set_env_var
from internlm.monitor import initialize_monitor_manager, send_alert_message
from internlm.monitor.monitor import monitor_manager as mm
from internlm.solver.beta2_scheduler import Beta2Scheduler
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.train import (
get_train_data_loader,
get_validation_data_loader,
initialize_llm_profile,
initialize_model,
initialize_optimizer,
load_new_batch,
record_current_batch_training_metrics,
)
from internlm.utils.common import (
BatchSkipper,
DummyProfile,
get_master_node,
get_megatron_flops,
launch_time,
parse_args,
)
from internlm.utils.evaluation import evaluate_on_val_dls
from internlm.utils.gputest import bench_gpu, bench_net
from internlm.utils.logger import get_logger, initialize_uniscale_logger
from internlm.utils.megatron_timers import megatron_timer as timer
from internlm.utils.model_checkpoint import (
CheckpointSaveManager,
load_context,
load_model_checkpoint,
load_optimizer_checkpoint,
load_sampler,
load_scheduler,
)
from internlm.utils.parallel import (
get_parallel_log_file_name,
is_no_pp_or_last_stage,
sync_model_param,
sync_model_param_within_tp,
)
from internlm.utils.registry import MODEL_INITIALIZER
from internlm.utils.simple_memory_profiler import (
SimpleMemoryProfiler,
build_activation_config,
)
from internlm.utils.model_checkpoint import CheckpointManager
from internlm.utils.parallel import get_parallel_log_file_name
from internlm.utils.simple_memory_profiler import SimpleMemoryProfiler
from internlm.utils.writer import Writer
# global llm logger
logger = get_logger(__file__)
def initialize_distributed_env(config: str, launcher: str = "slurm", master_port: int = 8888, seed: int = 1024):
"""
Initialize distributed environment for distributed training.
Args:
config (str): Config file path.
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
master_port (str): The master port for distributed training. 8888 by default.
seed (int, optional): Specified random seed for every process. 1024 by default.
"""
torch.cuda.empty_cache()
if launcher == "torch":
internlm.launch_from_torch(config=config, seed=seed)
elif launcher == "slurm":
internlm.launch_from_slurm(
config=config,
host=get_master_node(),
port=master_port,
seed=seed,
)
else:
assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
def initialize_llm_logger(start_time: str):
"""
Initialize customed uniscale logger.
@ -118,357 +67,14 @@ def initialize_llm_logger(start_time: str):
return uniscale_logger
def initialize_model():
"""
Initialize model.
Returns: The neural network model to be trained or evaluated.
"""
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
if isinstance(model, nn.ModuleList):
model = nn.ModuleList(
[
NaiveAMPModel(
model=_m,
output_to_fp32=False, # manually controlled by interleaved pipleline scheduler
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
for _m in model
]
)
else:
model = NaiveAMPModel(
model=model,
output_to_fp32=is_no_pp_or_last_stage(),
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
# This sync is very important, cause the model weights kept in optimizer are copied
# from the origin parameters in the memory, so we should make sure the dp sync
# does not influence the model weights in optimizer be different with the origin parameters.
sync_model_param(model, parallel_mode=ParallelMode.DATA)
# This function is needed to make sure parameters that are not splitted by tensor parallelism are
# the same across tensor parallelism.
sync_model_param_within_tp(model)
return model
def get_train_data_loader(num_worker: int = 0):
"""
Generate and return the training data loader.
Returns: A tuple of (train_dl, dataset_types).
"""
# Get the dataset types
dataset_types = None
dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
data_cfg = gpc.config.data
# Get the sample weight dictionary
train_folder = data_cfg.train_folder
if not train_folder:
train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
if data_cfg.pack_sample_into_one:
train_ds = PackedDatasetWithoutCuSeqlen(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
train_ds = PackedDataset(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
train_ds = get_packed_dataset_without_short_length(
folder=data_cfg.train_folder,
packed_length=data_cfg.packed_length,
max_length_per_sample=data_cfg.seq_len,
show_progress=dist.get_rank() == 0,
min_length=data_cfg.min_length,
min_length_dict=data_cfg.get("min_length_dict", {}),
pack_into_one_sample=data_cfg.pack_sample_into_one,
)
# partition already completed
# assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen))
if isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen)):
datasets = [train_ds]
else:
datasets = train_ds.datasets
# Create the training dataset sampler
train_sampler = StaticBatchSampler(
datasets,
batch_size=data_cfg.micro_num,
rampup_batch_size=data_cfg.rampup_batch_size,
micro_bsz=data_cfg.micro_bsz,
seed=1024,
drop_last=True,
data_rank=gpc.get_local_rank(ParallelMode.DATA),
data_world_size=gpc.get_world_size(ParallelMode.DATA),
)
train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
# Create the training data loader
train_dl = DataLoader(
dataset=train_ds,
batch_sampler=train_sampler,
num_workers=num_worker,
pin_memory=True,
collate_fn=train_collate_fn,
persistent_workers=True,
)
return train_dl, dataset_types
def get_validation_data_loader(num_worker: int = 0):
"""Generate and return the validation data loader."""
data_cfg = gpc.config.data
if not data_cfg.valid_folder:
val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
else:
val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
if not isinstance(val_ds, dict):
val_ds = {"val": val_ds}
val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
val_dls = {}
for val_name, ds in val_ds.items():
# making the batch_size of validate larger can speed up the evaluation, but it should not be too large,
# otherwise too much data may be dropped
batch_size = min(
data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
)
batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
if batch_size == 0 and gpc.is_rank_for_log():
logger.info(f"skip validate {val_name}.")
continue
val_dls[val_name] = get_dpsampler_dataloader(
ds, shuffle=False, num_workers=num_worker, batch_size=batch_size, collate_fn=val_collate_fn, drop_last=True
) # drop_last=True, otherwise it may cause problems in the last batch
if gpc.is_rank_for_log():
logger.info(
f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
f"samples {str(len(val_dls[val_name]))}."
)
return val_dls
def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
"""
Load and return the new batch data based on training data loader.
Args:
train_dl (torch.utils.data.DataLoader): Dataloader for training.
train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
train_state (TrainState): Current training state.
Returns: A batch data and the updated train_iter.
"""
timer("batch-gen").start()
try:
batch = next(train_iter) # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
next(train_state.batch_sampler_iter)
except StopIteration:
train_iter = iter(train_dl)
batch = next(train_iter)
train_state.batch_sampler_iter = iter(train_state.batch_sampler)
next(train_state.batch_sampler_iter)
train_state.num_consumed_samples_in_epoch = 0
timer("batch-gen").stop()
if batch[0].get("type_ids", None) is not None:
# if use_flash_attn is False, we need to unpack type_ids
if not gpc.config.model.use_flash_attn:
batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
return batch, train_iter
def initialize_optimizer(model: nn.Module):
"""
Initialize optimizer.
Args:
model (torch.nn.Module): Your model instance to be trained or evaluated.
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
"""
adam_cfg = gpc.config.adam
naive_optimizer = torch.optim.AdamW(
params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
lr=adam_cfg.lr,
betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
eps=adam_cfg.adam_eps,
)
optimizer = HybridZeroOptimizer(
naive_optimizer, grad_scal_cfg=gpc.config.grad_scaler, zero_cfg=gpc.config.hybrid_zero_optimizer
)
beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
return optimizer, beta2_scheduler, lr_scheduler
def initialize_llm_profile(profiling: bool = False, start_time: str = None):
"""Initialize and return the profiler context manager instance."""
if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
llm_profile = torch.profiler.profile
logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
else:
llm_profile = DummyProfile
return llm_profile(
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler(
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
),
with_stack=True,
with_modules=True,
)
def record_current_batch_training_metrics(
get_tflops_func,
logger,
writer,
success_update,
batch_count,
batch,
train_state,
optimizer,
beta2_scheduler,
trainer,
start_time,
loss,
grad_norm,
metric,
update_panel,
):
"""
Print some training metrics of current batch.
"""
set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
if success_update in (0, True):
train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
if is_no_pp_or_last_stage():
acc_perplex = metric.get_metric()
if success_update and gpc.is_rank_for_log():
lr = optimizer.param_groups[0]["lr"]
if hasattr(trainer.engine.optimizer, "grad_scaler"):
scaler = trainer.engine.optimizer.grad_scaler._scale.item()
elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
num_tokens_in_batch = batch[1].nelement()
num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
tk_per_gpu = 0
tk_per_gpu = round(
num_tokens_in_batch
* gpc.get_world_size(ParallelMode.DATA)
/ gpc.get_world_size(ParallelMode.GLOBAL)
/ (time.time() - start_time),
2,
)
tflops = get_tflops_func((time.time() - start_time))
infos = {
"tflops": tflops,
"step": batch_count,
"loss": loss.item(),
"tgs (tokens/gpu/second)": tk_per_gpu,
"lr": lr,
"loss_scale": scaler,
"grad_norm": grad_norm,
}
infos["micro_num"] = len(batch[1])
infos["num_consumed_tokens"] = train_state.num_consumed_tokens
infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
infos["num_samples_in_batch"] = num_samples_in_batch # the number of batches which have the most samples
infos["largest_length"] = max_length_in_batch # the longest input
infos["largest_batch"] = max_samples_in_batch # the batch with the most samples
infos["smallest_batch"] = min_samples_in_batch
infos["adam_beta2"] = beta2_scheduler.get_beta2()
fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
infos["fwd_bwd_time"] = fwd_bwd_time
for key, value in acc_perplex.items():
infos[key] = value
line = ""
for key, value in infos.items():
line += f"{key}={value} "
writer.add_scalar(key=key, value=value, step=train_state.step_count)
if update_panel:
logger.info(
line,
extra={
"step": batch_count,
"lr": lr,
"num_consumed_tokens": train_state.num_consumed_tokens,
"grad_norm": grad_norm,
"loss": loss.item(),
"flops": tflops,
"tgs": tk_per_gpu,
"acc": acc_perplex["acc"],
"perplexity": acc_perplex["perplexity"],
"fwd_bwd_time": fwd_bwd_time,
},
)
else:
logger.info(line)
# if loss spike occurs, send alert info to feishu
mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
def main(args):
# init setting
skip_batches = gpc.config.data.skip_batches
total_steps = gpc.config.data.total_steps
valid_every = gpc.config.data.valid_every
load_optimizer = gpc.config.ckpt.load_optimizer
label_smoothing = gpc.config.loss.label_smoothing
lr = gpc.config.adam.lr
load_model_only_folder = gpc.config.ckpt.get("load_model_only_folder", None)
load_resume_ckpt_folder = gpc.config.ckpt.get("load_ckpt_folder", None)
get_tflops_func = partial(
get_megatron_flops,
checkpoint=gpc.config.model.checkpoint,
@ -490,46 +96,22 @@ def main(args):
# initialize customed llm logger
uniscale_logger = initialize_llm_logger(start_time=current_time)
# initialize customed llm writer
with open(args.config, "r") as f:
config_lines = f.readlines()
writer = Writer(
job_name=gpc.config.JOB_NAME,
launch_time=current_time,
file_name=get_parallel_log_file_name(),
tensorboard_folder=gpc.config.tensorboard_folder,
resume_tb_folder=gpc.config.resume_tb_folder,
config=config_lines,
logger=logger,
enable_tb=gpc.config.enable_tb,
)
model_load_path = None
if load_resume_ckpt_folder is not None:
logger.info(
f"===========Resume training from `{load_resume_ckpt_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = load_resume_ckpt_folder
elif load_model_only_folder is not None:
logger.info(
f"===========SFT training from `{load_model_only_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = load_model_only_folder
else:
logger.info(
f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
)
# initialize and resume train state
train_state = TrainState(gpc.config)
# initialize model
model = initialize_model()
with open(args.config, "r") as f:
config_lines = f.readlines()
ckpt_manager = CheckpointManager(
ckpt_config=gpc.config.ckpt,
model=model,
model_config=gpc.config.model,
model_config_file="".join(config_lines),
feishu_address=gpc.config.alert_address,
)
# initialize loss function
criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
@@ -539,29 +121,24 @@ def main(args):
train_state.init_batch_sampler(train_dl)
# Loading model weights must be done before zero is initialized.
if model_load_path is not None:
load_model_checkpoint(folder=model_load_path, model=model)
ckpt_manager.try_load_model(current_time)
optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)
# Loading other persistent training states.
if load_resume_ckpt_folder is not None:
# load lr scheduler states.
load_scheduler(load_resume_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
# load training states.
load_context(load_resume_ckpt_folder, train_dl, train_state)
# load dataloader sampler states.
load_sampler(load_resume_ckpt_folder, train_dl.batch_sampler)
# load optimizer states.
if load_optimizer:
load_optimizer_checkpoint(load_resume_ckpt_folder, optimizer)
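# resume the lr scheduler, dataloader sampler, train state and (when configured) optimizer states through the checkpoint manager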
ckpt_manager.try_resume_training(lr_scheduler, optimizer, lr, train_state, train_dl)
ckpt_save_manager = CheckpointSaveManager(
ckpt_config=gpc.config.ckpt,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
model_config=gpc.config.model,
# initialize custom llm writer
writer = Writer(
job_name=gpc.config.JOB_NAME,
launch_time=current_time,
file_name=get_parallel_log_file_name(),
tensorboard_folder=gpc.config.tensorboard_folder,
resume_tb_folder=train_state.resume_tb_folder, # resume from ckpt.
step_count=train_state.step_count, # resume from ckpt.
config=config_lines,
logger=logger,
enable_tb=gpc.config.enable_tb,
)
# initialize metric for calculating accuracy and perplexity
@@ -598,12 +175,11 @@ def main(args):
# initialize simple memory profiler
if args.profiling:
memory_profiler = SimpleMemoryProfiler(
model.model,
model,
optimizer.optim,
log_folder=f"memory_trace/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
activation_config=build_activation_config(gpc.config.model.num_layers),
)
else:
memory_profiler = None
@@ -621,6 +197,8 @@ def main(args):
for batch_count in range(train_state.batch_count, total_steps):
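# periodically release cached GPU memory and run lightweight GPU / network health benchmarks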
if batch_count % 50 == 0:
torch.cuda.empty_cache()
bench_gpu()
bench_net()
start_time = time.time()
timer("one-batch").start()
@@ -645,6 +223,7 @@ def main(args):
# do forward and backward
timer("fwd-bwd").start()
_, _, loss = trainer.execute_schedule(
batch, forward_only=False, return_loss=True, return_output_label=False
)
@@ -659,7 +238,7 @@ def main(args):
train_state.step_count += 1
else:
train_state.inf_nan_skip_batches += 1 # record the number of batches whose parameter update was skipped.
if -99.0 in grad_norm_groups and gpc.is_rank_for_log(): # -99.0 encodes a specific failure case
if -1 in grad_norm_groups.values() and gpc.is_rank_for_log(): # -1 encodes a specific failure case
logger.warning(f"Warning: skip parameter update at step {batch_count}.")
send_alert_message(
address=gpc.config.alert_address,
@@ -680,7 +259,7 @@ def main(args):
trainer=trainer,
start_time=start_time,
loss=loss,
grad_norm=np.array(grad_norm_groups),
grad_norm=grad_norm_groups,
metric=metric,
update_panel=uniscale_logger is not None,
)
@@ -700,14 +279,17 @@ def main(args):
# checkpoint the training states at specific steps, as determined by the "checkpoint_every" argument;
# the saved state includes the batch sampler, which tracks the true number of consumed samples
ckpt_save_manager.try_save_checkpoint(train_state)
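# try_save_checkpoint also reports whether training should stop after this save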
now_break = ckpt_manager.try_save_checkpoint(train_state)
if now_break:
break
if memory_profiler is not None:
memory_profiler.step()
if batch_count % 2 == 0:
prof.step()
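# before exiting, block until any asynchronous checkpoint uploads have completed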
ckpt_save_manager.wait_async_upload_finish()
ckpt_manager.wait_async_upload_finish()
if __name__ == "__main__":