merge feature_add_moe into feature_add_moe_data

2023-09-21 15:47:31 +08:00 · 2023-09-21 15:47:31 +08:00 · b596255356
parent 742a21677b 17bc5f562b
commit b596255356
129 changed files with 9038 additions and 1085 deletions
--- a/.github/workflows/demo_in_readme.yaml
+++ b/.github/workflows/demo_in_readme.yaml
@ -39,7 +39,7 @@ jobs:
    needs: check-requirements
    runs-on: [lmtest]
    steps:
-    - name: mask env 
+    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3
@ -60,15 +60,29 @@ jobs:
    runs-on: [lmtest]
    timeout-minutes: 30
    steps:
-    - name: mask env 
+    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3

    - name: slurm-train
+      id: basic_train
      run: |
        source activate internlm-env-test
        sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+    - name: load_preset_ckpt
+      if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
+      run: |
+        source activate internlm-env-test
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
+
+    - name: load_new_ckpt
+      run: |
+        source activate internlm-env-test
+        export PYTHONPATH=$PWD:$PYTHONPATH
+        sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
        rm -rf $GITHUB_WORKSPACE/llm_ckpts

    - name: torchrun-train
@ -91,18 +105,17 @@ jobs:
      run: |
        source activate internlm-env-test
        export PYTHONPATH=$PWD:$PYTHONPATH
-        sh ./ci_scripts/model/convert_to_hf.sh 
+        sh ./ci_scripts/model/convert_to_hf.sh
        cd ./hf_ckpt
        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
        cd ..
        rm -rf $GITHUB_WORKSPACE/hf_ckpt
-  
  load-chat-model-in-hf:
    if: ${{ always() }}
    needs: check-requirements
    runs-on: [lmtest]
    steps:
-    - name: mask env 
+    - name: mask env
      run: |
        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
    - uses: actions/checkout@v3
--- a/.github/workflows/e2e_test.yaml
+++ b/.github/workflows/e2e_test.yaml
@ -0,0 +1,56 @@
+# End-to-end training tests, triggered by pull requests that target `main` or `develop`.
+name: e2e-tests
+on:
+  pull_request:
+    branches:
+      - "main"
+      - "develop"
+    # changes that only touch documentation do not trigger this workflow
+    paths-ignore:
+      - "doc/**"
+      - "**.md"
+env:
+  # NOTE(review): YAML stores this as a literal string; it is presumably expanded by the
+  # shell only after `${{env.WORKSPACE_PREFIX}}` is substituted into a `run` script — confirm.
+  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
+  SLURM_PARTITION: llm
+
+jobs:
+  # Re-install python requirements when the requirement files changed in the last commit.
+  check-requirements:
+    runs-on: [lmtest]
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+      with:
+        # two commits are fetched so that HEAD^1 exists for the `git diff` below
+        fetch-depth: 2
+    - name: check-requirements
+      run: |
+        source activate internlm-env-test
+        changed_files=$(git diff --name-only -r HEAD^1 HEAD)
+        echo $changed_files
+        if [[ $changed_files =~ "runtime.txt" ]]; then
+          pip install -r requirements/runtime.txt
+        fi
+
+        if [[ $changed_files =~ "torch.txt"  ]]; then
+          pip install -r requirements/torch.txt
+        fi
+
+  # Run the pytest training suites under slurm, one marker per parallel layout.
+  e2e_tests:
+    # always() lets this job start even when check-requirements did not succeed
+    if: ${{ always() }}
+    needs: check-requirements
+    runs-on: [lmtest]
+    timeout-minutes: 30
+    steps:
+    - name: mask env
+      run: |
+        echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
+    - uses: actions/checkout@v3
+
+    - name: e2e-test
+      run: |
+        source activate internlm-env-test
+        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n8 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_8GPU" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2TPSP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP" ./tests/test_training
+        srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} -n16 --ntasks-per-node=8 --cpus-per-task=4 --gpus-per-task=1 pytest -s -v --color=yes -m "training_16GPU_8DP2PP_InterleavedOverlap" ./tests/test_training
--- a/.github/workflows/lint_check.yaml
+++ b/.github/workflows/lint_check.yaml
@ -1,6 +1,6 @@
 name: lint-check

-on: 
+on:
  push:
  pull_request:
    branches:
--- a/.github/workflows/sonar.yaml
+++ b/.github/workflows/sonar.yaml
@ -1,7 +1,7 @@
 name: Sonarqube
 on:
  workflow_dispatch:
-     
+
 jobs:
  sonarqube:
    name: SonarQube Scan
@ -13,4 +13,4 @@ jobs:
      - uses: sonarsource/sonarqube-scan-action@master
        env:
          SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
-          SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
+          SONAR_HOST_URL: ${{ secrets.SONAR_HOST_URL }}
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@ -0,0 +1,28 @@
+# .readthedocs.yml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    # quoted so the version stays a string instead of being read as a float
+    python: "3.8"
+
+# Build documentation in the doc/code-docs/ directory with Sphinx
+sphinx:
+  configuration: doc/code-docs/source/conf.py
+  fail_on_warning: false
+
+# Optionally build your docs in additional formats such as PDF
+formats:
+  - pdf
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+  install:
+    - requirements: doc/code-docs/requirements.txt
--- a/README-ja-JP.md
+++ b/README-ja-JP.md
@ -16,6 +16,7 @@

 [![license](./doc/imgs/license.svg)](./LICENSE)
 [![evaluation](./doc/imgs/compass_support.svg)](https://github.com/internLM/OpenCompass/)
+[![Documentation Status](https://readthedocs.org/projects/internlm/badge/?version=latest)](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest)

 [📘使用法](./doc/en/usage.md) |
 [🛠️インストール](./doc/en/install.md) |
@ -40,6 +41,10 @@ InternLM は、70 億のパラメータを持つベースモデルと、実用

 さらに、大規模な依存関係を必要とせずにモデルの事前学習をサポートする軽量な学習フレームワークが提供されます。単一のコードベースで、数千の GPU を持つ大規模クラスタでの事前学習と、単一の GPU での微調整をサポートし、顕著な性能最適化を達成します。InternLM は、1024GPU でのトレーニングにおいて 90% 近いアクセラレーション効率を達成しています。

+## ニュース
+
+InternLM-7B-Chat v1.1 は、コード インタプリタと関数呼び出し機能を備えてリリースされました。 [Lagent](https://github.com/InternLM/lagent) で試すことができます。
+
 ## InternLM-7B

 ### パフォーマンス評価
@ -80,8 +85,8 @@ Transformers を使用して InternLM 7B チャットモデルをロードする

 ```python
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
 >>> model = model.eval()
 >>> response, history = model.chat(tokenizer, "こんにちは", history=[])
 >>> print(response)
--- a/README-zh-Hans.md
+++ b/README-zh-Hans.md
@ -16,6 +16,7 @@

 [![license](./doc/imgs/license.svg)](https://github.com/open-mmlab/mmdetection/blob/main/LICENSE)
 [![evaluation](./doc/imgs/compass_support.svg)](https://github.com/internLM/OpenCompass/)
+[![Documentation Status](https://readthedocs.org/projects/internlm/badge/?version=latest)](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest)

 [📘使用文档](./doc/usage.md) |
 [🛠️安装教程](./doc/install.md) |
@ -45,6 +46,10 @@ InternLM ，即书生·浦语大模型，包含面向实用场景的70亿参数

 提供了支持模型预训练的轻量级训练框架，无需安装大量依赖包，一套代码支持千卡预训练和单卡人类偏好对齐训练，同时实现了极致的性能优化，实现千卡训练下近90%加速效率。

+## 新闻
+
+我们开源了 InternLM-Chat-7B v1.1。该模型能够调用代码解释器和工具插件。你可以在 [Lagent](https://github.com/InternLM/lagent) 中体验这些新功能。
+
 ## InternLM-7B

 ### 性能评测
@ -74,6 +79,7 @@ InternLM ，即书生·浦语大模型，包含面向实用场景的70亿参数
 | 模型                 | InternLM 格式权重下载地址                                                                                                                      | Transformers 格式权重下载地址                    |
 | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------ |
 | **InternLM 7B**      | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
+| **InternLM Chat 7B v1.1**    | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1)    | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1_1)       |
 | **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b)
 | **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k)

@ -85,8 +91,8 @@ InternLM ，即书生·浦语大模型，包含面向实用场景的70亿参数

 ```python
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
 >>> model = model.eval()
 >>> response, history = model.chat(tokenizer, "你好", history=[])
 >>> print(response)
@ -117,26 +123,44 @@ streamlit run web_demo.py

 我们使用 [LMDeploy](https://github.com/InternLM/LMDeploy) 完成 InternLM 的一键部署。

-1. 首先安装 LMDeploy:
+```bash
+python3 -m pip install lmdeploy
+```

-   ```bash
-   python3 -m pip install lmdeploy
-   ```
+执行以下命令，可以在终端与 `internlm-chat-7b` 模型进行交互式对话，或者通过 WebUI 与它聊天。

-2. 快速的部署命令如下：
+```bash
+# 转换权重格式
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b

-   ```bash
-   python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-7b/model
-   ```
+# 在终端进行交互式对话
+python3 -m lmdeploy.turbomind.chat ./workspace

-3. 在导出模型后，你可以直接通过如下命令启动服务，并在客户端与AI对话
+# 启动 gradio 服务
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+以上过程中，LMDeploy 使用的是 FP16 的计算精度。

-   ```bash
-   bash workspace/service_docker_up.sh
-   python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
-   ```
+除了 FP16 精度，LMDeploy 还支持 `internlm-chat-7b` 4bit 权重模型推理。它不仅把模型的显存减少到 6G，大约只有 FP16 的 40%，更重要的是，经过 kernel 层面的极致优化，其推理性能在 A100-80G 上可达到 FP16 的 2.4 倍以上。
+
+以下是 `internlm-chat-7b` 4bit 权重模型的部署方法。推理速度的 benchmark 请参考[这里](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md#%E6%8E%A8%E7%90%86%E9%80%9F%E5%BA%A6)
+
+```bash
+# download prequantized internlm-chat-7b model from huggingface
+git-lfs install
+git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
+
+# Convert the model's layout and store it in the default path, ./workspace.
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
+
+# run inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+LMDeploy 是涵盖了 LLM 任务的全套轻量化、部署和服务的工具箱。请参考 [部署教程](https://github.com/InternLM/LMDeploy) 了解 InternLM 的更多部署细节。

-[LMDeploy](https://github.com/InternLM/LMDeploy) 支持了 InternLM 部署的完整流程，请参考 [部署教程](https://github.com/InternLM/LMDeploy) 了解 InternLM 的更多部署细节。

 ## 微调&训练

--- a/README.md
+++ b/README.md
@ -16,6 +16,7 @@

 [![license](./doc/imgs/license.svg)](./LICENSE)
 [![evaluation](./doc/imgs/compass_support.svg)](https://github.com/internLM/OpenCompass/)
+[![Documentation Status](https://readthedocs.org/projects/internlm/badge/?version=latest)](https://internlm.readthedocs.io/zh_CN/latest/?badge=latest)

 [📘Usage](./doc/en/usage.md) |
 [🛠️Installation](./doc/en/install.md) |
@ -45,6 +46,10 @@ InternLM has open-sourced a 7 billion parameter base model and a chat model tail

 Additionally, a lightweight training framework is offered to support model pre-training without the need for extensive dependencies. With a single codebase, it supports pre-training on large-scale clusters with thousands of GPUs, and fine-tuning on a single GPU while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency during training on 1024 GPUs.

+## News
+
+InternLM-7B-Chat v1.1 is released with code interpreter and function calling capability. You can try it with [Lagent](https://github.com/InternLM/lagent).
+
 ## InternLM-7B

 ### Performance Evaluation
@ -74,6 +79,7 @@ InternLM 7B and InternLM 7B Chat, trained using InternLM, have been open-sourced
 | Model                         | InternLM Format Weight Download Link                                                                                                                 | Transformers Format Weight Download Link                                         |
 | ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- |
 | **InternLM 7B**         | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b)         | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b)                 |
+| **InternLM Chat 7B v1.1**    | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1)    | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1_1)       |
 | **InternLM Chat 7B**    | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b)    | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b)       |
 | **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |

@ -85,8 +91,8 @@ To load the InternLM 7B Chat model using Transformers, use the following code:

 ```python
 >>> from transformers import AutoTokenizer, AutoModelForCausalLM
->>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
->>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
+>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
+>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
 >>> model = model.eval()
 >>> response, history = model.chat(tokenizer, "hello", history=[])
 >>> print(response)
@ -118,28 +124,45 @@ The effect is as follows

 ### Deployment

-We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-click deployment of InternLM.
+We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the workflow of InternLM deployment.

-1. First, install LMDeploy:
+```bash
+python3 -m pip install lmdeploy
+```

-    ```bash
-    python3 -m pip install lmdeploy
-    ```
+You can utilize the following commands to conduct `internlm-chat-7b` FP16 inference, serve it and interact with AI assistant via WebUI:

-2. Use the following command for quick deployment:
+```bash
+# convert weight layout
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b

-    ```bash
-    python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b/model
-    ```
+# run inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace

-3. After exporting the model, you can start a server and have a conversation with the deployed model using the following command:
-   
-    ```bash
-    bash workspace/service_docker_up.sh
-    python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
-    ```
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```

-[LMDeploy](https://github.com/InternLM/LMDeploy) provides a complete workflow for deploying InternLM. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
+You can also deploy the 4-bit quantized `internlm-chat-7b` model via LMDeploy. It greatly trims down the model's memory overhead to 6G, just 40% of what FP16 inference would take. More importantly, with extremely optimized kernels, inference is more than 2.4x faster than FP16 inference on A100-80G.
+
+Try the following to enjoy 4-bit `internlm-chat-7b` on a GeForce RTX 30-series GPU card. You can find the inference benchmark [here](https://github.com/InternLM/lmdeploy/blob/main/docs/en/w4a16.md#inference-performance).
+
+```bash
+# download prequantized internlm-chat-7b model from huggingface
+git-lfs install
+git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
+
+# Convert the model's layout and store it in the default path, ./workspace.
+python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
+
+# run inference with lmdeploy's turbomind engine
+python3 -m lmdeploy.turbomind.chat ./workspace
+
+# serving with gradio
+python3 -m lmdeploy.serve.gradio.app ./workspace
+```
+
+LMDeploy is an efficient toolkit for compressing, deploying, and serving LLM models. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.

 ## Fine-tuning & Training

--- a/ci_scripts/common/com_func.py
+++ b/ci_scripts/common/com_func.py
@ -0,0 +1,29 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+
+def merge_dicts(dict_a: dict, dict_b: dict):
+    """Recursively merge ``dict_b`` on top of ``dict_a`` and return a new plain dict.
+
+    Args:
+        dict_a (dict): the base mapping; it is never modified.
+        dict_b (dict): the overriding mapping; it is never modified.
+
+    Returns:
+        dict: keys of ``dict_a`` followed by the keys that only exist in ``dict_b``.
+        When both sides hold a dict under the same key the two are merged
+        recursively; in every other case the value from ``dict_b`` wins.
+    """
+    merged = dict(dict_a)
+    for key, value in dict_b.items():
+        base = dict_a.get(key)
+        if isinstance(value, dict) and isinstance(base, dict):
+            # both sides are mappings: descend instead of replacing the whole sub-dict
+            merged[key] = merge_dicts(base, value)
+        else:
+            # new key, scalar override, or the base value is not a mapping
+            merged[key] = value
+    return merged
+
+
+def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
+    """Render ``data`` as python source made of ``key = value`` lines.
+
+    Args:
+        data (dict): the mapping to render; dict values are rendered as ``key = dict(...)``.
+        indent (int): number of spaces put in front of every line of this level.
+        is_nested (bool): when True every entry is terminated with a comma.
+
+    Returns:
+        str: one line per entry (several for dict values, which recurse with 4 more
+        spaces of indent), followed by one extra trailing newline for this level.
+    """
+    pad = " " * indent
+    terminator = ",\n" if is_nested else "\n"
+    entries = []
+    for key, value in data.items():
+        if isinstance(value, dict):
+            body = format_dict_to_py_string(value, indent + 4, is_nested=True)
+            entry = f"{pad}{key} = dict(\n{body}{pad})"
+        else:
+            entry = f"{pad}{key} = {repr(value)}"
+        entries.append(entry + terminator)
+    # every level, nested ones included, ends with one additional newline
+    return "".join(entries) + "\n"
--- a/ci_scripts/data/tokenizer_alpaca.sh
+++ b/ci_scripts/data/tokenizer_alpaca.sh
@ -16,7 +16,7 @@ exit_code=0

 source ./ci_scripts/common/basic_func.sh

-echo "start to test alpaca_tokenizer.py." 
+echo "start to test alpaca_tokenizer.py."

 if [[ -d ${RESULTS} ]]; then
    if ! rm -rf ${RESULTS}/*; then
--- a/ci_scripts/data/tokenizer_chinese.sh
+++ b/ci_scripts/data/tokenizer_chinese.sh
@ -12,7 +12,7 @@ exit_code=0

 source ./ci_scripts/common/basic_func.sh

-echo "start to test tokenizer.py." 
+echo "start to test tokenizer.py."

 num=$(num_files "${RESULTS}")
 if [[ ${num} -gt 0 ]]; then
--- a/ci_scripts/model/convert_to_hf.sh
+++ b/ci_scripts/model/convert_to_hf.sh
@ -40,7 +40,7 @@ num=$(num_files "${CKPTS_OUTPUT}")

 if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
-    exit_code=$(($exit_code + 1)) 
+    exit_code=$(($exit_code + 1))
 fi

 # NOTICE: should not remove the cached files, because the cached files will be used in the next test case.
--- a/ci_scripts/model/demo_load_7B_chat_model.py
+++ b/ci_scripts/model/demo_load_7B_chat_model.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM, AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
--- a/ci_scripts/train/ci_7B_sft.py
+++ b/ci_scripts/train/ci_7B_sft.py
@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
 # Ckpt folder format:
 # fs: 'local:/mnt/nfs/XXX'
 # oss: 'boto3:s3://model_weights/XXX'
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
 # SAVE_CKPT_FOLDER = "local:llm_ckpts"
 SAVE_CKPT_FOLDER = "local:llm_ckpts"
 # LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
@ -27,7 +27,7 @@ ckpt = dict(
    load_optimizer=True,
 )

-TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
+TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
 data = dict(
    seq_len=SEQ_LEN,
    # micro_num means the number of micro_batch contained in one gradient update
@ -120,8 +120,8 @@ zero1 parallel:
    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel: pipeline parallel size.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
+pipeline parallel: pipeline parallel size, only 1 is accepted currently.
+tensor parallel: tensor parallel size, usually the number of GPUs per node, only 1 is accepted currently.
 """
 parallel = dict(
    zero1=8,
--- a/ci_scripts/train/generate_config.py
+++ b/ci_scripts/train/generate_config.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+import argparse
+import json
+import os
+
+from ci_scripts.common import com_func
+from internlm.core.context import Config
+
+
+def generate_new_config(config_py_file, test_config_json, case_name):
+    """Write ``<case_name>.py`` next to ``config_py_file`` with the case overrides applied.
+
+    Args:
+        config_py_file (str): path to the origin train config (a python file).
+        test_config_json (str): path to the json file holding the per-case overrides.
+        case_name (str): key of the case inside ``test_config_json``; also used as
+            the name of the generated config file.
+
+    Raises:
+        KeyError: if ``case_name`` is not a key of the json file, which includes
+            the case of an empty json file.
+    """
+    # validate the case first so that nothing is loaded or written for an unknown case
+    with open(test_config_json) as f:
+        test_config = json.load(f)
+    if not test_config or case_name not in test_config:
+        raise KeyError(f"the case {case_name} doesn't exist. Please check {test_config_json} again!")
+
+    # generate path of the new config py
+    new_config_py_file = os.path.join(os.path.dirname(config_py_file), case_name + ".py")
+
+    # merge dict: values of the selected case override the origin config
+    origin_config = Config.from_file(config_py_file)
+    new_config = com_func.merge_dicts(origin_config, test_config[case_name])
+    print(f"new config is:\n{new_config}")
+
+    # write new config to py file
+    file_content = com_func.format_dict_to_py_string(new_config)
+    with open(new_config_py_file, "w") as f:
+        f.write(file_content)
+    print(f"The new test train config file is {new_config_py_file}")
+
+
+if __name__ == "__main__":
+    # command line entry: build the config file of one test case
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--origin_config",
+        type=str,
+        default="./ci_scripts/train/ci_7B_sft.py",
+        help="path to the origin train config file",
+    )
+    parser.add_argument(
+        "--test_config",
+        type=str,
+        default="./ci_scripts/train/test_config.json",
+        help="path to the test train config file",
+    )
+    # required: without it case_name is None and building the output file name fails later
+    parser.add_argument(
+        "--case_name",
+        type=str,
+        required=True,
+        help="name of the case which will be runned ",
+    )
+    args = parser.parse_args()
+    generate_new_config(args.origin_config, args.test_config, args.case_name)
--- a/ci_scripts/train/load_ckpt.sh
+++ b/ci_scripts/train/load_ckpt.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+# Usage: load_ckpt.sh <case_name> <slurm_job_name>
+#
+# Generates ./ci_scripts/train/<case_name>.py with generate_config.py, runs train.py
+# with it through srun, then checks the number of *.pt files under llm_ckpts/40.
+# The exit status is the number of failed checks (0 means success).
+set -x
+
+[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
+[[ $# -ge 2 ]] || { echo "usage: $0 <case_name> <slurm_job_name>"; exit 1; }
+readonly CASE_NAME="$1"
+readonly JOB_NAME="$2"
+readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
+readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
+readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
+# number of *.pt files expected under ${CKPTS40_PATH} after training
+expected_num=22
+exit_code=0
+
+# NOTE(review): num_files used below is presumably defined in basic_func.sh — confirm.
+source ./ci_scripts/common/basic_func.sh
+
+echo "start to test slurm training with loading checkpoint."
+
+python ./ci_scripts/train/generate_config.py --case_name "${CASE_NAME}"
+file="./ci_scripts/train/${CASE_NAME}.py"
+if [[ -f ${file} ]]; then
+    if ! srun -p "${SLURM_PARTITION}" --exclusive --job-name="${JOB_NAME}" -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config "${file}"; then
+        echo "test slurm training failed."
+        exit_code=$(($exit_code + 1))
+    fi
+else
+    # without the generated config the training can only fail, so no job is submitted
+    echo "expect: ${file} exists, actual: not exist."
+    exit_code=$(($exit_code + 1))
+fi
+
+num=$(num_files "${CKPTS40_OUTPUT}")
+if [[ ${num} -ne ${expected_num} ]]; then
+    echo "expect: ${expected_num} files, actual: ${num} files."
+    exit_code=$(($exit_code + 1))
+fi
+
+# clean the test files.
+if ! rm -rf "${CKPTS_PATH}"/*; then
+    echo "cleaning cached file in ${CKPTS_PATH} failed."
+    exit_code=$(($exit_code + 1))
+fi
+
+exit $exit_code
--- a/ci_scripts/train/slurm_train.sh
+++ b/ci_scripts/train/slurm_train.sh
@ -5,7 +5,7 @@ set -x
 readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
 readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
 readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
-expected_num=21
+expected_num=22
 exit_code=0

 source ./ci_scripts/common/basic_func.sh
@ -25,12 +25,6 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -n 8 --ntasks-per-node=8 --
 num=$(num_files "${CKPTS20_OUTPUT}")
 if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
-    exit_code=$(($exit_code + 1)) 
-fi
-
-# clean the test files.
-if ! rm -rf ${CKPTS_PATH}/*; then
-    echo "cleaning cached file in ${CKPTS_PATH} failed."
    exit_code=$(($exit_code + 1))
 fi

--- a/ci_scripts/train/test_config.json
+++ b/ci_scripts/train/test_config.json
@ -0,0 +1,45 @@
+{
+    "7B_basic_train": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
+        "ckpt": {
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 20
+        }
+    },
+    "7B_load_new_ckpt": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
+        "LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
+        "ckpt": {
+            "load_ckpt_folder": "local:llm_ckpts/20",
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 40
+        }
+    },
+    "7B_load_preset_ckpt": {
+        "SEQ_LEN": 1024,
+        "HIDDEN_SIZE": 2048,
+        "NUM_ATTENTION_HEAD": 16,
+        "NUM_LAYER": 16,
+        "TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
+        "LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
+        "ckpt": {
+            "load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
+            "checkpoint_every": 20
+        },
+        "data": {
+            "total_steps": 40
+        }
+    }
+}
--- a/ci_scripts/train/torchrun.sh
+++ b/ci_scripts/train/torchrun.sh
@ -5,7 +5,7 @@ set -x
 readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
 readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
 readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
-expected_num=21
+expected_num=22
 exit_code=0

 source ./ci_scripts/common/basic_func.sh
@ -25,7 +25,7 @@ srun -p ${SLURM_PARTITION} --exclusive --job-name=$1 -N 1 torchrun --nnodes=1 --
 num=$(num_files "${CKPTS_OUTPUT}")
 if [[ ${num} -ne ${expected_num} ]]; then
    echo "expect: ${expected_num} files, actual: ${num} files."
-    exit_code=$(($exit_code + 1)) 
+    exit_code=$(($exit_code + 1))
 fi

 # clean the test files.
--- a/configs/moe_cfg.py
+++ b/configs/moe_cfg.py
@ -1,152 +0,0 @@
-JOB_NAME = "7b_train"
-
-SEQ_LEN = 2048
-HIDDEN_SIZE = 4096
-NUM_ATTENTION_HEAD = 32
-MLP_RATIO = 8 / 3
-NUM_LAYER = 16
-VOCAB_SIZE = 103168
-
-MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
-# Ckpt folder format:
-# fs: 'local:/mnt/nfs/XXX'
-SAVE_CKPT_FOLDER = "local:llm_ckpts"
-LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
-
-# boto3 Ckpt folder format:
-# import os
-# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
-# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
-# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
-CHECKPOINT_EVERY = 50
-ckpt = dict(
-    enable_save_ckpt=False,  # enable ckpt save.
-    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
-    # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
-    # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
-    load_optimizer=True,  # Wheter to load optimizer states when continuing training.
-    checkpoint_every=CHECKPOINT_EVERY,
-    async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
-    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporarily files during asynchronous upload.
-    snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]),  # directory for snapshot ckpt storage path.
-    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
-)
-
-TRAIN_FOLDER = "/mnt/petrelfs/share_data/llm_data/0623_scratch_tokenized_filtered/train/en/enwiki"
-VALID_FOLDER = "/mnt/petrelfs/share_data/llm_data/0623_scratch_tokenized_filtered/train/en/enwiki"
-data = dict(
-    seq_len=SEQ_LEN,
-    # micro_num means the number of micro_batch contained in one gradient update
-    micro_num=4,
-    packed_length = 2 * SEQ_LEN,
-    micro_bsz=2,
-    # defaults to the value of micro_num
-    valid_micro_num=4,
-    # defaults to 0, means disable evaluate
-    valid_every=50000,
-    pack_sample_into_one=False,
-    total_steps=50000,
-    skip_batches="",
-    rampup_batch_size="",
-    # Datasets with less than 50 rows will be discarded
-    min_length=50,
-    train_folder=TRAIN_FOLDER,
-    valid_folder=VALID_FOLDER,
-)
-
-grad_scaler = dict(
-    fp16=dict(
-        # the initial loss scale, defaults to 2**16
-        initial_scale=2**16,
-        # the minimum loss scale, defaults to None
-        min_scale=1,
-        # the number of steps to increase loss scale when no overflow occurs
-        growth_interval=1000,
-    ),
-    # the multiplication factor for increasing loss scale, defaults to 2
-    growth_factor=2,
-    # the multiplication factor for decreasing loss scale, defaults to 0.5
-    backoff_factor=0.5,
-    # the maximum loss scale, defaults to None
-    max_scale=2**24,
-    # the number of overflows before decreasing loss scale, defaults to 2
-    hysteresis=2,
-)
-
-hybrid_zero_optimizer = dict(
-    # Enable low_level_optimzer overlap_communication
-    zero_overlap_communication=True,
-    # bucket size for nccl communication params
-    reduce_bucket_size=512 * 1024 * 1024,
-    # grad clipping
-    clip_grad_norm=1.0,
-)
-
-loss = dict(
-    label_smoothing=0,
-    moe_loss_coeff=0.1,
-)
-
-adam = dict(
-    lr=1e-4,
-    adam_beta1=0.9,
-    adam_beta2=0.95,
-    adam_beta2_c=0,
-    adam_eps=1e-8,
-    weight_decay=0.01,
-)
-
-lr_scheduler = dict(
-    total_steps=data["total_steps"],
-    init_steps=0,  # optimizer_warmup_step
-    warmup_ratio=0.01,
-    eta_min=1e-5,
-    last_epoch=-1,
-)
-
-beta2_scheduler = dict(
-    init_beta2=adam["adam_beta2"],
-    c=adam["adam_beta2_c"],
-    cur_iter=-1,
-)
-
-model = dict(
-    checkpoint=False,
-    num_attention_heads=NUM_ATTENTION_HEAD,
-    embed_split_hidden=True,
-    vocab_size=VOCAB_SIZE,
-    embed_grad_scale=1,
-    parallel_output=True,
-    hidden_size=HIDDEN_SIZE,
-    num_layers=NUM_LAYER,
-    mlp_ratio=MLP_RATIO,
-    apply_post_layer_norm=False,
-    dtype="torch.bfloat16",
-    norm_type="rmsnorm",
-    layer_norm_epsilon=1e-5,
-    use_flash_attn=True,
-    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
-    sequence_parallel=False,
-    num_experts=4,
-    moe_use_residual=False,
-)
-"""
-zero1 parallel:
-    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
-        so parameters will be divided within the range of dp.
-    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
-    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
-        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
-pipeline parallel (dict):
-    1. size: int, the size of pipeline parallel.
-    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
-tensor parallel: tensor parallel size, usually the number of GPUs per node.
-"""
-parallel = dict(
-    # zero1=4,
-    pipeline=dict(size=4, interleaved_overlap=False),
-    # tensor=dict(size=4),
-)
-
-cudnn_deterministic = False
-cudnn_benchmark = False
--- a/doc/code-docs/Makefile
+++ b/doc/code-docs/Makefile
@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/checkpoint.po
@ -0,0 +1,106 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-13 17:07+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/checkpoint.rst:2
+msgid "模型保存"
+msgstr "Model Checkpointing"
+
+#: ../../source/checkpoint.rst:4
+msgid ""
+"InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` "
+"来管理模型保存。其中，可以使用 ``CheckpointManager.try_save_checkpoint(train_state)`` "
+"来保存指定 step 的模型状态。"
+msgstr ""
+"InternLM uses ``internlm.utils.model_checkpoint.CheckpointManager`` to "
+"manage model checkpointing. In the implementation, we use "
+"``CheckpointManager.try_save_checkpoint(train_state)`` to checkpoint "
+"training states at specific steps. "
+
+#: ../../source/checkpoint.rst:6
+msgid "InternLM支持启动时自动加载最新的模型备份，并在接收信号退出训练时自动进行模型备份。"
+msgstr "InternLM supports automatically loading the latest checkpoint at startup and automatically saving a checkpoint when training exits on a received signal. "
+
+#: ../../source/checkpoint.rst:9
+msgid "Checkpointing"
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager:1 of
+msgid "StorageManagerContext"
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:1 of
+msgid ""
+"Exit signal detection function, if we write the exit step in the "
+"'QUIT_FILE_PATH' file, all ranks will save ckpt and exit. Negative "
+"integer step means save ckpt. Positive integer step means save ckpt and "
+"quit."
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
+msgid "参数"
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
+msgid "返回"
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler:9 of
+msgid "whether to quit."
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.quit_signal_handler of
+msgid "返回类型"
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.wait_async_upload_finish:1
+#: of
+msgid "wait for all checkpoint uploads to be completed"
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.query_latest_snapshot_step_boto3:1
+#: of
+msgid ""
+"Returns: Tuple(str, int): path of latest ckpt and ckpt step, if not "
+"found, None will return."
+msgstr ""
+
+#: internlm.utils.model_checkpoint.CheckpointManager.save_checkpoint:1 of
+msgid "Save checkpoint to the given folder path."
+msgstr ""
+
+#~ msgid "Attempt to restore the training state of the last ckpt."
+#~ msgstr ""
+
+#~ msgid "lr_scheduler object."
+#~ msgstr ""
+
+#~ msgid "optimizer object."
+#~ msgstr ""
+
+#~ msgid "learning rate."
+#~ msgstr ""
+
+#~ msgid "traing states."
+#~ msgstr ""
+
+#~ msgid "traning dataloader object"
+#~ msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/example/30B_demo.po
@ -0,0 +1,50 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/example/30B_demo.rst:2 242d1f89ae2045f1bf1f31bf82f07846
+msgid "30B Demo"
+msgstr ""
+
+#: ../../source/example/30B_demo.rst:5 c2415bfa6978414a939dcc395fdfb544
+msgid "训练配置"
+msgstr "Training Config"
+
+#: ../../source/example/30B_demo.rst:7 75f568d1ca5546228f88958c12c2dd65
+msgid "30B demo 训练配置文件样例如下:"
+msgstr "30B demo config file example:"
+
+#: ../../source/example/30B_demo.rst:164 533cb04f94314eeb8381e45f06d03108
+msgid "启动训练"
+msgstr "Start Training"
+
+#: ../../source/example/30B_demo.rst:166 24974384d5ab42e68266aeb67ae222ce
+msgid "完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动两节点 16GPU 的训练命令如下所示："
+msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. "
+"The following example shows how to start distributed training in ``slurm`` environments with 16 GPUs."
+
+#: ../../source/example/30B_demo.rst:173 948ac71ed53848f9bad07f69d956c4bb
+msgid "训练结果"
+msgstr "Training Results"
+
+#: ../../source/example/30B_demo.rst:175 615a3481b0aa49729b7219b1365519aa
+msgid "基于以上训练配置和启动命令，两节点 16GPU 下的模型训练部分日志展示如下："
+msgstr "Taking the configuration of the demo training on two nodes with 16 GPUs on slurm as an example, the training result log is shown below:"
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/example/7B_demo.po
@ -0,0 +1,50 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/example/7B_demo.rst:2 8576f969040249bb93e7c347ef210990
+msgid "7B Demo"
+msgstr ""
+
+#: ../../source/example/7B_demo.rst:5 5429ceea12424825991744bece744f60
+msgid "训练配置"
+msgstr "Training Config"
+
+#: ../../source/example/7B_demo.rst:7 c9a47faf5deb40b68ad2bc950fdf2b14
+msgid "7B demo 的训练配置文件样例如下:"
+msgstr "7B demo config file example:"
+
+#: ../../source/example/7B_demo.rst:162 eb93a6ca05c8421eb87a2470f9f31fc2
+msgid "启动训练"
+msgstr "Start Training"
+
+#: ../../source/example/7B_demo.rst:164 9e7a864ae2e14d05b0681f16792e5278
+msgid "完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动单节点 8GPU 的训练命令如下所示："
+msgstr "After completing the data preparation and relevant training configurations, you can start the demo training. "
+"The following example shows how to start distributed training in ``slurm`` environments with 8 GPUs."
+
+#: ../../source/example/7B_demo.rst:171 fdd053efb1854d46aabf6c0f279fe7fc
+msgid "训练结果"
+msgstr "Training Results"
+
+#: ../../source/example/7B_demo.rst:173 33ec81f34e3c4340beacdb5254069d08
+msgid "基于以上训练配置和启动命令，单节点 8GPU 下的模型训练部分日志展示如下："
+msgstr "Taking the configuration of the demo training on a single machine with 8 GPUs on slurm as an example, the training result log is shown below:"
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/example/index.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/example/index.po
@ -0,0 +1,33 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/example/index.rst:2 de54695e8bde40ffb8878043072197e6
+msgid "训练样例"
+msgstr "Training Example"
+
+#: ../../source/example/index.rst:5 da388b3209ff4bd39fd0700a7fba413a
+msgid "7B Demo"
+msgstr ""
+
+#: ../../source/example/index.rst:13 b095e27dfc924a7a943b7cba5361700a
+msgid "30B Demo"
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/index.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/index.po
@ -0,0 +1,81 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/index.rst:8 11e029810acf410180311a3c63eb01f4
+msgid "InternLM"
+msgstr "InternLM"
+
+#: ../../source/index.rst:11 e6fd7d058e4b43bb81157ac79867e3d3
+msgid "环境构建"
+msgstr "Environment Setup"
+
+#: ../../source/index.rst:19 f323ede90c0f434d8b627eded1d8fc10
+msgid "快速上手"
+msgstr "Quickstart Guide"
+
+#: ../../source/index.rst:27 3c504b4b92264e9182abb0fa81fe80c3
+msgid "训练构建"
+msgstr "Training Setup"
+
+#: ../../source/index.rst:35 5cc5c831399a40b089d27b777a776b16
+msgid "训练 API"
+msgstr "Training API"
+
+#: ../../source/index.rst:43 21a7473eabb441f8bfe28d2a0e306889
+msgid "并行训练"
+msgstr "Parallel Training"
+
+#: ../../source/index.rst:51 9234725f3c464731993d73607608c874
+msgid "模型备份"
+msgstr "Model Checkpointing"
+
+#: ../../source/index.rst:59 8e4ce037017f4510b2892a66003877fa
+msgid "性能分析"
+msgstr "Profiler"
+
+#: ../../source/index.rst:67 a36e02819ecd4b448a8cb4ebbecb6600
+msgid "训练监控"
+msgstr "Monitor"
+
+#: ../../source/index.rst:75 b912e292486f455c8b5cdd75962e8ac2
+msgid "训练样例"
+msgstr "Example"
+
+#: ../../source/index.rst:83 ea9e9281720941a1830e5df7a2badf7a
+msgid "常见问题"
+msgstr "Q&A"
+
+#: ../../source/index.rst:91 e08edc5aa1c74965b10084b393b88fae
+msgid "索引和表格"
+msgstr "Indices and tables"
+
+#: ../../source/index.rst:93 f3fdca059caa49dcad09aa44be7f02d6
+msgid ":ref:`genindex`"
+msgstr ""
+
+#: ../../source/index.rst:94 b3791e811315435097bb507edc3f4b9b
+msgid ":ref:`modindex`"
+msgstr ""
+
+#: ../../source/index.rst:95 a164b772960f4ab8b18c7e8820f69f55
+msgid ":ref:`search`"
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/initialize.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/initialize.po
@ -0,0 +1,248 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-14 12:23+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/initialize.rst:2
+msgid "训练构建"
+msgstr "Training Setup"
+
+#: ../../source/initialize.rst:4
+msgid "InternLM 的训练流程可以归纳为两个步骤："
+msgstr "The training process of InternLM can be summarized into two steps: "
+
+#: ../../source/initialize.rst:6
+msgid "初始化"
+msgstr "Initialization"
+
+#: ../../source/initialize.rst:8
+msgid "初始化模型、优化器、数据加载器、Trainer，生成不同种类的进程组，为混合并行的迭代训练做准备。"
+msgstr ""
+"Initialize model, optimizer, dataloader, trainer, and create different "
+"types of process groups to prepare for iterative steps of hybrid parallel training. "
+
+#: ../../source/initialize.rst:9
+msgid "初始化Logger、Checkpoint管理器、Monitor管理器、Profiler，对迭代训练的过程观察、预警、记录。"
+msgstr ""
+"Initialize logger, checkpoint manager, monitor manager, and profiler to "
+"watch, alert, and record the iterative training steps. "
+
+#: ../../source/initialize.rst:11
+msgid "迭代训练"
+msgstr "Iterative training steps"
+
+#: ../../source/initialize.rst:13
+msgid "根据配置文件定义的张量并行、流水线并行、数据并行的大小，加载训练引擎和调度器进行混合并行训练。"
+msgstr ""
+"Load the training engine and scheduler for hybrid parallel training "
+"according to the configuration such as tensor parallel size, pipeline "
+"parallel size, and data parallel size. "
+
+#: ../../source/initialize.rst:14
+msgid "在迭代训练中，调用 Trainer API 进行梯度置零，前向传播计算损失并反向传播，参数更新。"
+msgstr ""
+"In iterative training steps, the Trainer API is called to zero the "
+"gradients, run the forward pass to compute the loss, backpropagate, and update the parameters."
+
+#: ../../source/initialize.rst:20
+msgid "InternLM训练流程图"
+msgstr "InternLM training process"
+
+#: ../../source/initialize.rst:25
+msgid "命令行参数解析"
+msgstr "Argument Parsing"
+
+#: ../../source/initialize.rst:27
+msgid ""
+"InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_"
+" 库来向InternLM运行时提供命令行参数配置。"
+msgstr ""
+"InternLM uses the `argparse "
+"<https://docs.python.org/3/library/argparse.html>`_ library to supply "
+"commandline configuration to the InternLM runtime. "
+
+#: ../../source/initialize.rst:29
+msgid ""
+"用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM "
+"的默认解析器，其中包含一些内置参数，用户可以向此解析器添加自定义参数。"
+msgstr ""
+"Use ``internlm.initialize.get_default_parser()`` to get InternLM's "
+"default parser with some builtin arguments, users can add custom "
+"parameters to this parser."
+
+#: internlm.initialize.launch.get_default_parser:1 of
+msgid ""
+"Reads user command line and uses an argument parser to parse the input "
+"arguments. Input arguments include configuration, host, port, world size,"
+" local rank, backend for torch.distributed."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer
+#: internlm.initialize.launch.get_default_parser
+#: internlm.train.training_internlm.get_train_data_loader
+#: internlm.train.training_internlm.initialize_model
+#: internlm.train.training_internlm.initialize_optimizer of
+msgid "返回"
+msgstr ""
+
+#: internlm.initialize.launch.get_default_parser:4 of
+msgid ""
+"Returns the parser with the default arguments, the user may add "
+"customized arguments into this parser."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer
+#: internlm.initialize.launch.get_default_parser
+#: internlm.train.training_internlm.initialize_model of
+msgid "返回类型"
+msgstr ""
+
+#: ../../source/initialize.rst:45
+msgid "模型初始化"
+msgstr "Model Initialization"
+
+#: internlm.train.training_internlm.initialize_model:1 of
+msgid "Initialize model with Automatic Mixed Precision."
+msgstr ""
+
+#: internlm.train.training_internlm.initialize_model:3 of
+msgid "The neural network model to be trained or evaluated."
+msgstr ""
+
+#: ../../source/initialize.rst:49
+msgid "InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下："
+msgstr ""
+"InternLM uses the fields ``model_type`` and ``model`` in the config file "
+"to control the model initialization process. An example model initialization "
+"configuration is defined as follows:"
+
+#: ../../source/initialize.rst:77
+msgid "字段 ``model_type`` 指明了要初始化的模型类型"
+msgstr ""
+"The field ``model_type`` specifies the model type that has been registered"
+" and is to be initialized."
+
+#: ../../source/initialize.rst:78
+msgid "字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置"
+msgstr ""
+"The parameters in the field ``model`` specify the configuration settings "
+"during model initialization."
+
+#: ../../source/initialize.rst:80
+msgid ""
+"值得注意的是，用户可以定义新的模型类型，并使用装饰器 ``@MODEL_INITIALIZER.register_module`` "
+"注册模型的初始化函数，其中 ``MODEL_INITIALIZER`` 是类 "
+"``internlm.util.registry.Registry`` 的一个实例化对象，示例如下所示："
+msgstr ""
+"It is worth noting that users can define a new model type and register "
+"the model's initialization function with the decorator "
+"``@MODEL_INITIALIZER.register_module``, where ``MODEL_INITIALIZER`` is an"
+" instantiated object of class ``internlm.util.registry.Registry``. An "
+"example is shown as follows:"
+
+#: ../../source/initialize.rst:92
+msgid "优化器初始化"
+msgstr "Optimizer Initialization"
+
+#: internlm.train.training_internlm.initialize_optimizer:1 of
+msgid "Initialize optimizer."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer
+#: internlm.train.training_internlm.get_train_data_loader
+#: internlm.train.training_internlm.initialize_optimizer of
+msgid "参数"
+msgstr ""
+
+#: internlm.train.training_internlm.initialize_optimizer:3 of
+msgid "Your model instance to be trained or evaluated."
+msgstr ""
+
+#: internlm.train.training_internlm.initialize_optimizer:6 of
+msgid "A tuple of (optimizer, beta2_scheduler, lr_scheduler)."
+msgstr ""
+
+#: ../../source/initialize.rst:99
+msgid "数据加载器初始化"
+msgstr "Dataloader Initialization"
+
+#: internlm.train.training_internlm.get_train_data_loader:1 of
+msgid "Generate and return the training data loader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:3 of
+msgid "number of subprocesses used for dataloader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:5 of
+msgid "generate function for dataset."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:7 of
+msgid "dataset sampler for training dataloader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:9 of
+msgid "collate function for training dataloader."
+msgstr ""
+
+#: internlm.train.training_internlm.get_train_data_loader:12 of
+msgid "A tuple of (train_dl, dataset_types)."
+msgstr ""
+
+#: ../../source/initialize.rst:106
+msgid "Trainer 初始化"
+msgstr "Trainer Initialization"
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:1 of
+msgid ""
+"Core function to wrap the essential training components with our "
+"functionality based on the config which is loaded into gpc.config."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:4 of
+msgid "Your model instance or a function to build the model."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:6 of
+msgid "Your optimizer for training."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:8 of
+msgid "Your criterion instance."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:10 of
+msgid "Dataloader for training."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:12 of
+msgid "Dataloader for testing."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:14 of
+msgid "Your lr scheduler instance, optional."
+msgstr ""
+
+#: internlm.initialize.initialize_trainer.initialize_trainer:17 of
+msgid ""
+"A tuple of ``(trainer, train_dataloader, test_dataloader, lr_scheduler)``"
+" where only ``trainer`` could not be None."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/install.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/install.po
@ -0,0 +1,140 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../../install.md:2 ../../../install.md:28
+#: c237a7328df9440eb54f36c5e6ceef46 e55787faf3f74d5996f251b28422cf15
+msgid "环境安装"
+msgstr "Installation"
+
+#: ../../../install.md:4 d5cd61481eb04f55a9b1636e47e2bc49
+msgid "环境准备"
+msgstr "Environment Preparation"
+
+#: ../../../install.md:5 418763cd4acb4ff3afba059ae7066739
+msgid "首先，需要安装的依赖包及对应版本列表如下："
+msgstr "The required packages and corresponding version are shown as follows:"
+
+#: ../../../install.md:6 dcb95218036f4452a92a5a9c2fdbe337
+msgid "Python == 3.10"
+msgstr ""
+
+#: ../../../install.md:7 79e3d9ff5df7455fa596ba63ce3089b7
+msgid "GCC == 10.2.0"
+msgstr ""
+
+#: ../../../install.md:8 d14840f7b64d4a32a0be5762027e9c32
+msgid "MPFR == 4.1.0"
+msgstr ""
+
+#: ../../../install.md:9 851e3e5c874a4d0f8fd37a4f85ec8f2f
+msgid "CUDA >= 11.7"
+msgstr ""
+
+#: ../../../install.md:10 dbf2012c72e1479ba6647baa047ecc04
+msgid "Pytorch >= 1.13.1"
+msgstr ""
+
+#: ../../../install.md:11 b191e289a079455ea906694a75439b3e
+msgid "Transformers >= 4.28.0"
+msgstr ""
+
+#: ../../../install.md:12 17accf19fe184e3cb704274d8a66e87e
+msgid "Flash-Attention >= v1.0.5"
+msgstr ""
+
+#: ../../../install.md:13 8063cdce4bb94947a07dbaedd97e1013
+msgid "Apex == 23.05"
+msgstr ""
+
+#: ../../../install.md:14 7d6d2682ed214d0cba0048903c128bce
+msgid "Ampere或者Hopper架构的GPU (例如H100, A100)"
+msgstr "GPU with Ampere or Hopper architecture (such as H100, A100)"
+
+#: ../../../install.md:15 91039fb42b94421586c558a2afcbed71
+msgid "Linux OS"
+msgstr ""
+
+#: ../../../install.md:17 694b95a146d54878a4a5d57e0c1e8c6c
+msgid "以上依赖包安装完成后，需要更新配置系统环境变量："
+msgstr "After installing the above dependencies, some system environment variables need to be updated:"
+
+#: ../../../install.md:29 d0ebf84438dc43708ea517c7eff92e79
+msgid "将项目`internlm`及其依赖子模块，从 github 仓库中 clone 下来，命令如下："
+msgstr "Clone the project `internlm` and its dependent submodules from the github repository, as follows:"
+
+#: ../../../install.md:34 c278177fc1974f3fac9b33688d0591fd
+msgid "推荐使用 conda 构建一个 Python-3.10 的虚拟环境， 并基于`requirements/`文件安装项目所需的依赖包："
+msgstr "It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:"
+
+#: ../../../install.md:43 6a152c8e332f47b0ba35a9bcec2ed32d
+msgid "安装 flash-attention (version v1.0.5)："
+msgstr "Install flash-attention (version v1.0.5):"
+
+#: ../../../install.md:55 d7b2116e6ca745ceb48a792fae371283
+msgid "安装 Apex (version 23.05)："
+msgstr "Install Apex (version 23.05):"
+
+#: ../../../install.md:62 8bcbfb9f74de4a2796212a339feb8283
+msgid "环境镜像"
+msgstr "Environment Image"
+
+#: ../../../install.md:63 6cbb97568d704cf19e7dabab20ce1d5b
+msgid ""
+"用户可以使用提供的 dockerfile 结合 docker.Makefile 来构建自己的镜像，或者也可以从 "
+"https://hub.docker.com/r/internlm/internlm 获取安装了 InternLM 运行环境的镜像。"
+msgstr "Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm."
+
+#: ../../../install.md:65 9c29ae2ac9984a8094daf52751f5c7b9
+msgid "镜像配置及构造"
+msgstr "Image Configuration and Build"
+
+#: ../../../install.md:66 12bd6b0729464cb5af663a384dadd0ec
+msgid ""
+"dockerfile 的配置以及构造均通过 docker.Makefile 文件实现，在 InternLM 根目录下执行如下命令即可 build "
+"镜像："
+msgstr "The configuration and build of the Dockerfile are implemented through the docker.Makefile. To build the image, execute the following command in the root directory of InternLM:"
+
+#: ../../../install.md:70 b5f42dbca3e340c4bb80de1f502e0700
+msgid ""
+"在 docker.Makefile 中可自定义基础镜像，环境版本等内容，对应参数可直接通过命令行传递。对于 BASE_OS 分别支持 "
+"ubuntu20.04 和 centos7。"
+msgstr "In docker.Makefile, you can customize the basic image, environment version, etc., and the corresponding parameters can be passed directly through the command line. For BASE_OS, ubuntu20.04 and centos7 are respectively supported."
+
+#: ../../../install.md:72 4abb47ce9cf64b3c9b8dc23ace37a826
+msgid "镜像拉取"
+msgstr "Pull Standard Image"
+
+#: ../../../install.md:73 1b6e61b2e0cb4da98f5d70d67ac638f9
+msgid "基于 ubuntu 和 centos 的标准镜像已经 build 完成也可直接拉取使用："
+msgstr "The standard image based on ubuntu and centos has been built and can be directly pulled:"
+
+#: ../../../install.md:82 2bd75cc4b74848c19775e2b1c83726c1
+msgid "容器启动"
+msgstr "Run Container"
+
+#: ../../../install.md:83 4bb2dd4bba904255a204776a50721159
+msgid "对于使用 dockerfile 构建或拉取的本地标准镜像，使用如下命令启动并进入容器："
+msgstr "For the local standard image built with dockerfile or pulled, use the following command to run and enter the container:"
+
+#: ../../../install.md:87 66613606256e4094a6be5ab2af1269ae
+msgid "容器内默认目录即 `/InternLM`，根据[使用文档](./usage.md)即可启动训练。"
+msgstr "The default directory in the container is `/InternLM`, please start training according to the [Usage](./usage.md)."
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/monitor.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/monitor.po
@ -0,0 +1,198 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/monitor.rst:2 f95ef3bff8574c77a28ca2f6212cc4b8
+msgid "监控和告警"
+msgstr "Monitor and Alert"
+
+#: ../../source/monitor.rst:5 959bd4a6061f4483875c7950ab4546cf
+msgid "监控"
+msgstr "Monitoring"
+
+#: ../../source/monitor.rst:7 6071bc878d894865b73380cb887847c1
+msgid ""
+"InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` "
+"来初始化上下文监控管理。其中，一个实例化的单例对象 ``internlm.monitor.monitor.MonitorManager`` "
+"将管理监控线程并使用 ``internlm.monitor.monitor.MonitorTracker`` 来跟踪模型训练生命周期和训练状态。"
+msgstr ""
+"InternLM uses ``internlm.monitor.monitor.initialize_monitor_manager()`` to initialize context monitor. During this time, "
+"a singleton ``internlm.monitor.monitor.MonitorManager`` will manage monitoring thread and track training status "
+"with ``internlm.monitor.monitor.MonitorTracker``."
+
+#: 9256a063b6dd449786f29e03ce085176
+#: internlm.monitor.monitor.initialize_monitor_manager:1 of
+msgid ""
+"Initialize monitor manager for monitoring training lifetime and alerting "
+"exception info to Feishu."
+msgstr ""
+
+#: 138340fca72a4226be901f7f16c8a590 904b7938fdea46bf81c1ef738aa7bfae
+#: 9ed2a7b4af2243b289e72b2751aec902 aa0dd0dc6bee4a5bb15cc9705f7c13ee
+#: internlm.monitor.alert.send_feishu_msg_with_webhook
+#: internlm.monitor.monitor.MonitorManager.start_monitor
+#: internlm.monitor.monitor.MonitorTracker
+#: internlm.monitor.monitor.initialize_monitor_manager of
+msgid "参数"
+msgstr ""
+
+#: 3b302339e1d143b6b1d782ff59c9396d 6a06f053828b4c80aef56970750e2085
+#: internlm.monitor.monitor.MonitorManager.start_monitor:3
+#: internlm.monitor.monitor.initialize_monitor_manager:3 of
+msgid "The training job name."
+msgstr ""
+
+#: 3330d06145ee4d35b0b3632e799a35b3 c105473f2f6a4f838a9f0d098762d698
+#: internlm.monitor.monitor.MonitorManager.start_monitor:5
+#: internlm.monitor.monitor.initialize_monitor_manager:5 of
+msgid "The Feishu webhook address for sending alert messages."
+msgstr ""
+
+#: 774c6ff82a2e452295a1a7dcabaded3d internlm.monitor.monitor.MonitorManager:1
+#: of
+msgid ""
+"Monitor Manager for managing monitor thread and monitoring training "
+"status."
+msgstr ""
+
+#: 72e696c0ce8f41ea8c7947d35cf322f0
+#: internlm.monitor.monitor.MonitorManager.monitor_loss_spike:1 of
+msgid "Check loss value, if loss spike occurs, send alert message to Feishu."
+msgstr ""
+
+#: 2b668b057fa84e8b92c65bfd49bfb3e9
+#: internlm.monitor.monitor.MonitorManager.monitor_exception:1 of
+msgid "Catch and format exception information, send alert message to Feishu."
+msgstr ""
+
+#: 9852b7143026476d89e1a175223e6d79
+#: internlm.monitor.monitor.MonitorManager.handle_sigterm:1 of
+msgid "Catch SIGTERM signal, and send alert message to Feishu."
+msgstr ""
+
+#: 2e3827bad7b1445fb0d9a7c5a28def5d
+#: internlm.monitor.monitor.MonitorManager.start_monitor:1 of
+msgid ""
+"Initialize and start monitor thread for checking training job status, "
+"loss spike and so on."
+msgstr ""
+
+#: 271cc3e1b0834a7ba6a1ba4d5cce0ef1
+#: internlm.monitor.monitor.MonitorManager.start_monitor:7 of
+msgid "The time of monitor interval in seconds, defaults to 300."
+msgstr ""
+
+#: e4a06091fce8401b83e31ce26c8075a0
+#: internlm.monitor.monitor.MonitorManager.start_monitor:9 of
+msgid ""
+"The limit multiple of current loss to previous loss value, which means "
+"loss spike may be occurs, defaults to 1.5."
+msgstr ""
+
+#: 28bde748477e41f39fa6ca3e1855923d
+#: internlm.monitor.monitor.MonitorManager.stop_monitor:1 of
+msgid "Stop the monitor and alert thread."
+msgstr ""
+
+#: ffb3dda227664748bdb326b6630bc827 internlm.monitor.monitor.MonitorTracker:1
+#: of
+msgid "Track job status and alert to Feishu during job training."
+msgstr ""
+
+#: a1e93683cbb04d8ab825e2776e76efa7 internlm.monitor.monitor.MonitorTracker:3
+#: of
+msgid "The Feishu webhook address for sending alerting messages."
+msgstr ""
+
+#: 7913eeecc0904c128046e80cec1553f2 internlm.monitor.monitor.MonitorTracker:5
+#: of
+msgid "The interval in seconds for monitoring checks. Defaults to 300."
+msgstr ""
+
+#: 8d1abc3067584866983139dd3d85c59c internlm.monitor.monitor.MonitorTracker:7
+#: of
+msgid "The threshold for detecting loss value spikes. Defaults to 1.5."
+msgstr ""
+
+#: a0416fd68700450793daa2167f776618
+#: internlm.monitor.monitor.MonitorTracker.run:1 of
+msgid "start the monitor tracker."
+msgstr ""
+
+#: f55eb990c07b4e8f9388236dd60f0017
+#: internlm.monitor.monitor.MonitorTracker.stop:1 of
+msgid "Stop the monitor tracker."
+msgstr ""
+
+#: ../../source/monitor.rst:18 2202bc091aab417097a1b0268dfe6785
+msgid "告警"
+msgstr "Alerting"
+
+#: ../../source/monitor.rst:20 69334f83e644455aa619dde70b8ed1f2
+msgid ""
+"InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等，并捕获 "
+"SIGTERM 异常信号。当出现上述情况时，将触发警报，并通过调用 "
+"``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook "
+"地址发送报警消息。"
+msgstr ""
+"The InternLM monitor thread periodically checks for loss spikes, potential training stalls, and runtime exceptions, and catches the SIGTERM signal. "
+"When any of the above situations occurs, an alert will be triggered and a message will be sent to the Feishu webhook address by calling "
+"``internlm.monitor.alert.send_feishu_msg_with_webhook()``."
+
+#: 15980526c2fa4ed8befa1604f271a3f1
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:1 of
+msgid "Use Feishu robot to send messages with the given webhook."
+msgstr ""
+
+#: 38e5738c2b914c8096e1a0f345e6c0b4
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:3 of
+msgid "The webhook to be used to send message."
+msgstr ""
+
+#: 4984f1a3bb0d46b48b2aad4fba8b43d9
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:5 of
+msgid "The message title."
+msgstr ""
+
+#: a9822a4cf30d4947b12f70a0efe62a5e
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:7 of
+msgid "The message body."
+msgstr ""
+
+#: 57d9ab65fe9f45c28351839fecf2f31e
+#: internlm.monitor.alert.send_feishu_msg_with_webhook of
+msgid "返回"
+msgstr ""
+
+#: 2b6ac97fd152498183a8624a9087812b
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:10 of
+msgid "The response from the request. Or catch the exception and return None."
+msgstr ""
+
+#: ec45dedf976046eb909f5b7f79a7d44c
+#: internlm.monitor.alert.send_feishu_msg_with_webhook of
+msgid "抛出"
+msgstr ""
+
+#: 4c6aeec19a6041cfbfa577b1c5a85ac1
+#: internlm.monitor.alert.send_feishu_msg_with_webhook:12 of
+msgid "An exception rasied by the HTTP post request."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/parallel.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/parallel.po
@ -0,0 +1,457 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/parallel.rst:2 28d82a05db464e35aa3ec83e36597214
+msgid "并行训练"
+msgstr "Parallel Training"
+
+#: ../../source/parallel.rst:6 f5c2eef4812640fca0aeaef62a2d85d4
+msgid ""
+"InternLM 支持张量并行、流水线并行、序列并行、数据并行和 ZeRO1.5 "
+"等并行化训练策略。在初始化分布式环境时，我们需要指定张量并行大小、流水线并行大小、数据并行大小以及 ZeRO1.5 策略。"
+msgstr ""
+"InternLM supports tensor parallel, pipeline parallel, sequence parallel, data parallel, and ZeRO1.5 "
+"to parallelize the training pipeline. When initializing the distributed environment, we need to specify "
+"tensor parallel size, pipeline parallel size, data parallel size, and ZeRO1.5 strategy."
+
+#: ../../source/parallel.rst:8 649c52696a734a0c86d3d5377193aba5
+msgid ""
+"InternLM 的并行设置由配置文件中的 ``parallel`` 字段指定，用户可以通过修改配置文件 `config file "
+"<https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_ "
+"来更改并行配置。以下是一个并行训练配置示例："
+msgstr ""
+"The parallel setting of InternLM is specified by the ``parallel`` field in the config file, and you can change the parallelism by modifying the "
+"`config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_. An example parallel "
+"training configuration is shown below:"
+
+#: ../../source/parallel.rst:19 a06ae11e51ea479b9501ada103c9d071
+msgid "zero1：zero 并行策略，分如下三种情况，默认值为 -1"
+msgstr "zero1: zero parallel strategy, divided into the following three cases, the default value is -1"
+
+#: ../../source/parallel.rst:21 08005d5cdde84057b870495d9683c7be
+msgid "当 ``zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配"
+msgstr "When ``zero1 <= 0``, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range."
+
+#: ../../source/parallel.rst:22 fe30803c0aec4b70847ac40b68641e05
+msgid "当 ``zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数"
+msgstr "When ``zero1 == 1``, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters."
+
+#: ../../source/parallel.rst:23 e0acea7d80094e018fab75404ec25163
+msgid ""
+"当 ``zero1 > 1`` 且 ``zero1 <= data_parallel_world_size``，则 zero1 "
+"进程组是数据并行进程组的子集"
+msgstr "When ``zero1 > 1`` and ``zero1 <= data_parallel_world_size``, the zero1 process group is a subset of the data parallel process group."
+
+#: ../../source/parallel.rst:25 17bba79e2e884993a602df9cf20d2489
+msgid "tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1"
+msgstr "tensor: tensor parallel size, usually the number of GPUs per node, the default value is 1"
+
+#: ../../source/parallel.rst:26 3bda721a03a144f28f33d360a87cbf83
+msgid "pipeline：流水线并行策略"
+msgstr "pipeline: pipeline parallel strategy"
+
+#: ../../source/parallel.rst:28 2b10f2b57ef64fcc872d036a7ad82b03
+msgid "size：流水线并行大小，默认值为 1"
+msgstr "size: pipeline parallel size, the default value is 1"
+
+#: ../../source/parallel.rst:29 49c8a409e60244c49514a27780ae39a3
+msgid "interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为 False"
+msgstr "interleaved_overlap: bool type, enables or disables communication optimization when using interleaved scheduling, the default value is False"
+
+#: ../../source/parallel.rst:31 e4ff81960c434b78847174787f0423e2
+msgid "sequence_parallel：是否开启序列化并行，默认值为 False"
+msgstr "sequence_parallel: whether to enable sequence parallelism, the default value is False"
+
+#: ../../source/parallel.rst:33 a24f4bc81fea48619ae2720e0cb6a392
+msgid "注意：数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小"
+msgstr "Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`"
+
+#: ../../source/parallel.rst:36 a93fc45f855c4ca7901ccbe23bf14edc
+msgid "张量并行"
+msgstr "Tensor Parallel"
+
+#: ../../source/parallel.rst:38 cce9e8f3c8f14c1c96c63273baceb164
+msgid ""
+"InternLM 的张量并行实现方案基于 `flash attention <https://github.com/Dao-AILab"
+"/flash-attention>`_, 主要对 `attention "
+"<https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_"
+" 和 `linear "
+"<https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_"
+" 这两个模块进行张量并行操作。"
+msgstr ""
+"The implementation of tensor parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_, "
+"which has tensor parallel extensions to parallelize `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_ "
+"and `linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ blocks in the InternLM model."
+
+#: ../../source/parallel.rst:41 f98a4b36ffdf4381a03899b605346be6
+msgid "用户可通过配置文件中的 ``parallel.tensor`` 字段来设置张量并行大小。"
+msgstr "To use tensor parallel, you need to set the value of tensor parallel size ``parallel.tensor`` in the config file, which is usually the number of GPUs per node."
+
+#: ../../source/parallel.rst:47 956804e7cde441989212f7eb505e8815
+msgid "张量并行，采用自 `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_"
+msgstr "Tensor parallel, adopted from `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_"
+
+#: ../../source/parallel.rst:50 a6424fd0ff0246fcadf56436260fadb6
+msgid "流水线并行"
+msgstr "Pipeline Parallel"
+
+#: ../../source/parallel.rst:52 f2c163418fed432a8f3f59f1a5229e88
+msgid ""
+"InternLM 在流水线并行中使用 `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ "
+"（1F1B，一次前向传递后跟一次反向传递）策略。对于 1F1B 策略，有两种实现方式："
+msgstr "InternLM uses `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ (one forward pass followed by one backward pass) for pipeline parallel. For 1F1B strategy, there are two implementations:"
+
+#: ../../source/parallel.rst:54 43f3b988e2924fe9968b9d049b46ffa0
+msgid "非交错调度器，内存高效。"
+msgstr "non-interleaved scheduler, which is memory-efficient"
+
+#: ../../source/parallel.rst:55 7a45446082c441d48d49b6be661ea8d2
+msgid "交错调度器，内存高效且时间高效（GPU空泡较少）。"
+msgstr "interleaved scheduler, which is both memory-efficient and time-efficient (fewer GPU bubbles)."
+
+#: ../../source/parallel.rst:61 92f2a168d7794811b56f9bb3bc170982
+msgid "1F1B 流水线并行调度器，采用自 `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_"
+msgstr "Non-interleaved and interleaved scheduler for 1F1B pipeline parallelism, adopted from `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_"
+
+#: ../../source/parallel.rst:64 a6d3df0b74b14b158a04ddda3e904004
+msgid "非交错式流水线调度"
+msgstr "scheduler for non-interleaved 1F1B strategy"
+
+#: ../../source/parallel.rst:65 1fa48743f39a44a29d78fb7f9eed5a52
+msgid "如果要使用非交错式调度, 需要设置 ``model.num_chunks = 1``。"
+msgstr "To use non-interleaved pipeline scheduler, users need to set ``model.num_chunks = 1`` in the config file."
+
+#: 57206dc0bc734686841c363c88839708
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:1 of
+msgid ""
+"A helper schedule class for pipeline parallelism running environment. It "
+"uses non-interleaved 1F1B strategy. Other properties are similar as "
+":class:`NonPipelineSchedule`."
+msgstr ""
+
+#: 6475fee6f3cd462ba1073a641b322e12 7060a021efb0459598f49f74e8e7185b
+#: 9218fee47e5542cab88ac65ff0054068 d1be8d5479fb48f59be379548ee24bd9
+#: d41da940b4a84cd0822c3f94c2eaf344 f5654fe6eacc49dba5baa1d058df5d29
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad of
+msgid "参数"
+msgstr ""
+
+#: 567e2a87a45245469af9f8709e020a20
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:5 of
+msgid "The number of microbatches."
+msgstr ""
+
+#: 6d3b2256ea9c4897bf72f551f8b4696b
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:7 of
+msgid "Type of data. torch.float by default."
+msgstr ""
+
+#: 6e36198f5ed344f7ad02f56aec9a333c
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:9 of
+msgid ""
+"The post processing function which receives a micro batch of data, and it"
+" will be executed in `load_micro_batch`."
+msgstr ""
+
+#: ffae9611bd854615af1ced927f72c556
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:12 of
+msgid "Specified shape in pipeline communication."
+msgstr ""
+
+#: 31d45af550334cb8a94142da335b9724
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:14 of
+msgid ""
+"If set to `True`, communication will be reduced over pipeline when using "
+"1D tensor parallelization."
+msgstr ""
+
+#: 5c852dc7866f4e50ab87c15b86d338f2
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler:16 of
+msgid "List of scheduler hooks."
+msgstr ""
+
+#: 4ebec38a972b4c31a59f1fc824d51f62
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing:1
+#: of
+msgid "To perform actions before running the schedule."
+msgstr ""
+
+#: d491d0dfa1bf41708150cc57567ac0f0
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.pre_processing:3
+#: of
+msgid "InternLM engine for training and inference."
+msgstr ""
+
+#: bc5dc62440b94825b192ad2e28641976
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:1
+#: of
+msgid ""
+"Runs non-interleaved 1F1B schedule, with communication between pipeline "
+"stages. Returns a tuple with losses if the last stage, an empty tuple "
+"otherwise."
+msgstr ""
+
+#: 765809e448b644678a9fb822f6427a94 99c948f562e343aabdecac2d43650f59
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:4
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:4
+#: of
+msgid "Colossalai engine for training and inference."
+msgstr ""
+
+#: 31af7a46c5a645628bea05ad35757dcf 4ea88ec52c5b4df79a57ab2d217de697
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:6
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:6
+#: of
+msgid ""
+"Dataloader as the form of an iterator, obtained by calling "
+"iter(dataloader)."
+msgstr ""
+
+#: 2deff747718449fabc5b47a1de0be52e e0d2e154ac134da28470924aa65342a1
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:8
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:8
+#: of
+msgid ""
+"Whether run forward step only. Default is false. If true, no backward "
+"will be run."
+msgstr ""
+
+#: 71aa2b45248c4af28525dbc1ba4a1aff d3b3c1e350334dd2a16cbb2e8c8d339a
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:10
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:10
+#: of
+msgid "Whether returns the loss value. Default is true."
+msgstr ""
+
+#: 2021eaca687148539b03f6b0b1c118c8 5c138015fb254eccae2f0df2dab45629
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:12
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:12
+#: of
+msgid "If False, the output and label won't be returned."
+msgstr ""
+
+#: 57a86115b88541b1a7220d9535058607 5dabcd12b6d844aab8039b022ad0cf1c
+#: b8ccfee837a242a3abbdf9e15eaa53d8
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step of
+msgid "返回"
+msgstr ""
+
+#: 7dc47f5518e64d1095a6051184985f17 fe678c953e8149a5ade387e95d10d3b2
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:17
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:15
+#: of
+msgid "A tuple of (output, label, loss), loss and label could be None."
+msgstr ""
+
+#: a50c7c3d40e14ba8a5af06aa0cb031cb ea3574b76d604402a41fcd3874d05c9a
+#: fa12b183c7534a20b61445eb9f2a2a7a
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step of
+msgid "返回类型"
+msgstr ""
+
+#: 82936eed6da5408c9361732f8fd5cb93 c46a28c21ca149d98ff625b7fdad4c03
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:19
+#: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler.forward_backward_step:16
+#: of
+msgid "Tuple[:class:`torch.Tensor`]"
+msgstr ""
+
+#: ../../source/parallel.rst:71 d2bfdbbd9a7641c38e6957a72ac6bc97
+msgid "交错式流水线调度"
+msgstr "scheduler for interleaved 1F1B strategy"
+
+#: ../../source/parallel.rst:72 395c484fef984a65a284147dc3056241
+msgid "如果要使用交错式调度, 需要设置 ``model.num_chunks > 1``。"
+msgstr "To use interleaved pipeline scheduler, users need to set ``model.num_chunks > 1`` in the config file."
+
+#: 036fffe3aacc4400af38ce5252840a50
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler:1 of
+msgid "Interleaved Pipeline Scheduler."
+msgstr ""
+
+#: 1b6e63b4004e44999e3ad38382b4e308
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:1
+#: of
+msgid ""
+"Run interleaved 1F1B schedule (model split into model chunks), with "
+"communication between pipeline stages as needed."
+msgstr ""
+
+#: 6ece1dfcdb5e408db4870d6c0f524787
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:15
+#: of
+msgid ""
+"A tuple of (output, label, loss), loss and label could be None.     The "
+"loss would be returned only in the last stage."
+msgstr ""
+
+#: ed7e5a4826f84e9eb2840e494761437f
+#: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler.forward_backward_step:18
+#: of
+msgid "The loss would be returned only in the last stage."
+msgstr ""
+
+#: ../../source/parallel.rst:77 1b771fea1d434f0b8b118f1b5344dde4
+msgid "值得注意的是，在使用交错式流水线调度器时可启用通信优化功能，即在 1F1B 阶段启用异步通信，以充分利用上行/下行带宽并实现通信与计算重叠。"
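+msgstr "It is worth noting that communication optimization can be enabled when using the interleaved pipeline scheduler: asynchronous communication is enabled in the 1F1B stage to make full use of uplink/downlink bandwidth and to overlap communication with computation."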
+
+#: ../../source/parallel.rst:79 27430e179b454d48a052b9fe6e11ecae
+msgid ""
+"用户需要在配置文件中设置 ``parallel.pipeline.interleaved_overlap = "
+"True``。该功能启用后，将调用函数 "
+"``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap``，并创建 "
+"``internlm.core.communication.AsynCommunicator`` 以管理异步通信。"
+msgstr ""
+"When ``parallel.pipeline.interleaved_overlap = True``, function ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap`` will be called and "
+"``internlm.core.communication.AsynCommunicator`` will be created for managing async communication."
+
+#: ../../source/parallel.rst:81 4e0b6269ca48430098ed4619d0f0f22f
+msgid "``1F1B-without-overlap`` 和 ``1F1B-with-overlap`` 的区别如下所示："
+msgstr "The difference between 1F1B stage without overlap and 1F1B stage with overlap is shown as follows:"
+
+#: ../../source/parallel.rst:102 8412b1f6f51c479d9cbb281763215327
+msgid "序列并行"
+msgstr "Sequence Parallel"
+
+#: ../../source/parallel.rst:104 45aea8164dd244e5a730881c693eeecf
+msgid ""
+"序列并行是一种在不引入额外计算、通信和内存开销的情况下，减少层 ``layer_norm`` 和 ``dropout`` "
+"操作中的激活值内存。InternLM 中的序列并行实现基于 `flash attention <https://github.com/Dao-"
+"AILab/flash-attention>`_。这个并行策略有助于降低模型的内存消耗，提高了模型在资源受限环境中的可扩展性。"
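+msgstr ""
+"Sequence parallel is a technique to reduce activation memory in ``layer_norm`` and ``dropout`` operations without additional computation, "
+"communication or memory overhead. The implementation of sequence parallel for InternLM is based on `flash attention <https://github.com/Dao-AILab/flash-attention>`_. "
+"This parallel strategy helps reduce the memory consumption of the model and improves its scalability in resource-constrained environments."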
+
+#: ../../source/parallel.rst:106 29836b441ee84df6a6dbe877930ba911
+msgid "如果要启用序列并行, 用户需要设置 ``parallel.sequence_parallel = True``。"
+msgstr "To enable sequence parallel, you need to set ``parallel.sequence_parallel = True`` in the config file."
+
+#: ../../source/parallel.rst:112 eadcd6e77c2547998b4e132939a15856
+msgid "序列并行, 采用自 flash-attention"
+msgstr "Sequence parallel, adopted from flash-attention"
+
+#: ../../source/parallel.rst:115 47a0ac84251949fab0d9d8d34efb8751
+msgid "数据并行"
+msgstr "Data Parallel"
+
+#: ../../source/parallel.rst:117 938ad5a1cbc846bab36e8d2f4804a685
+msgid "InternLM 支持数据并行。数据并行大小为:"
+msgstr "InternLM supports data parallel. The data parallel size is:"
+
+#: ../../source/parallel.rst:119 1e8691a5ff4a4b40ae24815c681f7306
+msgid ""
+"`Data parallel size = Total number of GPUs / Pipeline parallel size / "
+"Tensor parallel size`"
+msgstr ""
+
+#: ../../source/parallel.rst:122 c417e2af4e8e45ca8ca18ad39e96dadd
+msgid "ZeRO1.5"
+msgstr ""
+
+#: ../../source/parallel.rst:124 9c05b4baf8a04e4b8a0f204c4e30cc9c
+msgid ""
+"ZeRO1.5 的实现使用了分层分片的概念，通过配置值 ``parallel.zero1`` "
+"启用了本地节点内的分片。这个方法有助于有效管理和分配模型参数和梯度，以减少内存使用并提高训练效率。"
+msgstr "The implementation of ZeRO1.5 uses the concept of hierarchical sharding via config value ``parallel.zero1``, which enables sharding within local nodes."
+
+#: ../../source/parallel.rst:126 48c994fe37d54c35bbf81f4be070e151
+msgid "当 ``parallel.zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配"
+msgstr "If ``parallel.zero1 <= 0``, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range."
+
+#: ../../source/parallel.rst:127 3d31193758e24a08b1e90eae21259f71
+msgid "当 ``parallel.zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数"
+msgstr "If ``parallel.zero1 == 1``, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters."
+
+#: ../../source/parallel.rst:128 fb5c43d2ac75423cabc12ba1512df25e
+msgid ""
+"当 ``parallel.zero1 > 1`` 且 ``parallel.zero1 <= "
+"data_parallel_world_size``，则 zero1 进程组是数据并行进程组的子集"
+msgstr "If ``parallel.zero1 > 1`` and ``parallel.zero1 <= data_parallel_world_size``, the zero1 process group is a subset of the data parallel process group. For smaller models, it is usually a better choice to split the parameters within nodes with a setting ``parallel.zero1 <= 8``."
+
+#: ../../source/parallel.rst:130 47f03cea956a4477854591363359cdb3
+msgid ""
+"此外，用户可以在配置文件中通过 ``hybrid_zero_optimizer`` "
+"字段启用优化器的通信优化功能，设置桶大小，以及梯度剪裁等参数。这些设置有助于优化训练过程中的通信和计算效率，以及梯度的处理方式。"
+msgstr "Furthermore, you can enable communication-computation overlap and set the bucket reduce size and gradient clipping parameters via the ``hybrid_zero_optimizer`` field in the config file."
+
+#: ../../source/parallel.rst:144 dfc63103d4e341ccb7df8ef031e29f4e
+msgid "这里有两个值得关注的通信优化点："
+msgstr "There are two communication optimizations worth paying attention to here:"
+
+#: ../../source/parallel.rst:146 e4815f887d8f48368be01339b5e64d18
+msgid ""
+"overlap_sync_grad: 如果设置为 ``True``，则将训练的 ``backward pass`` 与梯度的 ``all-"
+"reduce`` 通信重叠"
+msgstr "overlap_sync_grad: If set to ``True``, the training ``backward pass`` is overlapped with the ``all-reduce`` communication of gradients."
+
+#: ../../source/parallel.rst:147 bcb1aedd8a89441488b211cd81d4f80c
+msgid ""
+"overlap_sync_param: 如果设置为 ``True``，则将参数的 ``broadcast`` 通信与下一步的 ``forward "
+"pass`` 进行重叠"
+msgstr "overlap_sync_param: If set to ``True``, the ``broadcast`` communication of parameters is overlapped with the next step's ``forward pass``."
+
+#: ../../source/parallel.rst:149 3ba64e4762084e93ba62a70c909e7d82
+msgid "这些优化可以加速训练过程，提高训练效率。"
+msgstr "These optimizations can speed up the training process and improve training efficiency."
+
+#: 757dad6b9916403c83042b49eaa35ae5
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer:1 of
+msgid "Hybrid Zero Optimizer."
+msgstr ""
+
+#: 83bcd49c056446f6806a55e6138579f2
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad:1
+#: of
+msgid ""
+"Set parameter gradients to zero. If set_to_none = True, gradient will be "
+"set to None to save memory."
+msgstr ""
+
+#: 2d3da89d360c458f80844f9caed6c316
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.zero_grad:4
+#: of
+msgid "Whether set the gradient to None. Default value is True."
+msgstr ""
+
+#: 4164523156dc460cbbeaa17feed3c689
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:1 of
+msgid "Performs a single optimization step."
+msgstr ""
+
+#: 5c68dace1ec649bfa849b6652051daac
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:3 of
+msgid "A closure that reevaluates the model and returns the loss."
+msgstr ""
+
+#: 91e366d604ce48afa6b92666ece87b85
+#: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer.step:7 of
+msgid "Whether the gradient is success updated, and the gradient."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/profiler.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/profiler.po
@ -0,0 +1,175 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-14 11:05+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/profiler.rst:2
+msgid "性能分析"
+msgstr "Profiler"
+
+#: ../../source/profiler.rst:7
+msgid "Torch Profiler"
+msgstr ""
+
+#: ../../source/profiler.rst:9
+msgid ""
+"InternLM 使用 ``internlm.train.initialize_llm_profile()`` "
+"来收集和分析模型训练或推理期间的性能数据，如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler "
+"<https://pytorch.org/docs/stable/profiler.html>`_ ，输出的性能分析 trace 文件可以使用 "
+"`tensorboard <https://www.tensorflow.org/tensorboard?hl=en>`_ 进行可视化。"
+msgstr ""
+"InternLM uses ``internlm.train.initialize_llm_profile()`` to collect and "
+"analyze performance data, such as CPU/CUDA/memory usage, during model "
+"training or inference. The implementation is based on `torch.profiler "
+"<https://pytorch.org/docs/stable/profiler.html>`_ and the output trace "
+"files can be visualized with `tensorboard <https://www.tensorflow.org/tensorboard?hl=en>`_."
+
+#: ../../source/profiler.rst:11
+msgid ""
+"用户如果想使用这个 torch 性能分析工具，需要在启动训练时传递 ``--profiling`` 参数以启用性能分析。完成 torch "
+"性能分析后，用户可以在 ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` "
+"文件夹中看到性能分析结果。"
+msgstr ""
+"To use this torch profiler tool, you need to enable profiling by passing "
+"the ``--profiling`` flag when starting training. After torch profiling is"
+" completed, you can find the profiling results in the "
+"``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` folder."
+
+#: ../../source/profiler.rst:13
+msgid "实际运行生成的 ``Torch Profiler`` 目录结构如下："
+msgstr ""
+"The directory structure of ``Torch Profiler`` generated files is as "
+"follows:"
+
+#: ../../source/profiler.rst:22
+msgid "其中， ``traces`` 可以通过 ``TensorBoard`` 可视化，运行命令"
+msgstr ""
+"Among them, ``traces`` can be visualized with ``TensorBoard`` by running "
+"the command"
+
+#: ../../source/profiler.rst:29
+msgid ""
+"在打开的 ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` "
+"页面可以看到Operator和GPU Kernel的性能分析时间线如下，更多的功能请参考 `torch profiler with "
+"tensorboard "
+"<https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html"
+"#pytorch-profiler-with-tensorboard>`_"
+msgstr ""
+"In the opened ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` page,"
+" you can see the timeline of profiled operators and GPU kernels. For more"
+" usage, please refer to `torch profiler with tensorboard "
+"<https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html"
+"#pytorch-profiler-with-tensorboard>`_"
+
+#: internlm.train.training_internlm.initialize_llm_profile:1 of
+msgid "Initialize and return the profiler context manager instance."
+msgstr ""
+
+#: ../../source/profiler.rst:38
+msgid "Memory Profiler"
+msgstr ""
+
+#: ../../source/profiler.rst:40
+msgid ""
+"InternLM 提供了一个实用的内存分析工具 "
+"``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` 来监控实际的 GPU"
+" 内存使用情况。在实现中，会对模型数据（包括模型参数、模型梯度和优化器状态）和非模型数据（包括激活值）分别进行详细的统计。"
+msgstr ""
+"InternLM provides a practical memory profiling tool "
+"``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` to monitor"
+" actual GPU memory usage. In the implementation, detailed statistics are "
+"collected separately for model data (including model parameters, model "
+"gradients, and optimizer states) and non-model data (including activations)."
+
+#: ../../source/profiler.rst:42
+msgid ""
+"要使用这个内存分析工具，用户需要在启动训练时传递 ``--profiling`` 参数以启用内存分析。完成内存分析后，用户可以在 "
+"``memory_trace/rank{}_dp{}_tp{}`` 文件夹中找到特定 rank "
+"对应的内存分析结果（包括不同时间点的内存使用日志和显示总体内存使用情况的太阳图表）。"
+msgstr ""
+"To use this memory profiler tool, you need to enable profiling by passing"
+" the ``--profiling`` flag when starting training. After memory profiling "
+"is completed, you can find the profiling results (including logs of "
+"memory usage at different time points and sunburst charts showing overall "
+"memory usage) for a specific rank device in the "
+"``memory_trace/rank{}_dp{}_tp{}`` folder."
+
+#: ../../source/profiler.rst:44
+msgid "实际运行生成的 ``memory_trace`` 目录结构如下："
+msgstr "The directory structure of ``memory_trace`` generated files is as follows:"
+
+#: ../../source/profiler.rst:107
+msgid "其中， ``memory.log`` 的内容示例如下："
+msgstr "An example of ``memory.log`` is as follows:"
+
+#: ../../source/profiler.rst:157
+msgid "模型参数的太阳图示例如下："
+msgstr "An example of model parameters sunburst chart is as follows:"
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:1 of
+msgid "A memory profiler for a llm model."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point of
+msgid "参数"
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:3 of
+msgid "The model to profile."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:5 of
+msgid "The optimizer used for training the model."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:7 of
+msgid "The file to write the memory state information to."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler:9 of
+msgid "number of steps to trace."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:1 of
+msgid "Record the memory state."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:3 of
+msgid "The options to include in the memory state. Defaults to \"\"."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:5 of
+msgid "Whether to create a new memory record file. Defaults to False."
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step of
+msgid "返回"
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.point:8
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step:3 of
+msgid "None"
+msgstr ""
+
+#: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler.step:1 of
+msgid "Update the memory state of the optimizer state."
+msgstr ""
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/qa.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/qa.po
@ -0,0 +1,25 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-07 10:56+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/qa.rst:2 e3b22a39640a40cfb527068a7f4bbfc9
+msgid "问&答"
+msgstr "Q&A"
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/training.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/training.po
@ -0,0 +1,162 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-14 12:23+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../source/training.rst:2
+msgid "训练 API"
+msgstr "Training API"
+
+#: ../../source/training.rst:4
+msgid ""
+"InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` "
+"管理。在定义了训练引擎和调度器之后，我们可以调用 Trainer API 来执行模型训练、评估、梯度清零和参数更新等。"
+msgstr ""
+"InternLM training API is managed in ``internlm.core.trainer.Trainer``. "
+"After defining the training engine and runtime scheduler, we can call "
+"training API to perform training, evaluation, zero gradients and "
+"parameter update steps."
+
+#: ../../source/training.rst:6
+msgid "有关详细用法，请参阅 Trainer API 文档和示例。"
+msgstr ""
+"For detailed usage, please refer to Trainer API documentation and "
+"examples."
+
+#: internlm.core.trainer.Trainer:1 of
+msgid ""
+"This is a class tending for easy deployments of users' training and "
+"evaluation instead of writing their own scripts."
+msgstr ""
+
+#: internlm.core.trainer.Trainer internlm.core.trainer.Trainer.execute_schedule
+#: of
+msgid "参数"
+msgstr ""
+
+#: internlm.core.trainer.Trainer:4 of
+msgid "Engine responsible for the process function."
+msgstr ""
+
+#: internlm.core.trainer.Trainer:6 of
+msgid "Runtime schedule. Defaults to None."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.engine:1 of
+msgid ""
+"Returns the engine that responsible for managing the training and "
+"evaluation process."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.schedule:1 of
+msgid "Returns the runtime scheduler."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.uses_pipeline:1 of
+msgid "Returns whether the pipeline parallel is used or not."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.train:1 of
+msgid "Sets the model to training mode."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.eval:1 of
+msgid "Sets the model to evaluation mode."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.zero_grad:1 of
+msgid "Sets the gradient of all parameters in the model to zero."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.step:1 of
+msgid "Executes the parameter update step."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.execute_schedule:1 of
+msgid ""
+"Runs the forward, loss computation, and backward for the model. Returns a"
+" tuple of (output, label, loss)."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.execute_schedule:4 of
+msgid "The data iterator."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.execute_schedule:6 of
+msgid "Additional keyword arguments."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.execute_schedule of
+msgid "返回"
+msgstr ""
+
+#: internlm.core.trainer.Trainer.execute_schedule:8 of
+msgid "A tuple of (output, label, loss)."
+msgstr ""
+
+#: internlm.core.trainer.Trainer.execute_schedule of
+msgid "返回类型"
+msgstr ""
+
+#: internlm.core.trainer.Trainer.execute_schedule:9 of
+msgid "Tuple[:class:`torch.Tensor`]"
+msgstr ""
+
+#~ msgid "InternLM 的训练流程可以归纳为两个步骤："
+#~ msgstr "The training process of InternLM can be summarized into two steps: "
+
+#~ msgid "初始化"
+#~ msgstr "Initialization"
+
+#~ msgid "初始化模型、优化器、数据加载器、Trainer，生成不同种类的进程组，为混合并行的迭代训练做准备。"
+#~ msgstr ""
+#~ "Initialize model, optimizer, dataloader, "
+#~ "trainer, and create different types of"
+#~ " process groups to prepare for "
+#~ "iterative steps of hybrid parallel "
+#~ "training. "
+
+#~ msgid "初始化Logger、Checkpoint管理器、Monitor管理器、Profiler，对迭代训练的过程观察、预警、记录。"
+#~ msgstr ""
+#~ "Initialize logger, checkpoint manager, monitor"
+#~ " manager, and profiler to watch, "
+#~ "alert, and record the iterative training"
+#~ " steps. "
+
+#~ msgid "迭代训练"
+#~ msgstr "Iterative training steps"
+
+#~ msgid "根据配置文件定义的张量并行、流水线并行、数据并行的大小，加载训练引擎和调度器进行混合并行训练。"
+#~ msgstr ""
+#~ "Load the training engine and scheduler"
+#~ " for hybrid parallel training according "
+#~ "to the configuration such as tensor "
+#~ "parallel size, pipeline parallel size, "
+#~ "and data parallel size. "
+
+#~ msgid "在迭代训练中，调用 Trainer API 进行梯度置零，前向传播计算损失并反向传播，参数更新。"
+#~ msgstr ""
+#~ "In iterative training steps, the Trainer"
+#~ " API is called to perform zero "
+#~ "gradients, forward-loss-backward, and "
+#~ "parameter update."
+
+#~ msgid "InternLM训练流程图"
+#~ msgstr "InternLM training process"
+
--- a/doc/code-docs/locales/en/LC_MESSAGES/usage.po
+++ b/doc/code-docs/locales/en/LC_MESSAGES/usage.po
@ -0,0 +1,367 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) 2023, InternLM Team
+# This file is distributed under the same license as the InternLM package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, 2023.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: InternLM \n"
+"Report-Msgid-Bugs-To: \n"
+"POT-Creation-Date: 2023-09-11 14:25+0800\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language: en\n"
+"Language-Team: en <LL@li.org>\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Generated-By: Babel 2.12.1\n"
+
+#: ../../../usage.md:2
+msgid "使用教程"
+msgstr "Quickstart Guide"
+
+#: ../../../usage.md:4
+msgid ""
+"启动一个 Demo "
+"模型训练，需要进行三项准备，**安装**，**数据集准备**和**模型训练配置**。接下来，首先会介绍数据准备相关的操作，再简要描述模型训练配置相关的内容。"
+msgstr ""
+"To start a demo model training, you need to prepare three things: "
+"**installation**, **dataset preparation**, and **model training "
+"configuration**. In this guide, we will first cover the steps for dataset"
+" preparation and then briefly describe the model training configuration."
+
+#: ../../../usage.md:6
+msgid "安装"
+msgstr "Installation"
+
+#: ../../../usage.md:7
+msgid "请参考[安装文档](./install.md)进行安装。"
+msgstr ""
+"Please refer to the [installation guide](./install.md) for instructions "
+"on how to install the necessary dependencies."
+
+#: ../../../usage.md:9
+msgid "数据准备 （预训练）"
+msgstr "Data Preparation (Pre-training)"
+
+#: ../../../usage.md:11
+msgid "InternLM训练任务的数据集包括一系列的`bin`和`meta`文件。使用`tokenizer`从原始文本文件生成训练用数据集。通过在`tools/tokenizer.py`中指定模型参数路径的方式来导入tokenizer模型。目前提供`V7_sft.model`来生成tokens。若想使用不同的模型，可直接修改`tokernizer.py`中的模型参数路径。"
+msgstr ""
+"The dataset for the InternLM training task includes a series of `bin` and"
+" `meta` files. A `tokenizer` is used to generate the training dataset "
+"from the original text files. The tokenizer model is imported by "
+"specifying the model parameter path in `tools/tokenizer.py`. Currently, "
+"`V7_sft.model` is provided to generate tokens. If you want to use a "
+"different model, you can directly modify the model parameter path in "
+"`tokenizer.py`."
+
+#: ../../../usage.md:13
+msgid "可以运行以下命令生成原始数据对应的`bin`和`meta`文件，其中参数`text_input_path`表示原始文本数据路径，目前支持`txt`、`json`和`jsonl`三种输入格式，`bin_output_path`表示生成的`bin`文件的保存路径。"
+msgstr ""
+"You can run the following command to generate `bin` and `meta` files "
+"corresponding to the original data. The parameter `text_input_path` "
+"represents the path of the original text data, currently supporting "
+"`txt`, `json`, and `jsonl` formats, while `bin_output_path` represents "
+"the save path of the generated `bin` files."
+
+#: ../../../usage.md:18
+msgid "下面是一个数据处理的例子："
+msgstr "Here is an example of data processing:"
+
+#: ../../../usage.md:20
+msgid "给定一个包含原始数据集的文件`raw_data.txt`，原始数据集如下所示："
+msgstr ""
+"Given a file `raw_data.txt` containing the raw dataset, the raw dataset "
+"is shown below:"
+
+#: ../../../usage.md:27
+msgid "可以通过运行以下命令来生成`bin`和`meta`文件："
+msgstr ""
+"You can generate the `bin` and `meta` files by running the following "
+"command:"
+
+#: ../../../usage.md:32
+msgid "需要注意的是，生成的`bin`文件需要保存在`cn`或者`en`或者`code`或者`ja`或者`ar`或者`kaoshi`这六个目录下，以区分数据集的类型。"
+msgstr ""
+"It should be noted that the generated `bin` files need to be saved in one"
+" of the following directories: `cn`, `en`, `code`, `ja`, `ar`, or "
+"`kaoshi`, depending on the type of dataset."
+
+#: ../../../usage.md:34
+msgid "其中，`cn`表示中文数据集；`en`表示英文数据集；`code`表示代码数据集；`ja`表示日语数据集；`ar`表示阿拉伯语数据集；`kaoshi`表示考试数据集。"
+msgstr ""
+"Here, `cn` represents the Chinese dataset, `en` represents the English "
+"dataset, `code` represents the code dataset, `ja` represents the Japanese"
+" dataset, `ar` represents the Arabic dataset, and `kaoshi` represents the"
+" exam dataset."
+
+#: ../../../usage.md:36
+msgid "生成的bin文件的格式如下："
+msgstr "The format of the generated `bin` files is as follows:"
+
+#: ../../../usage.md:42
+msgid "`bin`文件中的每一行均对应原始数据集中的每一个句子，表示每个句子的`token`（下文将用sequence指定）。"
+msgstr ""
+"Each line in the `bin` file corresponds to each sentence in the original "
+"dataset, representing the tokens of each sentence (referred to as "
+"sequence below)."
+
+#: ../../../usage.md:44
+msgid "生成的`meta`文件的格式如下："
+msgstr "The format of the generated `meta` file is as follows:"
+
+#: ../../../usage.md:48
+msgid ""
+"在`meta`文件中，每个元组对应着`bin`文件中每一个`sequence`的元信息。其中，元组的第一个元素表示每个`sequence`在所有`sequence`中的`starting"
+" index`，第二个元素表示每个`sequence`中有多少个`tokens`。"
+msgstr ""
+"Each tuple in the `meta` file represents the meta information of each "
+"`sequence`, where the first element in the tuple indicates the `starting "
+"index` of each `sequence` among all `sequences`, and the second element "
+"indicates the number of `tokens` for each `sequence`."
+
+#: ../../../usage.md:50
+msgid ""
+"例如，对于第一个`sequence`，`starting index`为 0，有 11 "
+"个`tokens`；对于第二个`sequence`，由于第一个`sequence`转换为`string`后的长度为`89`，因此它的`starting"
+" index`为 90，有 15 个`tokens`。"
+msgstr ""
+"For example, the first `sequence` starts at index 0 and has 11 `tokens`; since its `string` form has length `89`, "
+"the second `sequence` starts at index 90 and has 15 `tokens`."
+
+#: ../../../usage.md:52
+msgid "`json`和`jsonl`类型的文件的`bin`和`meta`文件格式和`txt`一致，此处不再赘叙。"
+msgstr ""
+"The `bin` and `meta` file formats for `json` and `jsonl` type files are "
+"the same as for `txt`, so we won't go over them here."
+
+#: ../../../usage.md:54
+msgid "数据准备 （微调）"
+msgstr "Data Preparation (Fine-tuning)"
+
+#: ../../../usage.md:56
+msgid ""
+"微调任务的数据集格式与预训练任务保持一致，生成的数据格式为一系列的`bin`和`meta`文件。以下以 Alpaca "
+"数据集为例，介绍微调的数据准备流程。"
+msgstr ""
+"The data format for fine-tuning tasks is the same as for pre-training "
+"tasks, which consists of a series of `bin` and `meta` files. Let's take "
+"the Alpaca dataset as an example to explain the data preparation process "
+"for fine-tuning."
+
+#: ../../../usage.md:58
+msgid ""
+"下载 [Alpaca 数据集](https://github.com/tatsu-"
+"lab/stanford_alpaca/blob/main/alpaca_data.json)"
+msgstr ""
+"Download the [Alpaca dataset](https://github.com/tatsu-"
+"lab/stanford_alpaca/blob/main/alpaca_data.json)."
+
+#: ../../../usage.md:60
+msgid "对 Alpaca 数据进行 tokenize，使用以下命令"
+msgstr "Tokenize the Alpaca dataset using the following command:"
+
+#: ../../../usage.md:66
+msgid "建议用户参考 alpaca_tokenizer.py 编写新的脚本对自己的数据集进行 tokenize"
+msgstr ""
+"It is recommended that users refer to alpaca_tokenizer.py to write new "
+"scripts to tokenize their own datasets."
+
+#: ../../../usage.md:68
+msgid "训练配置"
+msgstr "Training Configuration"
+
+#: ../../../usage.md:70
+#, fuzzy
+msgid "以 7B Demo 的配置文件`configs/7B_sft.py`为例："
+msgstr ""
+"Take the configuration file `configs/7B_sft.py` for the 7B demo as an "
+"example:"
+
+#: ../../../usage.md:237
+msgid "接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。"
+msgstr ""
+"Next, we describe in detail the data, model, parallel, and monitoring "
+"configurations required to start a model training."
+
+#: ../../../usage.md:239
+msgid "数据配置"
+msgstr "Data Configuration"
+
+#: ../../../usage.md:240
+msgid "数据相关的关键参数配置及释义如下所示："
+msgstr "Here are the key parameters and their explanations for data configuration:"
+
+#: ../../../usage.md:255
+msgid "![pack_into_one](./imgs/pack_into_one.png)"
+msgstr ""
+
+#: ../../../usage.md:255
+msgid "pack_into_one"
+msgstr ""
+
+#: ../../../usage.md:258
+msgid "目前支持传入数据集文件路径`train_folder`，且要求文件格式如下："
+msgstr ""
+"Currently, it supports passing the dataset file path `train_folder`, and "
+"the file format is required to be as follows:"
+
+#: ../../../usage.md:265
+msgid "数据集的详细内容可参考``数据准备``模块相关的介绍。"
+msgstr ""
+"For detailed information about the dataset, please refer to the \"Data "
+"Preparation\" section."
+
+#: ../../../usage.md:267
+msgid "模型配置"
+msgstr "Model Configuration"
+
+#: ../../../usage.md:269
+msgid "如果在启动训练时要加载模型 `checkpoint`，可进行如下相关配置："
+msgstr ""
+"If you want to load a model checkpoint when starting the training, you "
+"can configure it as follows:"
+
+#: ../../../usage.md:282
+msgid "注意："
+msgstr "Note:"
+
+#: ../../../usage.md:283
+msgid "路径若以 `local:` 为前缀，则存储在本地文件系统；若以 `boto3:` 为前缀，则存储在远程 oss 上"
+msgstr ""
+"If the path starts with `local:`, it means the file is stored in the "
+"local file system. If it starts with `boto3:`, it means the file is "
+"stored in the remote OSS."
+
+#: ../../../usage.md:285
+msgid "模型相关关键参数配置如下所示："
+msgstr "The configuration for the model is as follows:"
+
+#: ../../../usage.md:309
+msgid "注意：用户可自定义模型类型名和模型结构，并配置相对应的模型参数。通过`utils/registry.py`下的`MODEL_INITIALIZER`对象进行模型初始化函数接口注册，在训练主函数`train.py`中初始化模型时，可通过`model_type`配置获取指定的模型初始化接口函数。"
+msgstr ""
+"Note: Users can customize the model type name and model structure, and "
+"configure the corresponding model parameters. The model initialization "
+"function interface can be registered through the `MODEL_INITIALIZER` "
+"object in `utils/registry.py`. When initializing the model in the "
+"training main function `train.py`, the specified model initialization "
+"interface function can be obtained through the `model_type` "
+"configuration."
+
+#: ../../../usage.md:311
+msgid ""
+"*如果基于 InternLM 7B继续训练，可以参考 "
+"[ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-zoo) 中 "
+"OpenXLab 链接下载权重*"
+msgstr ""
+"*If you want to start training based on InternLM 7B, you can refer to "
+"OpenXLab [ModelZoo](https://github.com/InternLM/InternLM/tree/main#model-"
+"zoo) to download weights*."
+
+#: ../../../usage.md:313
+msgid "并行配置"
+msgstr "Parallel Configuration"
+
+#: ../../../usage.md:315
+msgid "训练并行配置样例如下："
+msgstr "Training parallel configuration example:"
+
+#: ../../../usage.md:324
+msgid "zero1：zero 并行策略，分如下三种情况，默认值为 -1"
+msgstr ""
+"zero1: zero parallel strategy, divided into the following three cases, "
+"default value is -1"
+
+#: ../../../usage.md:325
+msgid "当`zero1 <= 0`，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配"
+msgstr ""
+"When `zero1 <= 0`, the size of the zero1 process group is equal to the "
+"size of the data parallel process group, so the optimizer state "
+"parameters will be split within the data parallel range."
+
+#: ../../../usage.md:326
+msgid "当`zero1 == 1`，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数"
+msgstr ""
+"When `zero1 == 1`, zero1 is not used, and all data parallel groups retain"
+" the complete optimizer state parameters."
+
+#: ../../../usage.md:327
+msgid "当`zero1 > 1`且`zero1 <= data_parallel_world_size`，则 zero1 进程组是数据并行进程组的子集"
+msgstr ""
+"When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 "
+"process group is a subset of the data parallel process group."
+
+#: ../../../usage.md:328
+msgid "tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1"
+msgstr ""
+"tensor: tensor parallel size, usually the number of GPUs per node, "
+"default is 1"
+
+#: ../../../usage.md:329
+msgid "pipeline：流水线并行策略"
+msgstr "pipeline: pipeline parallel strategy"
+
+#: ../../../usage.md:330
+msgid "size：流水线并行大小，默认值为 1"
+msgstr "size: pipeline parallel size, the default value is 1"
+
+#: ../../../usage.md:331
+msgid "interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为关闭"
+msgstr ""
+"interleaved_overlap: bool type, enables or disables communication "
+"optimization when interleaved scheduling is used, the default value is False"
+
+#: ../../../usage.md:332
+msgid "sequence_parallel：是否开启序列化并行，默认值为 False"
+msgstr ""
+"sequence_parallel: Whether to enable sequence parallelism, the default "
+"value is False"
+
+#: ../../../usage.md:334
+msgid "注意：`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`"
+msgstr ""
+"Note: `Data parallel size = Total number of GPUs / Pipeline parallel size"
+" / Tensor parallel size`"
+
+#: ../../../usage.md:336
+msgid "启动训练"
+msgstr "Start Training"
+
+#: ../../../usage.md:338
+msgid "完成了以上数据集准备和相关训练配置后，可启动 Demo 训练。接下来分别以 slurm 和 torch 环境为例，介绍训练启动方式。"
+msgstr ""
+"After completing the data preparation and relevant training "
+"configurations mentioned above, you can start the demo training. The "
+"following examples demonstrate how to start the training in both slurm "
+"and torch environments."
+
+#: ../../../usage.md:340
+msgid "若在 slurm 上启动分布式运行环境，多节点 16 卡的运行命令如下所示："
+msgstr ""
+"If you want to start distributed training on slurm with 16 GPUs across "
+"multiple nodes, use the following command:"
+
+#: ../../../usage.md:345
+msgid "若在 torch 上启动分布式运行环境，单节点 8 卡的运行命令如下所示："
+msgstr ""
+"If you want to start distributed training on torch with 8 GPUs on a "
+"single node, use the following command:"
+
+#: ../../../usage.md:350
+msgid "运行结果"
+msgstr "Training Results"
+
+#: ../../../usage.md:352
+msgid "以 slurm 上单机 8 卡的 Demo 训练配置为例，训练结果日志展示如下："
+msgstr ""
+"Taking the configuration of the demo training on a single machine with 8 "
+"GPUs on slurm as an example, the training result log is shown below:"
+
+#~ msgid "`load_model_only_folder`与`load_ckpt_folder`不能同时设置"
+#~ msgstr ""
+#~ "`load_model_only_folder` and `load_ckpt_folder` "
+#~ "cannot be set at the same time."
+
--- a/doc/code-docs/make.bat
+++ b/doc/code-docs/make.bat
@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/doc/code-docs/requirements.txt
+++ b/doc/code-docs/requirements.txt
@ -0,0 +1,11 @@
+Sphinx
+sphinx-autobuild
+sphinx_rtd_theme
+sphinx_markdown_tables
+autodoc_pydantic==1.9
+enum_tools
+numpy
+torch
+tqdm
+pyecharts
+myst-parser
--- a/doc/code-docs/source/checkpoint.rst
+++ b/doc/code-docs/source/checkpoint.rst
@ -0,0 +1,12 @@
+模型保存
+===================
+
+InternLM 使用 ``internlm.utils.model_checkpoint.CheckpointManager`` 来管理模型保存。其中，可以使用 ``CheckpointManager.try_save_checkpoint(train_state)`` 来保存指定 step 的模型状态。
+
+InternLM支持启动时自动加载最新的模型备份，并在接收信号退出训练时自动进行模型备份。
+
+Checkpointing
+-------------
+
+.. autoclass:: internlm.utils.model_checkpoint.CheckpointManager
+    :members:
--- a/doc/code-docs/source/conf.py
+++ b/doc/code-docs/source/conf.py
@ -0,0 +1,103 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+import os
+import sys
+
+project = "InternLM"
+copyright = "2023, InternLM Team"
+author = "InternLM Team"
+
+with open("../../../version.txt", "r") as f:
+    release = f.readline().rstrip()
+
+master_doc = "index"
+
+autodoc_member_order = "bysource"
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    "sphinx_rtd_theme",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.autodoc",
+    "sphinxcontrib.autodoc_pydantic",
+    "sphinx.ext.autosectionlabel",
+    "sphinx.ext.napoleon",
+    "myst_parser",
+]
+
+pygments_style = "sphinx"
+
+# autodoc_pydantic config
+autodoc_pydantic_model_show_field_summary = False
+autodoc_pydantic_field_signature_prefix = " "
+autodoc_pydantic_model_signature_prefix = "class"
+autodoc_pydantic_model_show_json = False
+autodoc_pydantic_model_show_config_summary = False
+autodoc_pydantic_model_show_config_member = False
+autodoc_pydantic_model_show_validator_summary = False
+autodoc_pydantic_model_show_validator_members = False
+autodoc_pydantic_model_summary_list_order = "bysource"
+autodoc_pydantic_model_member_order = "bysource"
+autodoc_pydantic_field_list_validators = False
+
+# Napoleon settings
+napoleon_google_docstring = True
+napoleon_numpy_docstring = True
+napoleon_include_init_with_doc = False
+napoleon_include_private_with_doc = False
+napoleon_include_special_with_doc = True
+napoleon_use_admonition_for_examples = False
+napoleon_use_admonition_for_notes = False
+napoleon_use_admonition_for_references = False
+napoleon_use_ivar = False
+napoleon_use_param = True
+napoleon_use_rtype = True
+napoleon_preprocess_types = False
+napoleon_type_aliases = None
+napoleon_attr_annotations = True
+
+templates_path = ["_templates"]
+
+exclude_patterns = []
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = "sphinx_rtd_theme"
+html_static_path = []
+
+# GitHub integration
+html_context = {
+    "display_github": True,
+    "github_user": "InternLM",
+    "github_repo": "InternLM",
+    "github_version": "main",
+    "conf_py_path": "/doc/code-docs/source/",
+}
+
+sys.path.insert(0, os.path.abspath("../../../"))
+
+# Prepend module names to class descriptions
+add_module_names = True
+
+autoclass_content = "class"
+
+autodoc_mock_imports = [
+    "apex",
+    "torch",
+    "numpy",
+]
+
+# support multi-language docs
+language = "zh_CN"
+locale_dirs = ["../locales/"]  # path is example but recommended.
+gettext_compact = False  # optional.
+gettext_uuid = False  # optional.
--- a/doc/code-docs/source/example/30B_demo.rst
+++ b/doc/code-docs/source/example/30B_demo.rst
@ -0,0 +1,202 @@
+30B Demo
+================
+
+训练配置
+----------------
+
+30B demo 训练配置文件样例如下:
+
+.. code-block:: python
+
+    JOB_NAME = "30b_train"
+
+    SEQ_LEN = 2048
+    HIDDEN_SIZE = 6144
+    NUM_ATTENTION_HEAD = 48
+    MLP_RATIO = 8 / 3
+    NUM_LAYER = 60
+    VOCAB_SIZE = 103168
+
+    MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+    # Ckpt folder format:
+    # fs: 'local:/mnt/nfs/XXX'
+    SAVE_CKPT_FOLDER = "local:llm_ckpts"
+    LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+    # boto3 Ckpt folder format:
+    # import os
+    # BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+    # SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+    # LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+    CHECKPOINT_EVERY = 50
+    ckpt = dict(
+        enable_save_ckpt=False,  # enable ckpt save.
+        save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+        # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
+        # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
+        load_optimizer=True,  # Whether to load optimizer states when continuing training.
+        checkpoint_every=CHECKPOINT_EVERY,
+        async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
+        async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+        snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]),  # directory for snapshot ckpt storage path.
+        oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+    )
+
+    TRAIN_FOLDER = "/path/to/dataset"
+    VALID_FOLDER = "/path/to/dataset"
+    data = dict(
+        seq_len=SEQ_LEN,
+        # micro_num means the number of micro_batch contained in one gradient update
+        micro_num=4,
+        # packed_length = micro_bsz * SEQ_LEN
+        micro_bsz=2,
+        # defaults to the value of micro_num
+        valid_micro_num=4,
+        # defaults to 0, means disable evaluate
+        valid_every=50,
+        pack_sample_into_one=False,
+        total_steps=50000,
+        skip_batches="",
+        rampup_batch_size="",
+        # Datasets with less than 50 rows will be discarded
+        min_length=50,
+        # train_folder=TRAIN_FOLDER,
+        # valid_folder=VALID_FOLDER,
+    )
+
+    grad_scaler = dict(
+        fp16=dict(
+            # the initial loss scale, defaults to 2**16
+            initial_scale=2**16,
+            # the minimum loss scale, defaults to None
+            min_scale=1,
+            # the number of steps to increase loss scale when no overflow occurs
+            growth_interval=1000,
+        ),
+        # the multiplication factor for increasing loss scale, defaults to 2
+        growth_factor=2,
+        # the multiplication factor for decreasing loss scale, defaults to 0.5
+        backoff_factor=0.5,
+        # the maximum loss scale, defaults to None
+        max_scale=2**24,
+        # the number of overflows before decreasing loss scale, defaults to 2
+        hysteresis=2,
+    )
+
+    hybrid_zero_optimizer = dict(
+        # Enable low_level_optimzer overlap_communication
+        overlap_sync_grad=True,
+        overlap_sync_param=True,
+        # bucket size for nccl communication params
+        reduce_bucket_size=512 * 1024 * 1024,
+        # grad clipping
+        clip_grad_norm=1.0,
+    )
+
+    loss = dict(
+        label_smoothing=0,
+    )
+
+    adam = dict(
+        lr=1e-4,
+        adam_beta1=0.9,
+        adam_beta2=0.95,
+        adam_beta2_c=0,
+        adam_eps=1e-8,
+        weight_decay=0.01,
+    )
+
+    lr_scheduler = dict(
+        total_steps=data["total_steps"],
+        init_steps=0,  # optimizer_warmup_step
+        warmup_ratio=0.01,
+        eta_min=1e-5,
+        last_epoch=-1,
+    )
+
+    beta2_scheduler = dict(
+        init_beta2=adam["adam_beta2"],
+        c=adam["adam_beta2_c"],
+        cur_iter=-1,
+    )
+
+    model = dict(
+        checkpoint=False,  # The proportion of layers for activation checkpointing; the optional values are True/False/[0-1]
+        num_attention_heads=NUM_ATTENTION_HEAD,
+        embed_split_hidden=True,
+        vocab_size=VOCAB_SIZE,
+        embed_grad_scale=1,
+        parallel_output=True,
+        hidden_size=HIDDEN_SIZE,
+        num_layers=NUM_LAYER,
+        mlp_ratio=MLP_RATIO,
+        apply_post_layer_norm=False,
+        dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+        norm_type="rmsnorm",
+        layer_norm_epsilon=1e-5,
+        use_flash_attn=True,
+        num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+    )
+    """
+    zero1 parallel:
+        1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    pipeline parallel (dict):
+        1. size: int, the size of pipeline parallel.
+        2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+    tensor parallel: tensor parallel size, usually the number of GPUs per node.
+    """
+    parallel = dict(
+        zero1=-1,
+        tensor=4,
+        pipeline=dict(size=1, interleaved_overlap=True),
+        sequence_parallel=False,
+    )
+
+    cudnn_deterministic = False
+    cudnn_benchmark = False
+
+
+启动训练
+----------------
+
+完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动两节点 16GPU 的训练命令如下所示：
+
+.. code-block:: bash
+
+    srun -p internllm -N 2 -n 16 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/30B_sft.py
+
+训练结果
+----------------
+
+基于以上训练配置和启动命令，两节点 16GPU 下的模型训练部分日志展示如下：
+
+.. code-block:: bash
+
+    2023-09-06 10:29:26,629 INFO parallel_context.py:508 in set_device -- process rank 10 is bound to host:HOST-10-140-66-20 device: 2
+    2023-09-06 10:29:26,632 INFO parallel_context.py:508 in set_device -- process rank 11 is bound to host:HOST-10-140-66-20 device: 3
+    2023-09-06 10:29:26,634 INFO parallel_context.py:508 in set_device -- process rank 12 is bound to host:HOST-10-140-66-20 device: 4
+    2023-09-06 10:29:26,636 INFO parallel_context.py:508 in set_device -- process rank 9 is bound to host:HOST-10-140-66-20 device: 1
+    2023-09-06 10:29:26,640 INFO parallel_context.py:508 in set_device -- process rank 15 is bound to host:HOST-10-140-66-20 device: 7
+    2023-09-06 10:29:26,639 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:HOST-10-140-66-9 device: 0
+    2023-09-06 10:29:26,641 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:HOST-10-140-66-9 device: 2
+    2023-09-06 10:29:26,643 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:HOST-10-140-66-9 device: 5
+    2023-09-06 10:29:26,645 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:HOST-10-140-66-9 device: 6
+    2023-09-06 10:29:26,661 INFO parallel_context.py:508 in set_device -- process rank 13 is bound to host:HOST-10-140-66-20 device: 5
+    2023-09-06 10:29:26,707 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:HOST-10-140-66-9 device: 1
+    2023-09-06 10:29:26,826 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:HOST-10-140-66-9 device: 4
+    2023-09-06 10:29:26,871 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:HOST-10-140-66-9 device: 7
+    2023-09-06 10:29:26,932 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:HOST-10-140-66-9 device: 3
+    2023-09-06 10:29:27,156 INFO parallel_context.py:508 in set_device -- process rank 14 is bound to host:HOST-10-140-66-20 device: 6
+    2023-09-06 10:29:27,271 INFO parallel_context.py:508 in set_device -- process rank 8 is bound to host:HOST-10-140-66-20 device: 0
+    2023-09-06 10:29:32,060 INFO launch.py:329 in launch -- Distributed environment is initialized, data parallel size: 4, pipeline parallel size: 1, tensor parallel size: 4
+    2023-09-06 10:30:06,141 INFO hybrid_zero_optim.py:291 in _partition_param_list -- Number of elements on ranks: [1782007296, 1812307968, 1812307968, 1706469888], rank:0
+    2023-09-06T10:30:38.216+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=40.00268401421643 step=0 loss=11.548227310180664 tgs (tokens/gpu/second)=227.37 lr=9.779754323328192e-05 loss_scale=65536.0 grad_norm={'0_default': 61.5836932112004} micro_num=4 num_consumed_tokens=65536 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=12.51 acc=0.0 perplexity=104121.5547 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=60571 tokens/cn=0 tokens/code=0 loss_from_metric=11.5533 loss/en=11.5533 loss/cn=nan loss/code=nan 
+    2023-09-06T10:30:46.343+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=89.00005814543725 step=1 loss=6.05580997467041 tgs (tokens/gpu/second)=505.86 lr=9.140576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 27.397946290506887} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=7.91 acc=0.0885 perplexity=405.4076 acc/en=0.0885 acc/cn=0.0 acc/code=0.0 tokens/en=60265 tokens/cn=0 tokens/code=0 loss_from_metric=6.0049 loss/en=6.0049 loss/cn=nan loss/code=nan 
+    2023-09-06T10:30:51.443+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.5138940898651 step=2 loss=5.054169654846191 tgs (tokens/gpu/second)=810.03 lr=8.14503363531613e-05 loss_scale=65536.0 grad_norm={'0_default': 10.438111430093606} micro_num=4 num_consumed_tokens=196608 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.87 acc=0.0715 perplexity=184.2986 acc/en=0.0715 acc/cn=0.0 acc/code=0.0 tokens/en=60244 tokens/cn=0 tokens/code=0 loss_from_metric=5.2166 loss/en=5.2166 loss/cn=nan loss/code=nan 
+    2023-09-06T10:30:56.509+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.56131674769466 step=3 loss=4.662276268005371 tgs (tokens/gpu/second)=815.98 lr=6.890576474687264e-05 loss_scale=65536.0 grad_norm={'0_default': 9.15959986316653} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=17 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.83 acc=0.0775 perplexity=102.6568 acc/en=0.0775 acc/cn=0.0 acc/code=0.0 tokens/en=60328 tokens/cn=0 tokens/code=0 loss_from_metric=4.6314 loss/en=4.6314 loss/cn=nan loss/code=nan 
+    2023-09-06T10:31:01.552+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=143.85087291011183 step=4 loss=4.020431041717529 tgs (tokens/gpu/second)=817.63 lr=5.500000000000001e-05 loss_scale=65536.0 grad_norm={'0_default': 6.873464794412589} micro_num=4 num_consumed_tokens=327680 inf_nan_skip_batches=0 num_samples_in_batch=22 largest_length=1893 largest_batch=8 smallest_batch=4 adam_beta2=0.95 fwd_bwd_time=4.82 acc=0.0701 perplexity=69.1167 acc/en=0.0701 acc/cn=0.0 acc/code=0.0 tokens/en=61028 tokens/cn=0 tokens/code=0 loss_from_metric=4.2358 loss/en=4.2358 loss/cn=nan loss/code=nan 
+    2023-09-06T10:31:06.830+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=15224 : tflops=142.8966468353613 step=5 loss=3.733311891555786 tgs (tokens/gpu/second)=812.2 lr=4.109423525312737e-05 loss_scale=65536.0 grad_norm={'0_default': 5.811005102730085} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=13 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.85 acc=0.0688 perplexity=46.298 acc/en=0.0688 acc/cn=0.0 acc/code=0.0 tokens/en=61004 tokens/cn=0 tokens/code=0 loss_from_metric=3.8351 loss/en=3.8351 loss/cn=nan loss/code=nan
--- a/doc/code-docs/source/example/7B_demo.rst
+++ b/doc/code-docs/source/example/7B_demo.rst
@ -0,0 +1,192 @@
+7B Demo
+================
+
+训练配置
+----------------
+
+7B demo 的训练配置文件样例如下:
+
+.. code-block:: python
+
+    JOB_NAME = "7b_train"
+
+    SEQ_LEN = 2048
+    HIDDEN_SIZE = 4096
+    NUM_ATTENTION_HEAD = 32
+    MLP_RATIO = 8 / 3
+    NUM_LAYER = 32
+    VOCAB_SIZE = 103168
+
+    MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+    # Ckpt folder format:
+    # fs: 'local:/mnt/nfs/XXX'
+    SAVE_CKPT_FOLDER = "local:llm_ckpts"
+    LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+    # boto3 Ckpt folder format:
+    # import os
+    # BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+    # SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+    # LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+    CHECKPOINT_EVERY = 50
+    ckpt = dict(
+        enable_save_ckpt=False,  # enable ckpt save.
+        save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+        # load_ckpt_folder=LOAD_CKPT_FOLDER, # Ckpt path to resume training(load weights and scheduler/context states).
+        # load_model_only_folder=MODEL_ONLY_FOLDER, # Path to initialize with given model weights.
+        load_optimizer=True,  # Whether to load optimizer states when continuing training.
+        checkpoint_every=CHECKPOINT_EVERY,
+        async_upload=True,  # async ckpt upload. (only works for boto3 ckpt)
+        async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+        snapshot_ckpt_folder="/".join([SAVE_CKPT_FOLDER, "snapshot"]),  # directory for snapshot ckpt storage path.
+        oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+    )
+
+    TRAIN_FOLDER = "/path/to/dataset"
+    VALID_FOLDER = "/path/to/dataset"
+    data = dict(
+        seq_len=SEQ_LEN,
+        # micro_num means the number of micro_batch contained in one gradient update
+        micro_num=4,
+        # packed_length = micro_bsz * SEQ_LEN
+        micro_bsz=2,
+        # defaults to the value of micro_num
+        valid_micro_num=4,
+        # defaults to 0, means disable evaluate
+        valid_every=50,
+        pack_sample_into_one=False,
+        total_steps=50000,
+        skip_batches="",
+        rampup_batch_size="",
+        # Datasets with less than 50 rows will be discarded
+        min_length=50,
+        # train_folder=TRAIN_FOLDER,
+        # valid_folder=VALID_FOLDER,
+    )
+
+    grad_scaler = dict(
+        fp16=dict(
+            # the initial loss scale, defaults to 2**16
+            initial_scale=2**16,
+            # the minimum loss scale, defaults to None
+            min_scale=1,
+            # the number of steps to increase loss scale when no overflow occurs
+            growth_interval=1000,
+        ),
+        # the multiplication factor for increasing loss scale, defaults to 2
+        growth_factor=2,
+        # the multiplication factor for decreasing loss scale, defaults to 0.5
+        backoff_factor=0.5,
+        # the maximum loss scale, defaults to None
+        max_scale=2**24,
+        # the number of overflows before decreasing loss scale, defaults to 2
+        hysteresis=2,
+    )
+
+    hybrid_zero_optimizer = dict(
+        # Enable low_level_optimizer overlap_communication
+        overlap_sync_grad=True,
+        overlap_sync_param=True,
+        # bucket size for nccl communication params
+        reduce_bucket_size=512 * 1024 * 1024,
+        # grad clipping
+        clip_grad_norm=1.0,
+    )
+
+    loss = dict(
+        label_smoothing=0,
+    )
+
+    adam = dict(
+        lr=1e-4,
+        adam_beta1=0.9,
+        adam_beta2=0.95,
+        adam_beta2_c=0,
+        adam_eps=1e-8,
+        weight_decay=0.01,
+    )
+
+    lr_scheduler = dict(
+        total_steps=data["total_steps"],
+        init_steps=0,  # optimizer_warmup_step
+        warmup_ratio=0.01,
+        eta_min=1e-5,
+        last_epoch=-1,
+    )
+
+    beta2_scheduler = dict(
+        init_beta2=adam["adam_beta2"],
+        c=adam["adam_beta2_c"],
+        cur_iter=-1,
+    )
+
+    model = dict(
+        checkpoint=False,  # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+        num_attention_heads=NUM_ATTENTION_HEAD,
+        embed_split_hidden=True,
+        vocab_size=VOCAB_SIZE,
+        embed_grad_scale=1,
+        parallel_output=True,
+        hidden_size=HIDDEN_SIZE,
+        num_layers=NUM_LAYER,
+        mlp_ratio=MLP_RATIO,
+        apply_post_layer_norm=False,
+        dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+        norm_type="rmsnorm",
+        layer_norm_epsilon=1e-5,
+        use_flash_attn=True,
+        num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+    )
+    """
+    zero1 parallel:
+        1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+            so parameters will be divided within the range of dp.
+        2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+        3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+            For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+    pipeline parallel (dict):
+        1. size: int, the size of pipeline parallel.
+        2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+    tensor parallel: tensor parallel size, usually the number of GPUs per node.
+    """
+    parallel = dict(
+        zero1=8,
+        pipeline=dict(size=1, interleaved_overlap=True),
+        sequence_parallel=False,
+    )
+
+    cudnn_deterministic = False
+    cudnn_benchmark = False
+
+启动训练
+----------------
+
+完成以上训练配置后，可启动模型训练，以在 ``slurm`` 平台上为例，启动单节点 8GPU 的训练命令如下所示：
+
+.. code-block:: bash
+
+    srun -p internllm -N 1 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ./configs/7B_sft.py
+
+训练结果
+----------------
+
+基于以上训练配置和启动命令，单节点 8GPU 下的模型训练部分日志展示如下：
+
+.. code-block:: bash
+
+    2023-09-05 11:47:44,649 INFO parallel_context.py:508 in set_device -- process rank 4 is bound to host:SH-IDC1-10-140-1-110 device: 4
+    2023-09-05 11:47:44,650 INFO parallel_context.py:508 in set_device -- process rank 3 is bound to host:SH-IDC1-10-140-1-110 device: 3
+    2023-09-05 11:47:44,651 INFO parallel_context.py:508 in set_device -- process rank 6 is bound to host:SH-IDC1-10-140-1-110 device: 6
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 7 is bound to host:SH-IDC1-10-140-1-110 device: 7
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 5 is bound to host:SH-IDC1-10-140-1-110 device: 5
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 1 is bound to host:SH-IDC1-10-140-1-110 device: 1
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 2 is bound to host:SH-IDC1-10-140-1-110 device: 2
+    2023-09-05 11:47:44,652 INFO parallel_context.py:508 in set_device -- process rank 0 is bound to host:SH-IDC1-10-140-1-110 device: 0
+    2023-09-05 11:47:51,006 INFO launch.py:354 in launch -- Distributed environment is initialized, data parallel size: 8, pipeline parallel size: 1, tensor parallel size: 1
+    2023-09-05 11:49:09,855 INFO hybrid_zero_optim.py:294 in _partition_param_list -- Number of elements on ranks: [894509056, 944865280, 966909952, 966909952, 966909952, 944865280, 966909952, 670068736], rank:0
+    2023-09-05T11:49:58.225+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=63.283263603947816 step=0 loss=11.641494750976562 tgs (tokens/gpu/second)=1424.93 lr=4.0000000000000003e-07 loss_scale=65536.0 grad_norm={'0_default': 66.51907327507652} micro_num=4 num_consumed_tokens=131072 inf_nan_skip_batches=0 num_samples_in_batch=19 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=6.87 acc=0.0 perplexity=112181.7188 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120836 tokens/cn=0 tokens/code=0 loss_from_metric=11.6279 loss/en=11.6279 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:02.553+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=171.92140761933035 step=1 loss=11.546792984008789 tgs (tokens/gpu/second)=3871.11 lr=6.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 64.47430144542088} micro_num=4 num_consumed_tokens=262144 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=5 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=4.14 acc=0.0 perplexity=103779.1406 acc/en=0.0 acc/cn=0.0 acc/code=0.0 tokens/en=120572 tokens/cn=0 tokens/code=0 loss_from_metric=11.55 loss/en=11.55 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:06.504+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=186.0565203348341 step=2 loss=11.106071472167969 tgs (tokens/gpu/second)=4189.39 lr=8.000000000000001e-07 loss_scale=65536.0 grad_norm={'0_default': 62.520055376005146} micro_num=4 num_consumed_tokens=393216 inf_nan_skip_batches=0 num_samples_in_batch=16 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0001 perplexity=71139.6797 acc/en=0.0001 acc/cn=0.0 acc/code=0.0 tokens/en=122032 tokens/cn=0 tokens/code=0 loss_from_metric=11.1724 loss/en=11.1724 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:10.487+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.48897918112567 step=3 loss=10.444510459899902 tgs (tokens/gpu/second)=4176.61 lr=1.0000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 57.91057980979166} micro_num=4 num_consumed_tokens=524288 inf_nan_skip_batches=0 num_samples_in_batch=18 largest_length=2048 largest_batch=6 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.83 acc=0.0705 perplexity=39851.1289 acc/en=0.0705 acc/cn=0.0 acc/code=0.0 tokens/en=121125 tokens/cn=0 tokens/code=0 loss_from_metric=10.5929 loss/en=10.5929 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:14.476+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.8751803758398 step=4 loss=9.798665046691895 tgs (tokens/gpu/second)=4185.31 lr=1.2000000000000002e-06 loss_scale=65536.0 grad_norm={'0_default': 48.1136933755285} micro_num=4 num_consumed_tokens=655360 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.076 perplexity=18045.6699 acc/en=0.076 acc/cn=0.0 acc/code=0.0 tokens/en=121365 tokens/cn=0 tokens/code=0 loss_from_metric=9.8007 loss/en=9.8007 loss/cn=nan loss/code=nan 
+    2023-09-05T11:50:18.442+08:00 INFO [training_internlm.py, line 413, in record_current_batch_training_metrics] - pid=6794 : tflops=185.6236609556878 step=5 loss=9.215429306030273 tgs (tokens/gpu/second)=4179.64 lr=1.4000000000000001e-06 loss_scale=65536.0 grad_norm={'0_default': 36.95489557069029} micro_num=4 num_consumed_tokens=786432 inf_nan_skip_batches=0 num_samples_in_batch=14 largest_length=2048 largest_batch=4 smallest_batch=3 adam_beta2=0.95 fwd_bwd_time=3.82 acc=0.0767 perplexity=8999.0869 acc/en=0.0767 acc/cn=0.0 acc/code=0.0 tokens/en=121223 tokens/cn=0 tokens/code=0 loss_from_metric=9.1049 loss/en=9.1049 loss/cn=nan loss/code=nan 
--- a/doc/code-docs/source/example/index.rst
+++ b/doc/code-docs/source/example/index.rst
@ -0,0 +1,18 @@
+训练样例
+================
+
+7B Demo
+------------
+
+.. toctree::
+   :maxdepth: 2
+
+   7B_demo
+
+30B Demo
+------------
+
+.. toctree::
+   :maxdepth: 2
+
+   30B_demo
--- a/doc/code-docs/source/index.rst
+++ b/doc/code-docs/source/index.rst
@ -0,0 +1,95 @@
+.. InternLM documentation master file, created by
+   sphinx-quickstart on Mon Aug 28 17:33:28 2023.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+
+InternLM
+========
+
+环境构建
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   install
+
+快速上手
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   usage
+
+训练构建
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   initialize
+
+训练 API
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   training
+
+并行训练
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   parallel
+
+模型备份
+--------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   checkpoint
+
+性能分析
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   profiler
+
+训练监控
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   monitor
+
+训练样例
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   example/index
+
+常见问题
+-------------------
+
+.. toctree::
+   :maxdepth: 2
+
+   qa
+
+索引和表格
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
--- a/doc/code-docs/source/initialize.rst
+++ b/doc/code-docs/source/initialize.rst
@ -0,0 +1,108 @@
+训练构建
+==============
+
+InternLM 的训练流程可以归纳为两个步骤：
+
+1. 初始化
+
+    * 初始化模型、优化器、数据加载器、Trainer，生成不同种类的进程组，为混合并行的迭代训练做准备。
+    * 初始化Logger、Checkpoint管理器、Monitor管理器、Profiler，对迭代训练的过程观察、预警、记录。
+
+2. 迭代训练
+   
+    * 根据配置文件定义的张量并行、流水线并行、数据并行的大小，加载训练引擎和调度器进行混合并行训练。
+    * 在迭代训练中，调用 Trainer API 进行梯度置零，前向传播计算损失并反向传播，参数更新。
+
+.. figure:: ../../imgs/hybrid_parallel_training.png
+  :scale: 45%
+  :class: with-border
+
+  InternLM训练流程图
+
+.. _InternLM-args:
+
+命令行参数解析
+----------------
+
+InternLM 使用 `argparse <https://docs.python.org/3/library/argparse.html>`_ 库来向InternLM运行时提供命令行参数配置。
+
+用户可使用 ``internlm.initialize.get_default_parser()`` 来获取 InternLM 的默认解析器，其中包含一些内置参数，用户可以向此解析器添加自定义参数。
+
+.. code-block:: python
+
+    # Get InternLM default parser
+    parser = internlm.initialize.get_default_parser()
+    # Add new argument
+    parser.add_argument("--user_arg", type=int, default=-1, help="arguments add by user.")
+    cmd_args = parser.parse_args()
+
+.. autofunction:: internlm.initialize.get_default_parser
+
+
+.. _InternLM-model-init:
+
+模型初始化
+-------------------------
+
+.. autofunction:: internlm.train.initialize_model
+
+InternLM 在配置文件中使用字段 ``model_type`` 和 ``model`` 来控制模型初始化过程。示例模型初始化配置定义如下：
+
+.. code-block:: python
+
+    model_type = "INTERNLM"  # default is "INTERNLM", used to register classes and modules for model initialization
+    NUM_ATTENTION_HEAD = 32
+    VOCAB_SIZE = 103168
+    HIDDEN_SIZE = 4096
+    NUM_LAYER = 32
+    MLP_RATIO = 8 / 3
+    model = dict(
+        checkpoint=False,  # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+        num_attention_heads=NUM_ATTENTION_HEAD,
+        embed_split_hidden=True,
+        vocab_size=VOCAB_SIZE,
+        embed_grad_scale=1,
+        parallel_output=True,
+        hidden_size=HIDDEN_SIZE,
+        num_layers=NUM_LAYER,
+        mlp_ratio=MLP_RATIO,
+        apply_post_layer_norm=False,
+        dtype="torch.bfloat16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+        norm_type="rmsnorm",
+        layer_norm_epsilon=1e-5,
+        use_flash_attn=True,
+        num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+    )
+
+- 字段 ``model_type`` 指明了要初始化的模型类型
+- 字段 ``model`` 中的参数指定了在模型初始化过程中的参数设置
+
+值得注意的是，用户可以定义新的模型类型，并使用装饰器 ``@MODEL_INITIALIZER.register_module`` 注册模型的初始化函数，其中 ``MODEL_INITIALIZER`` 是类 ``internlm.util.registry.Registry`` 的一个实例化对象，示例如下所示：
+
+.. code-block:: python
+
+    MODEL_TYPE = "NEW_MODEL"
+
+    @MODEL_INITIALIZER.register_module(module_name=MODEL_TYPE)
+    def build_new_model_with_cfg(*args, **kwargs): ...
+
+.. _InternLM-optim-init:
+
+优化器初始化
+-------------------------
+
+.. autofunction:: internlm.train.initialize_optimizer
+
+.. _InternLM-dl-init:
+
+数据加载器初始化
+-------------------------
+
+.. autofunction:: internlm.train.get_train_data_loader
+
+.. _InternLM-trainer-init:
+
+Trainer 初始化
+-------------------------
+
+.. autofunction:: internlm.initialize.initialize_trainer
--- a/doc/code-docs/source/install.md
+++ b/doc/code-docs/source/install.md
@ -0,0 +1,2 @@
+```{include} ../../install.md
+```
--- a/doc/code-docs/source/monitor.rst
+++ b/doc/code-docs/source/monitor.rst
@ -0,0 +1,22 @@
+监控和告警
+=================
+
+监控
+-----------------
+
+InternLM 使用 ``internlm.monitor.monitor.initialize_monitor_manager()`` 来初始化上下文监控管理。其中，一个实例化的单例对象 ``internlm.monitor.monitor.MonitorManager`` 将管理监控线程并使用 ``internlm.monitor.monitor.MonitorTracker`` 来跟踪模型训练生命周期和训练状态。
+
+.. autofunction:: internlm.monitor.monitor.initialize_monitor_manager
+
+.. autoclass:: internlm.monitor.monitor.MonitorManager
+    :members:
+
+.. autoclass:: internlm.monitor.monitor.MonitorTracker
+    :members:
+
+告警
+-----------------
+
+InternLM 监控线程会周期性地检查模型训练过程中是否出现 loss spike、潜在的 training stuck、运行时异常等，并捕获 SIGTERM 异常信号。当出现上述情况时，将触发警报，并通过调用 ``internlm.monitor.alert.send_feishu_msg_with_webhook()`` 向飞书的 Webhook 地址发送报警消息。
+
+.. autofunction:: internlm.monitor.alert.send_feishu_msg_with_webhook
--- a/doc/code-docs/source/parallel.rst
+++ b/doc/code-docs/source/parallel.rst
@ -0,0 +1,152 @@
+并行训练
+==================
+
+.. Brief introduction to training parallelism, and how-to guide about config setting
+
+InternLM 支持张量并行、流水线并行、序列并行、数据并行和 ZeRO1.5 等并行化训练策略。在初始化分布式环境时，我们需要指定张量并行大小、流水线并行大小、数据并行大小以及 ZeRO1.5 策略。
+
+InternLM 的并行设置由配置文件中的 ``parallel`` 字段指定，用户可以通过修改配置文件 `config file <https://github.com/InternLM/InternLM/blob/main/configs/7B_sft.py>`_ 来更改并行配置。以下是一个并行训练配置示例：
+
+.. code-block:: python
+
+    parallel = dict(
+        zero1=8,
+        tensor=1,
+        pipeline=dict(size=1, interleaved_overlap=True),
+        sequence_parallel=False,
+    )
+
+- zero1：zero 并行策略，分如下三种情况，默认值为 -1
+
+    - 当 ``zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
+    - 当 ``zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
+    - 当 ``zero1 > 1`` 且 ``zero1 <= data_parallel_world_size``，则 zero1 进程组是数据并行进程组的子集
+
+- tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1
+- pipeline：流水线并行策略
+
+    - size：流水线并行大小，默认值为 1
+    - interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为 False
+
+- sequence_parallel：是否开启序列并行，默认值为 False
+
+注意：数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小
+
+张量并行
+-----------------
+
+InternLM 的张量并行实现方案基于 `flash attention <https://github.com/Dao-AILab/flash-attention>`_, 主要对 `attention <https://github.com/InternLM/InternLM/blob/main/internlm/model/multi_head_attention.py>`_ 和
+`linear <https://github.com/InternLM/InternLM/blob/main/internlm/model/linear.py>`_ 这两个模块进行张量并行操作。
+
+用户可通过配置文件中的 ``parallel.tensor`` 字段来设置张量并行大小。
+
+.. figure:: ../../imgs/tensor_parallel.png
+  :scale: 50%
+  :class: with-border
+
+  张量并行，采用自 `flash-attention <https://arxiv.org/pdf/2205.14135.pdf>`_
+
+流水线并行
+-----------------
+
+InternLM 在流水线并行中使用 `1F1B <https://arxiv.org/pdf/2104.04473.pdf>`_ （1F1B，一次前向传递后跟一次反向传递）策略。对于 1F1B 策略，有两种实现方式：
+
+1. 非交错调度器，内存高效。
+2. 交错调度器，内存高效且时间高效（GPU空泡较少）。
+
+.. figure:: ../../imgs/pipeline_schedule.png
+  :scale: 45%
+  :class: with-border
+
+  1F1B 流水线并行调度器，采用自 `Megatron-LM <https://arxiv.org/pdf/2104.04473.pdf>`_
+
+非交错式流水线调度
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+如果要使用非交错式调度, 需要设置 ``model.num_chunks = 1``。
+
+.. autoclass:: internlm.core.scheduler.pipeline_scheduler.PipelineScheduler
+    :members:
+
+交错式流水线调度
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+如果要使用交错式调度, 需要设置 ``model.num_chunks > 1``。
+
+.. autoclass:: internlm.core.scheduler.pipeline_scheduler.InterleavedPipelineScheduler
+    :members:
+
+值得注意的是，在使用交错式流水线调度器时可启用通信优化功能，即在 1F1B 阶段启用异步通信，以充分利用上行/下行带宽并实现通信与计算重叠。
+
+用户需要在配置文件中设置 ``parallel.pipeline.interleaved_overlap = True``。该功能启用后，将调用函数 ``InterleavedPipelineScheduler._run_1f1b_loop_with_overlap``，并创建 ``internlm.core.communication.AsynCommunicator`` 以管理异步通信。
+
+``1F1B-without-overlap`` 和 ``1F1B-with-overlap`` 的区别如下所示：
+
+.. code-block:: bash
+
+    # The 1F1B stage without overlap consists of the following steps:
+    1. Perform the forward pass.
+    2. Perform the backward pass.
+    3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration to the previous stage, and receive the forward and backward inputs for the next iteration.
+
+.. code-block:: bash
+
+    # The 1F1B stage with overlap consists of the following steps:
+    1. Perform the forward pass.
+    2. Check if the backward input is ready.
+    3. Send the forward output and receive the forward input for the next iteration.
+    4. Perform the backward pass.
+    5. Check if the forward input is ready.
+    6. Send the backward output and receive the backward input for the next iteration.
+
+
+序列并行
+-----------------
+
+序列并行是一种在不引入额外计算、通信和内存开销的情况下，减少 ``layer_norm`` 和 ``dropout`` 操作中激活值内存占用的并行策略。InternLM 中的序列并行实现基于 `flash attention <https://github.com/Dao-AILab/flash-attention>`_。这个并行策略有助于降低模型的内存消耗，提高了模型在资源受限环境中的可扩展性。
+
+如果要启用序列并行, 用户需要设置 ``parallel.sequence_parallel = True``。
+
+.. figure:: ../../imgs/sequence_parallel.png
+  :scale: 50%
+  :class: with-border
+
+  序列并行, 采用自 flash-attention
+
+数据并行
+-----------------
+
+InternLM 支持数据并行。数据并行大小为:
+
+`Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
+
+ZeRO1.5
+-----------------
+
+ZeRO1.5 的实现使用了分层分片的概念，通过配置值 ``parallel.zero1`` 启用了本地节点内的分片。这个方法有助于有效管理和分配模型参数和梯度，以减少内存使用并提高训练效率。
+
+1. 当 ``parallel.zero1 <= 0``，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
+2. 当 ``parallel.zero1 == 1``，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
+3. 当 ``parallel.zero1 > 1`` 且 ``parallel.zero1 <= data_parallel_world_size``，则 zero1 进程组是数据并行进程组的子集
+
+此外，用户可以在配置文件中通过 ``hybrid_zero_optimizer`` 字段启用优化器的通信优化功能，设置桶大小，以及梯度剪裁等参数。这些设置有助于优化训练过程中的通信和计算效率，以及梯度的处理方式。
+
+.. code-block:: python
+
+    hybrid_zero_optimizer = dict(
+        # Enable low_level_optimizer overlap_communication
+        overlap_sync_grad=True,
+        overlap_sync_param=True,
+        # bucket size for nccl communication params
+        reduce_bucket_size=512 * 1024 * 1024,
+        # grad clipping
+        clip_grad_norm=1.0,
+    )
+
+这里有两个值得关注的通信优化点：
+
+- overlap_sync_grad: 如果设置为 ``True``，则将训练的 ``backward pass`` 与梯度的 ``all-reduce`` 通信重叠
+- overlap_sync_param: 如果设置为 ``True``，则将参数的 ``broadcast`` 通信与下一步的 ``forward pass`` 进行重叠
+
+这些优化可以加速训练过程，提高训练效率。
+
+.. autoclass:: internlm.solver.optimizer.hybrid_zero_optim.HybridZeroOptimizer
+    :members:
--- a/doc/code-docs/source/profiler.rst
+++ b/doc/code-docs/source/profiler.rst
@ -0,0 +1,164 @@
+性能分析
+========
+
+.. Mainly about the usage of torch profiler and memory profiler
+
+Torch Profiler
+-----------------
+
+InternLM 使用 ``internlm.train.initialize_llm_profile()`` 来收集和分析模型训练或推理期间的性能数据，如 CPU/CUDA/memory 等性能数据。这个实现基于 `torch.profiler <https://pytorch.org/docs/stable/profiler.html>`_ ，输出的性能分析 trace 文件可以使用 `tensorboard <https://www.tensorflow.org/tensorboard?hl=en>`_ 进行可视化。
+
+用户如果想使用这个 torch 性能分析工具，需要在启动训练时传递 ``--profiling`` 参数以启用性能分析。完成 torch 性能分析后，用户可以在 ``{JOB_NAME}/{start_time}/traces/rank{}_dp{}_tp{}_pp{}`` 文件夹中看到性能分析结果。
+
+实际运行生成的 ``Torch Profiler`` 目录结构如下：
+
+.. code-block:: bash
+
+    # tree ./7b_train/Sep08_11-00-51/traces -L 2
+    ./7b_train/Sep08_11-00-51/traces/
+    └── rank0_dp0_tp0_pp0
+        └── SH-IDC1-10-140-1-78_238619.1694142354680.pt.trace.json
+
+其中， ``traces`` 可以通过 ``TensorBoard`` 可视化，运行命令
+
+.. code-block:: bash
+
+    # visualize traces with tensorboard and custom port
+    tensorboard --logdir rank0_dp0_tp0_pp0 --port 10088
+
+在打开的 ``TensorBoard -> PyTorch Profiler -> Views -> Trace`` 页面可以看到Operator和GPU Kernel的性能分析时间线如下，更多的功能请参考 `torch profiler with tensorboard <https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html#pytorch-profiler-with-tensorboard>`_
+
+.. figure:: ../../imgs/torch_profiler_trace.png
+  :scale: 45%
+  :class: with-border
+
+.. autofunction:: internlm.train.initialize_llm_profile
+
+Memory Profiler
+-----------------
+
+InternLM 提供了一个实用的内存分析工具 ``internlm.utils.simple_memory_profiler.SimpleMemoryProfiler`` 来监控实际的 GPU 内存使用情况。在实现中，会对模型数据（包括模型参数、模型梯度和优化器状态）和非模型数据（包括激活值）分别进行详细的统计。
+
+要使用这个内存分析工具，用户需要在启动训练时传递 ``--profiling`` 参数以启用内存分析。完成内存分析后，用户可以在 ``memory_trace/rank{}_dp{}_tp{}`` 文件夹中找到特定 rank 对应的内存分析结果（包括不同时间点的内存使用日志和显示总体内存使用情况的太阳图表）。
+
+实际运行生成的 ``memory_trace`` 目录结构如下：
+
+.. code-block:: bash
+
+    # tree ./memory_trace -L 2
+    ./memory_trace
+    ├── rank0_dp0_tp0                              # Profiling results for a specific rank device
+    │   ├── activation_memory_sunburst.html        # Sunburst chart showing activation memory usage
+    │   ├── grads_memory_sunburst.html             # Sunburst chart showing gradient memory usage
+    │   ├── memory.log                             # Log of GPU memory usage at different time points
+    │   ├── os_memory_sunburst.html                # Sunburst chart showing optimizer state memory usage
+    │   ├── params_memory_sunburst.html            # Sunburst chart showing parameter memory usage
+    │   └── summary_sunburst.html                  # Sunburst chart showing overall memory usage
+    ├── rank1_dp1_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank2_dp2_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank3_dp3_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank4_dp4_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank5_dp5_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    ├── rank6_dp6_tp0
+    │   ├── activation_memory_sunburst.html
+    │   ├── grads_memory_sunburst.html
+    │   ├── memory.log
+    │   ├── os_memory_sunburst.html
+    │   ├── params_memory_sunburst.html
+    │   └── summary_sunburst.html
+    └── rank7_dp7_tp0
+        ├── activation_memory_sunburst.html
+        ├── grads_memory_sunburst.html
+        ├── memory.log
+        ├── os_memory_sunburst.html
+        ├── params_memory_sunburst.html
+        └── summary_sunburst.html
+
+其中， ``memory.log`` 的内容示例如下：
+
+.. code-block:: bash
+
+    Memory State:
+    time: 37.56313228607178
+    ---summary---
+    total_memory: 55953.56 MB
+    params_memory: 13965.51 MB, grads_memory: 13965.51 MB, os_params_memory: 3461.52 MB, os_state_memory: 6923.03 MB, activation_memory: 17638.00 MB
+
+    Memory State:
+    time: 38.46969723701477
+    ---summary---
+    total_memory: 38315.56 MB
+    params_memory: 13965.51 MB, grads_memory: 13965.51 MB, os_params_memory: 3461.52 MB, os_state_memory: 6923.03 MB, activation_memory: 0.00 MB
+    ---Layout---
+    params_layout:
+    layer: param_mem, layer_mem: 0.00 MB, total_mem: 13965.51 MB
+    layer: param_mem.embedding, layer_mem: 0.00 MB, total_mem: 806.00 MB
+    layer: param_mem.embedding.weight, layer_mem: 806.00 MB, total_mem: 806.00 MB
+    layer: param_mem.blocks, layer_mem: 0.00 MB, total_mem: 12353.50 MB
+    layer: param_mem.blocks.0, layer_mem: 0.00 MB, total_mem: 386.05 MB
+    layer: param_mem.blocks.0.mixer, layer_mem: 0.00 MB, total_mem: 128.03 MB
+    layer: param_mem.blocks.0.mixer.Wqkv, layer_mem: 0.00 MB, total_mem: 96.02 MB
+    layer: param_mem.blocks.0.mixer.Wqkv.weight, layer_mem: 96.00 MB, total_mem: 96.00 MB
+    layer: param_mem.blocks.0.mixer.Wqkv.bias, layer_mem: 0.02 MB, total_mem: 0.02 MB
+    layer: param_mem.blocks.0.mixer.out_proj, layer_mem: 0.00 MB, total_mem: 32.01 MB
+    layer: param_mem.blocks.0.mixer.out_proj.weight, layer_mem: 32.00 MB, total_mem: 32.00 MB
+    layer: param_mem.blocks.0.mixer.out_proj.bias, layer_mem: 0.01 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm1, layer_mem: 0.00 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm1.weight, layer_mem: 0.01 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm2, layer_mem: 0.00 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.norm2.weight, layer_mem: 0.01 MB, total_mem: 0.01 MB
+    layer: param_mem.blocks.0.mlp, layer_mem: 0.00 MB, total_mem: 258.00 MB
+    layer: param_mem.blocks.0.mlp.w1, layer_mem: 0.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w1.weight, layer_mem: 86.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w2, layer_mem: 0.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w2.weight, layer_mem: 86.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w3, layer_mem: 0.00 MB, total_mem: 86.00 MB
+    layer: param_mem.blocks.0.mlp.w3.weight, layer_mem: 86.00 MB, total_mem: 86.00 MB
+    ......
+    grads_layout:
+    ......
+    os_params_layout:
+    ......
+    os_state_layout:
+    ......
+    activation_base_layout:
+    ......
+
+模型参数的太阳图示例如下：
+
+.. figure:: ../../imgs/params_memory_sunburst.png
+  :scale: 50%
+  :class: with-border
+
+.. autoclass:: internlm.utils.simple_memory_profiler.SimpleMemoryProfiler
+    :members:
--- a/doc/code-docs/source/qa.rst
+++ b/doc/code-docs/source/qa.rst
@ -0,0 +1,2 @@
+问&答
+=====
--- a/doc/code-docs/source/training.rst
+++ b/doc/code-docs/source/training.rst
@ -0,0 +1,9 @@
+训练 API
+============
+
+InternLM 的训练 API 由 ``internlm.core.trainer.Trainer`` 管理。在定义了训练引擎和调度器之后，我们可以调用 Trainer API 来执行模型训练、评估、梯度清零和参数更新等。
+
+有关详细用法，请参阅 Trainer API 文档和示例。
+
+.. autoclass:: internlm.core.trainer.Trainer
+    :members:
--- a/doc/code-docs/source/usage.md
+++ b/doc/code-docs/source/usage.md
@ -0,0 +1,4 @@
+```{include} ../../usage.md
+:relative-docs: docs/
+:relative-images:
+```
--- a/doc/en/install.md
+++ b/doc/en/install.md
@ -1,4 +1,4 @@
-## InternLM Installation
+## Installation

 ### Environment Preparation
 The required packages and corresponding version are shown as follows:
@ -59,12 +59,28 @@ cd ../../
 ```

 ### Environment Image
-Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
+Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
+
+#### Image Configuration and Build
+The configuration and build of the Dockerfile are implemented through the docker.Makefile. To build the image, execute the following command in the root directory of InternLM:
+``` bash
+make -f docker.Makefile BASE_OS=centos7
+``` 
+In docker.Makefile, you can customize the base image, environment versions, etc., and the corresponding parameters can be passed directly on the command line. BASE_OS supports both ubuntu20.04 and centos7.
+
+#### Pull Standard Image
+Standard images based on ubuntu and centos have been built and can be pulled directly:

 ```bash
-# pull image
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# start container
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
 ```
+
+#### Run Container
+For a local image built from the dockerfile or a pulled standard image, use the following command to run and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+The default directory in the container is `/InternLM`, please start training according to the [Usage](./usage.md).
--- a/doc/en/structure.md
+++ b/doc/en/structure.md
@ -6,11 +6,14 @@ The system code file structure is shown below:
 ├── internlm                                 # Main directory of the system code
 │   ├── apis                                 # Interface module, containing some interface functions related to inference, etc.
 │   ├── core                                 # Core module, managing parallel context and training scheduling engine for training and inference
+│   │   ├── communication                    # Communication module, responsible for p2p communication in pipeline parallel scheduling
 │   │   ├── context                          # Context module, mainly responsible for initializing parallel process groups and managing parallel context
 │   │   │   ├── parallel_context.py
 │   │   │   └── process_group_initializer.py
+│   │   ├── scheduler                        # Scheduling module, which manages schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
+│   │   │   ├── no_pipeline_scheduler.py
+│   │   │   └── pipeline_scheduler.py
 │   │   ├── engine.py                        # Responsible for managing the training and evaluation process of the model
-│   │   ├── no_pipeline_scheduler.py         # Scheduler for parallel training
 │   │   └── trainer.py                       # Responsible for managing the training engine and scheduler
 │   ├── data                                 # Data module, responsible for managing dataset generation and processing
 │   ├── initialize                           # Initialization module, responsible for managing distributed environment startup and trainer initialization
--- a/doc/en/usage.md
+++ b/doc/en/usage.md
@ -1,4 +1,4 @@
-## Pre-training and Fine-tuning Tutorial for InternLM
+## Quickstart Guide for Pre-training and Fine-tuning

 To start a demo model training, you need to prepare three things: **installation**, **dataset preparation**, and **model training configuration**. In this guide, we will first cover the steps for dataset preparation and then briefly describe the model training configuration.

@ -74,7 +74,173 @@ It is recommended that users refer to alpaca_tokenizer.py to write new scripts t

 ### Training Configuration

-Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, and parallel configurations required to start a model training.
+Taking the configuration file `configs/7B_sft.py` for the 7B demo as an example, let's discuss the data, model, parallel and monitoring configurations required to start a model training.
+```python
+JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 2048
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'internlm' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batch contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, means disable evaluate
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=50000,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with less than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel:
+    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+        so parameters will be divided within the range of dp.
+    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+tensor parallel: tensor parallel size, usually the number of GPUs per node.
+"""
+parallel = dict(
+    zero1=8,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
+```

 #### Data Configuration
 Here are the key parameters and their explanations for data configuration:
@ -93,10 +259,7 @@ data = dict(
 )
 ```

-<div align="left">
-    <img src="../imgs/pack_into_one.png" width="550"/>
-</div>
-
+![pack_into_one](../imgs/pack_into_one.png)

 Currently, it supports passing the dataset file path `train_folder`, and the file format is required to be as follows:

@ -115,19 +278,19 @@ If you want to load a model checkpoint when starting the training, you can confi

 ```python
 SAVE_CKPT_FOLDER = "local:/path/to/save/ckpt"
-MODEL_ONLY_FOLDER = "local:/path/to/load/init/model/ckpt"
 LOAD_CKPT_FOLDER = "local:/path/to/load/resume/ckpt"
 ckpt = dict(
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save the model and optimizer checkpoints
    checkpoint_every=float("inf"),  # Save a checkpoint every specified number of steps, default value is inf
-    load_model_only_folder=MODEL_ONLY_FOLDER,  # Path to load the initial model weights, only load model weights without loading optimizer weights, training will start from the first step
-    load_ckpt_folder=LOAD_CKPT_FOLDER,  # Path to load the weights of the model and optimizer for resuming training, training will resume from the specified step
-    load_optimizer=True,  # Whether to load optimizer weights when resuming training, default value is True
+    # When resuming training from a breakpoint:
+    # (1) 'path' is the path of the loaded checkpoint.
+    # (2) 'content' indicates which state will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # (3) 'ckpt_type' indicates which type ckpt will be loaded, currently supported: "internlm"
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
 )
 ```

 Note:
- `load_model_only_folder` and `load_ckpt_folder` cannot be set at the same time.
 - If the path starts with `local:`, it means the file is stored in the local file system. If it starts with `boto3:`, it means the file is stored in the remote OSS.

 The configuration for the model is as follows:
@ -165,17 +328,21 @@ Training parallel configuration example:
 ```python
 parallel = dict(
    zero1=8,
-    pipeline=1,
    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
 )
 ```

 - zero1: zero parallel strategy, divided into the following three cases, default value is -1
-  - When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
-  - When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
-  - When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- pipeline: pipeline parallel size, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
+  - When `zero1 <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
+  - When `zero1 == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
+  - When `zero1 > 1` and `zero1 <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
+- tensor: tensor parallel size, usually the number of GPUs per node, default is 1
+- pipeline: pipeline parallel strategy
+   - size: pipeline parallel size, the default value is 1
+   - interleaved_overlap: bool type, whether to enable communication optimization when the interleaved scheduler is used, the default value is False
+- sequence_parallel: Whether to enable sequence parallelism, the default value is False

 Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`

--- a/doc/imgs/hybrid_parallel_training.png
+++ b/doc/imgs/hybrid_parallel_training.png
--- a/doc/imgs/params_memory_sunburst.png
+++ b/doc/imgs/params_memory_sunburst.png
--- a/doc/imgs/pipeline_schedule.png
+++ b/doc/imgs/pipeline_schedule.png
--- a/doc/imgs/sequence_parallel.png
+++ b/doc/imgs/sequence_parallel.png
--- a/doc/imgs/tensor_parallel.png
+++ b/doc/imgs/tensor_parallel.png
--- a/doc/imgs/torch_profiler_trace.png
+++ b/doc/imgs/torch_profiler_trace.png
--- a/doc/install.md
+++ b/doc/install.md
@ -1,4 +1,4 @@
-## InternLM项目的依赖安装
+## 环境安装

 ### 环境准备
 首先，需要安装的依赖包及对应版本列表如下：
@ -59,11 +59,28 @@ cd ../../
 ```

 ### 环境镜像
-用户可以从 https://hub.docker.com/r/sunpengsdu/internlm 获取安装了 InternLM 运行环境的镜像，拉取镜像及启动容器的命令如下：
+用户可以使用提供的 dockerfile 结合 docker.Makefile 来构建自己的镜像，或者也可以从 https://hub.docker.com/r/internlm/internlm 获取安装了 InternLM 运行环境的镜像。
+
+#### 镜像配置及构造
+dockerfile 的配置以及构造均通过 docker.Makefile 文件实现，在 InternLM 根目录下执行如下命令即可 build 镜像：
+``` bash
+make -f docker.Makefile BASE_OS=centos7
+``` 
+在 docker.Makefile 中可自定义基础镜像，环境版本等内容，对应参数可直接通过命令行传递。对于 BASE_OS 分别支持 ubuntu20.04 和 centos7。
+
+#### 镜像拉取
+基于 ubuntu 和 centos 的标准镜像已经 build 完成也可直接拉取使用：
+
 ```bash
-# 拉取镜像
-docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-# 启动容器
-docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
-docker exec -it myinternlm bash
+# ubuntu20.04
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
+# centos7
+docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
 ```
+
+#### 容器启动
+对于使用 dockerfile 构建或拉取的本地标准镜像，使用如下命令启动并进入容器：
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
+```
+容器内默认目录即 `/InternLM`，根据[使用文档](./usage.md)即可启动训练。
--- a/doc/structure.md
+++ b/doc/structure.md
@ -6,11 +6,14 @@
 ├── internlm                                 # 系统代码的主目录
 │   ├── apis                                 # 接口模块，包含一些关于推理等的接口函数
 │   ├── core                                 # 核心模块，管理用于训练和推理的 parallel context 和训练调度引擎
+│   │   ├── communication                    # 通信模块，负责流水线并行调度中的p2p通信
 │   │   ├── context                          # context 模块，主要负责初始化并行进程组，并管理 parallel context
 │   │   │   ├── parallel_context.py
 │   │   │   └── process_group_initializer.py
+│   │   ├── scheduler                        # 调度模块，管理并行训练的调度器，包括非流水线并行调度器和流水线并行调度器
+│   │   │   ├── no_pipeline_scheduler.py
+│   │   │   └── pipeline_scheduler.py
 │   │   ├── engine.py                        # 负责管理模型的训练和评估过程
-│   │   ├── no_pipeline_scheduler.py         # 并行训练的调度器
 │   │   └── trainer.py                       # 负责管理训练引擎和调度器
 │   ├── data                                 # 数据模块，负责管理数据集生成和处理
 │   ├── initialize                           # 初始化模块，负责管理分布式环境启动和训练器初始化
--- a/doc/usage.md
+++ b/doc/usage.md
@ -1,4 +1,4 @@
-## 基于InternLM的预训练与微调使用教程
+## 使用教程

 启动一个 Demo 模型训练，需要进行三项准备，**安装**，**数据集准备**和**模型训练配置**。接下来，首先会介绍数据准备相关的操作，再简要描述模型训练配置相关的内容。

@ -66,7 +66,174 @@ python tools/alpaca_tokenizer.py /path/to/alpaca_dataset /path/to/output_dataset

 ### 训练配置

-以 7B Demo 的配置文件`configs/7B_sft.py`为例，介绍启动一个模型训练所需要进行的数据、模型和并行等相关的配置。
+以 7B Demo 的配置文件`configs/7B_sft.py`为例：
+```python
+JOB_NAME = "7b_train"
+DO_ALERT = False
+
+SEQ_LEN = 2048
+HIDDEN_SIZE = 4096
+NUM_ATTENTION_HEAD = 32
+MLP_RATIO = 8 / 3
+NUM_LAYER = 32
+VOCAB_SIZE = 103168
+
+MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
+# Ckpt folder format:
+# fs: 'local:/mnt/nfs/XXX'
+SAVE_CKPT_FOLDER = "local:llm_ckpts"
+LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
+
+# boto3 Ckpt folder format:
+# import os
+# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint
+# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm"
+# LOAD_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm/snapshot/1/"
+CHECKPOINT_EVERY = 50
+ckpt = dict(
+    enable_save_ckpt=False,  # enable ckpt save.
+    save_ckpt_folder=SAVE_CKPT_FOLDER,  # Path to save training ckpt.
+    # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"),
+    load_ckpt_folder="local:llm_ckpts/",
+    # 'load_ckpt_info' setting guide:
+    # 1. the 'path' indicates the ckpt path,
+    # 2. the 'content' means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all"
+    # 3. the 'ckpt_type' means the type of checkpoint to be loaded, now only 'internlm' type is supported.
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
+    checkpoint_every=CHECKPOINT_EVERY,
+    async_upload=True,  # async ckpt upload. (only work for boto3 ckpt)
+    async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/",  # path for temporary files during asynchronous upload.
+    oss_snapshot_freq=int(CHECKPOINT_EVERY / 2),  # snapshot ckpt save frequency.
+)
+
+TRAIN_FOLDER = "/path/to/dataset"
+VALID_FOLDER = "/path/to/dataset"
+data = dict(
+    seq_len=SEQ_LEN,
+    # micro_num means the number of micro_batch contained in one gradient update
+    micro_num=4,
+    # packed_length = micro_bsz * SEQ_LEN
+    micro_bsz=2,
+    # defaults to the value of micro_num
+    valid_micro_num=4,
+    # defaults to 0, means disable evaluate
+    valid_every=50,
+    pack_sample_into_one=False,
+    total_steps=50000,
+    skip_batches="",
+    rampup_batch_size="",
+    # Datasets with less than 50 rows will be discarded
+    min_length=50,
+    # train_folder=TRAIN_FOLDER,
+    # valid_folder=VALID_FOLDER,
+    empty_cache_and_diag_interval=10,
+    diag_outlier_ratio=1.1,
+)
+
+grad_scaler = dict(
+    fp16=dict(
+        # the initial loss scale, defaults to 2**16
+        initial_scale=2**16,
+        # the minimum loss scale, defaults to None
+        min_scale=1,
+        # the number of steps to increase loss scale when no overflow occurs
+        growth_interval=1000,
+    ),
+    # the multiplication factor for increasing loss scale, defaults to 2
+    growth_factor=2,
+    # the multiplication factor for decreasing loss scale, defaults to 0.5
+    backoff_factor=0.5,
+    # the maximum loss scale, defaults to None
+    max_scale=2**24,
+    # the number of overflows before decreasing loss scale, defaults to 2
+    hysteresis=2,
+)
+
+hybrid_zero_optimizer = dict(
+    # Enable low_level_optimizer overlap_communication
+    overlap_sync_grad=True,
+    overlap_sync_param=True,
+    # bucket size for nccl communication params
+    reduce_bucket_size=512 * 1024 * 1024,
+    # grad clipping
+    clip_grad_norm=1.0,
+)
+
+loss = dict(
+    label_smoothing=0,
+)
+
+adam = dict(
+    lr=1e-4,
+    adam_beta1=0.9,
+    adam_beta2=0.95,
+    adam_beta2_c=0,
+    adam_eps=1e-8,
+    weight_decay=0.01,
+)
+
+lr_scheduler = dict(
+    total_steps=data["total_steps"],
+    init_steps=0,  # optimizer_warmup_step
+    warmup_ratio=0.01,
+    eta_min=1e-5,
+    last_epoch=-1,
+)
+
+beta2_scheduler = dict(
+    init_beta2=adam["adam_beta2"],
+    c=adam["adam_beta2_c"],
+    cur_iter=-1,
+)
+
+model = dict(
+    checkpoint=False,  # The proportion of layers for activation checkpointing, the optional values are True/False/[0-1]
+    num_attention_heads=NUM_ATTENTION_HEAD,
+    embed_split_hidden=True,
+    vocab_size=VOCAB_SIZE,
+    embed_grad_scale=1,
+    parallel_output=True,
+    hidden_size=HIDDEN_SIZE,
+    num_layers=NUM_LAYER,
+    mlp_ratio=MLP_RATIO,
+    apply_post_layer_norm=False,
+    dtype="torch.float16",  # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
+    norm_type="rmsnorm",
+    layer_norm_epsilon=1e-5,
+    use_flash_attn=True,
+    num_chunks=1,  # if num_chunks > 1, interleaved pipeline scheduler is used.
+)
+"""
+zero1 parallel:
+    1. if zero1 <= 0, The size of the zero process group is equal to the size of the dp process group,
+        so parameters will be divided within the range of dp.
+    2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
+    3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
+        For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
+pipeline parallel (dict):
+    1. size: int, the size of pipeline parallel.
+    2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler.
+tensor parallel: tensor parallel size, usually the number of GPUs per node.
+"""
+parallel = dict(
+    zero1=8,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
+)
+
+cudnn_deterministic = False
+cudnn_benchmark = False
+
+monitor = dict(
+    # feishu alert configs
+    alert=dict(
+        enable_feishu_alert=DO_ALERT,
+        feishu_alert_address=None,  # feishu webhook to send alert message
+        light_monitor_address=None,  # light_monitor address to send heartbeat
+    ),
+)
+```
+接下来将详细介绍启动一个模型训练所需要进行的数据、模型、并行和监控等相关的配置。

 #### 数据配置
 数据相关的关键参数配置及释义如下所示：
@ -84,9 +251,7 @@ data = dict(
 )
 ```

-<div align="left">
-    <img src="./imgs/pack_into_one.png" width="550"/>
-</div>
+![pack_into_one](./imgs/pack_into_one.png)


 目前支持传入数据集文件路径`train_folder`，且要求文件格式如下：
@ -103,18 +268,17 @@ data = dict(
 如果在启动训练时要加载模型 `checkpoint`，可进行如下相关配置：
 ```python
 SAVE_CKPT_FOLDER = "local:/path/to/save/ckpt"
-MODEL_ONLY_FOLDER = "local:/path/to/load/init/model/ckpt"
 LOAD_CKPT_FOLDER = "local:/path/to/load/resume/ckpt"
 ckpt = dict(
    save_ckpt_folder=SAVE_CKPT_FOLDER,  # 存储模型和优化器 checkpoint 的路径
    checkpoint_every=float("inf"),  # 每多少个 step 存储一次 checkpoint，默认值为 inf
-    load_model_only_folder=MODEL_ONLY_FOLDER,  # 加载模型初始权重的路径，只加载模型权重，不加载优化器权重，训练将从第一个 step 开始
-    load_ckpt_folder=LOAD_CKPT_FOLDER,  # 断点续训时，加载模型和优化器等权重的路径，将从指定的 step 恢复训练
-    load_optimizer=True,  # 断点续训时，是否需要加载优化器权重，默认值为 True
+    # 断点续训时，加载模型和优化器等权重的路径，将从指定的 step 恢复训练
+    # content 表示哪些状态会被加载，支持： "model", "sampler", "optimizer", "scheduler", "all"
+    # ckpt_type 表示加载的模型类型，目前支持: "internlm"
+    load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internlm"),
 )
 ```
 注意：
- `load_model_only_folder`与`load_ckpt_folder`不能同时设置
 - 路径若以 `local:` 为前缀，则存储在本地文件系统；若以 `boto3:` 为前缀，则存储在远程 oss 上

 模型相关关键参数配置如下所示：
@ -151,16 +315,20 @@ model = dict(
 ```python
 parallel = dict(
    zero1=8,
-    pipeline=1,
    tensor=1,
+    pipeline=dict(size=1, interleaved_overlap=True),
+    sequence_parallel=False,
 )
 ```
 - zero1：zero 并行策略，分如下三种情况，默认值为 -1
-  - 当`size <= 0`，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
-  - 当`size == 1`，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
-  - 当`size > 1`且`size <= data_parallel_world_size`，则 zero1 进程组是数据并行进程组的子集
- pipeline：流水线并行大小，默认值为 1
+  - 当`zero1 <= 0`，则 zero1 进程组的大小等于数据并行进程组的大小，因此优化器状态参数将在数据并行范围内分配
+  - 当`zero1 == 1`，则不使用 zero1 ，所有数据并行组保留完整的优化器状态参数
+  - 当`zero1 > 1`且`zero1 <= data_parallel_world_size`，则 zero1 进程组是数据并行进程组的子集
 - tensor：张量并行大小，通常是每个节点的 GPU 数量，默认值为 1
+- pipeline：流水线并行策略
+  - size：流水线并行大小，默认值为 1
+  - interleaved_overlap：bool 类型，交错式调度时，开启或关闭通信优化，默认值为关闭
+- sequence_parallel：是否开启序列并行，默认值为 False

 注意：`数据并行大小 = 总的 GPU 数目 / 流水线并行大小 / 张量并行大小`

--- a/docker.Makefile
+++ b/docker.Makefile
@ -0,0 +1,107 @@
+# Image coordinates; the full image name is <registry>/<org>/<image>.
+DOCKER_REGISTRY          ?= docker.io
+DOCKER_ORG               ?= my
+DOCKER_IMAGE             ?= internlm
+DOCKER_FULL_NAME          = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)
+
+# Toolchain versions. GCC_VERSION is defined exactly once, here; it is
+# forwarded to the Dockerfile through BUILD_ARGS.
+CUDA_VERSION              = 11.7.1
+GCC_VERSION               = 10.2.0
+
+CUDNN_VERSION             = 8
+BASE_RUNTIME              =
+# Supported values: ubuntu20.04  centos7
+BASE_OS                   = centos7
+# Base image the devel-image target builds from.
+BASE_DEVEL                = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-${BASE_OS}
+# The conda channel to use to install cudatoolkit
+CUDA_CHANNEL              = nvidia
+# The conda channel to use to install pytorch / torchvision
+INSTALL_CHANNEL          ?= pytorch
+
+# Versions forwarded to the Dockerfile as --build-arg values (see BUILD_ARGS).
+PYTHON_VERSION           ?= 3.10
+PYTORCH_VERSION          ?= 1.13.1
+TORCHVISION_VERSION      ?= 0.14.1
+TORCHAUDIO_VERSION       ?= 0.13.1
+BUILD_PROGRESS           ?= auto
+TRITON_VERSION           ?=
+GMP_VERSION              ?= 6.2.1
+MPFR_VERSION             ?= 4.1.0
+MPC_VERSION              ?= 1.2.1
+# Optional proxies used only while building the image.
+HTTPS_PROXY_I            ?=
+HTTP_PROXY_I             ?=
+FLASH_ATTEN_VERSION      ?= 1.0.5
+FLASH_ATTEN_TAG          ?= v${FLASH_ATTEN_VERSION}
+
+# --build-arg values handed to `docker build`. The variable stays recursively
+# expanded, so target-specific values such as BASE_IMAGE are resolved when the
+# recipe runs, not when this file is parsed.
+BUILD_ARGS                = --build-arg BASE_IMAGE=$(BASE_IMAGE)
+BUILD_ARGS               += --build-arg PYTHON_VERSION=$(PYTHON_VERSION)
+BUILD_ARGS               += --build-arg CUDA_VERSION=$(CUDA_VERSION)
+BUILD_ARGS               += --build-arg CUDA_CHANNEL=$(CUDA_CHANNEL)
+BUILD_ARGS               += --build-arg PYTORCH_VERSION=$(PYTORCH_VERSION)
+BUILD_ARGS               += --build-arg TORCHVISION_VERSION=$(TORCHVISION_VERSION)
+BUILD_ARGS               += --build-arg TORCHAUDIO_VERSION=$(TORCHAUDIO_VERSION)
+BUILD_ARGS               += --build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL)
+BUILD_ARGS               += --build-arg TRITON_VERSION=$(TRITON_VERSION)
+BUILD_ARGS               += --build-arg GMP_VERSION=$(GMP_VERSION)
+BUILD_ARGS               += --build-arg MPFR_VERSION=$(MPFR_VERSION)
+BUILD_ARGS               += --build-arg MPC_VERSION=$(MPC_VERSION)
+BUILD_ARGS               += --build-arg GCC_VERSION=$(GCC_VERSION)
+BUILD_ARGS               += --build-arg https_proxy=$(HTTPS_PROXY_I)
+BUILD_ARGS               += --build-arg http_proxy=$(HTTP_PROXY_I)
+BUILD_ARGS               += --build-arg FLASH_ATTEN_TAG=$(FLASH_ATTEN_TAG)
+
+# Extra flags appended verbatim to the docker build command line.
+EXTRA_DOCKER_BUILD_FLAGS ?=
+
+# Docker sub-command used for building; replaced by "buildx build" below
+# whenever USE_BUILDX is non-empty.
+BUILD                    ?= build
+# Intentionally left blank
+PLATFORMS_FLAG           ?=
+PUSH_FLAG                ?=
+USE_BUILDX               ?=1
+BUILD_PLATFORMS          ?=
+# NOTE: WITH_PUSH is not consumed while the push block below stays disabled.
+WITH_PUSH                ?= false
+# NOTE: BUILD_TYPE is only referenced by the disabled `--target` option of
+# DOCKER_BUILD; the spelling matches the stage name used in the Dockerfiles.
+BUILD_TYPE               ?= intrenlm-dev
+
+# Setup buildx flags: pass --platform only when BUILD_PLATFORMS is given.
+ifneq ("$(USE_BUILDX)","")
+BUILD                     =  buildx build
+ifneq ("$(BUILD_PLATFORMS)","")
+PLATFORMS_FLAG            = --platform="$(BUILD_PLATFORMS)"
+endif
+endif
+
+# Disabled: pushing the image after the build. Re-enable inside the
+# USE_BUILDX conditional above if `--push` support is wanted again.
+# ifeq ("$(WITH_PUSH)","true")
+# PUSH_FLAG               = --push
+# endif
+
+# Pick the Dockerfile from BASE_OS: any value containing "centos" selects the
+# centos Dockerfile, everything else falls back to the ubuntu one.
+ifneq (,$(findstring centos,$(BASE_OS)))
+    DOCKERFILE_PATH ?= ./docker/Dockerfile-centos
+else
+    DOCKERFILE_PATH ?= ./docker/Dockerfile-ubuntu
+endif
+
+# Full build command line, run from the repository root (build context ".").
+# -f selects the Dockerfile chosen above; -t tags the image with the
+# target-specific DOCKER_TAG; BUILD_ARGS carries all version pins.
+DOCKER_BUILD              = DOCKER_BUILDKIT=1 \
+                            docker $(BUILD) \
+                                   --progress=$(BUILD_PROGRESS) \
+                                   $(EXTRA_DOCKER_BUILD_FLAGS) \
+                                   $(PLATFORMS_FLAG) \
+                                   $(PUSH_FLAG) \
+                                   -f $(DOCKERFILE_PATH) \
+                                   -t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
+                                   $(BUILD_ARGS) .
+
+                                   # Disabled option: --target $(BUILD_TYPE)
+
+# Default target: build the development image.
+.PHONY: all
+all: devel-image
+
+# Build the image from the CUDA "devel" base. BASE_IMAGE and DOCKER_TAG are
+# target-specific, so they are visible when DOCKER_BUILD is expanded here.
+.PHONY: devel-image
+devel-image: BASE_IMAGE := $(BASE_DEVEL)
+devel-image: DOCKER_TAG := torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}-flashatten${FLASH_ATTEN_VERSION}-${BASE_OS}
+devel-image:
+	$(DOCKER_BUILD)
+
+# Remove every local image of DOCKER_FULL_NAME. The ID list is de-duplicated
+# (one image may carry several tags) and `docker rmi` is skipped when nothing
+# matches, so an empty list no longer produces a usage error. The leading `-`
+# keeps failures non-fatal, as before.
+.PHONY: clean
+clean:
+	-images="$$(docker images -q $(DOCKER_FULL_NAME) | sort -u)"; \
+	if [ -n "$$images" ]; then docker rmi -f $$images; fi
--- a/docker/Dockerfile-centos
+++ b/docker/Dockerfile-centos
@ -0,0 +1,131 @@
+# Global build arguments; they must be re-declared inside a stage to be used there.
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on centos
+##############################################################################
+FROM ${BASE_IMAGE} as base
+ARG https_proxy
+ARG http_proxy
+# Install the build prerequisites, then drop the yum cache in the same layer
+# so the package metadata and downloaded RPMs are not baked into the image.
+RUN yum install deltarpm -y && yum update -y \
+    && yum install -y \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        bzip2 \
+        gcc \
+        gcc-c++ \
+        file \
+        texinfo \
+        which \
+    && yum clean all \
+    && rm -rf /var/cache/yum
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base as conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+# Download the Miniconda installer matching the target platform (x86_64 unless
+# TARGETPLATFORM is linux/arm64). Only `-o` is passed: the installer must land
+# at ~/miniconda.sh for the next step, and the former extra `-O` had no URL
+# left to apply to.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+# Install conda into /opt/conda, pin the requested Python version and remove
+# the installer and package caches in the same layer.
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda as dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+# Build GMP, MPFR and MPC (prerequisites of the GCC build below) into
+# versioned prefixes under /usr/local, then bootstrap ninja from source.
+# Parallelism follows the build host (`nproc`) instead of a fixed -j64, and
+# MPC is fetched over HTTPS from the GNU server like the other tarballs.
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j"$(nproc)" && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j"$(nproc)" && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpc/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j"$(nproc)" && make install \
+    && cd .. \
+    && git clone https://github.com/ninja-build/ninja.git \
+    && cd ninja \
+    && git checkout release \
+    && ./configure.py --bootstrap \
+    && mv ./ninja /usr/bin \
+    && cd ..
+
+# Make the freshly built MPFR shared library visible to the dynamic loader.
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
+
+# GMP/MPFR/MPC versions and the proxies are already declared for this stage
+# above; only GCC_VERSION is new here.
+ARG GCC_VERSION
+# Build GCC (C and C++ only, no multilib) out of tree against the libraries
+# installed above.
+RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j"$(nproc)" && make install
+
+# Put the new compiler first on PATH and make it the default C/C++ compiler.
+# NOTE(review): CUDA_PATH is not set anywhere in this Dockerfile; presumably it
+# is expected to come from the base image. If it is unset these entries
+# degrade to /lib64 and /bin. Confirm against the nvidia/cuda base image.
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep as intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+ARG https_proxy
+ARG http_proxy
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+# Fetch the submodules, install the Python requirements, then build
+# flash-attention (core package plus its four csrc extensions) and apex from
+# the third_party checkouts. Every step changes into its directory by absolute
+# path; the pip cache is purged at the end of the same layer.
+RUN git submodule update --init --recursive && \
+    /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt && \
+    /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt && \
+    cd /InternLM/third_party/flash-attention && \
+    /opt/conda/bin/python setup.py install && \
+    cd /InternLM/third_party/flash-attention/csrc/fused_dense_lib && /opt/conda/bin/pip install -v . && \
+    cd /InternLM/third_party/flash-attention/csrc/xentropy && /opt/conda/bin/pip install -v . && \
+    cd /InternLM/third_party/flash-attention/csrc/rotary && /opt/conda/bin/pip install -v . && \
+    cd /InternLM/third_party/flash-attention/csrc/layer_norm && /opt/conda/bin/pip install -v . && \
+    cd /InternLM/third_party/apex && \
+    /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --global-option="--cpp_ext" --global-option="--cuda_ext" ./ && \
+    /opt/conda/bin/pip cache purge && \
+    rm -rf ~/.cache/pip
--- a/docker/Dockerfile-ubuntu
+++ b/docker/Dockerfile-ubuntu
@ -0,0 +1,112 @@
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on ubuntu
+##############################################################################
+FROM ${BASE_IMAGE} as base
+ARG https_proxy
+ARG http_proxy
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        ninja-build
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base as conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+# Download the Miniconda installer for the target architecture
+# (aarch64 for linux/arm64, x86_64 otherwise) to ~/miniconda.sh.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda as dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GCC_VERSION
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+# Build GMP, MPFR, MPC and finally GCC from source, each installed under its
+# own /usr/local/<name>-<version> prefix. Every source tarball is fetched over
+# HTTPS because the resulting compiler is used for all later builds.
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxJf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j64 && make install
+
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep as intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+ARG https_proxy
+ARG http_proxy
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+RUN git submodule update --init --recursive \
+    && /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
+    && /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
+    && cd /InternLM/third_party/flash-attention \
+    && /opt/conda/bin/python setup.py install \
+    && cd ./csrc \
+    && cd fused_dense_lib && /opt/conda/bin/pip install -v . \
+    && cd ../xentropy && /opt/conda/bin/pip install -v . \
+    && cd ../rotary && /opt/conda/bin/pip install -v . \
+    && cd ../layer_norm && /opt/conda/bin/pip install -v . \
+    && cd ../../../../ \
+    && cd ./third_party/apex \
+    && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+    && /opt/conda/bin/pip cache purge \
+    && rm -rf ~/.cache/pip
--- a/experiment/Dockerfile-centos
+++ b/experiment/Dockerfile-centos
@ -0,0 +1,161 @@
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on centos
+##############################################################################
+FROM ${BASE_IMAGE} as base
+ARG https_proxy
+ARG http_proxy
+RUN yum install deltarpm -y && yum update -y \
+    && yum install -y \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        bzip2 \
+        gcc \
+        gcc-c++ \
+        file \
+        texinfo \
+        which
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base as conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+# Download the Miniconda installer for the target architecture
+# (aarch64 for linux/arm64, x86_64 otherwise) to ~/miniconda.sh.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda as dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+# Build GMP, MPFR and MPC from source, each installed under its own
+# /usr/local/<name>-<version> prefix, then bootstrap ninja from its release
+# branch and place the binary in /usr/bin. Every source tarball is fetched
+# over HTTPS.
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && git clone https://github.com/ninja-build/ninja.git \
+    && cd ninja \
+    && git checkout release \
+    && ./configure.py --bootstrap \
+    && mv ./ninja /usr/bin \
+    && cd ..
+
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
+
+ARG https_proxy
+ARG http_proxy
+ARG GCC_VERSION
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j64 && make install
+
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep as intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+ARG https_proxy
+ARG http_proxy
+ARG PYTORCH_VERSION
+ARG TORCHVISION_VERSION
+ARG TORCHAUDIO_VERSION
+
+RUN /opt/conda/bin/pip --no-cache-dir install \
+    transformers==4.29.2 \
+    sentencepiece \
+    numpy \
+    tqdm \
+    psutil \
+    packaging \
+    pre-commit \
+    ninja \
+    gputil \
+    pytest \
+    packaging \
+    boto3 \
+    botocore \
+    torch-scatter \
+    pyecharts \
+    -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
+    && /opt/conda/bin/pip --no-cache-dir install \
+    --extra-index-url https://download.pytorch.org/whl/cu117 \
+    torch==${PYTORCH_VERSION}+cu117 \
+    torchvision==${TORCHVISION_VERSION}+cu117 \
+    torchaudio==${TORCHAUDIO_VERSION}
+
+ARG https_proxy
+ARG http_proxy
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+ARG FLASH_ATTEN_TAG
+
+RUN git submodule update --init --recursive \
+    && cd /InternLM/third_party/flash-attention \
+    && git checkout ${FLASH_ATTEN_TAG} \
+    && /opt/conda/bin/python setup.py install \
+    && cd ./csrc \
+    && cd fused_dense_lib && /opt/conda/bin/pip install -v . \
+    && cd ../xentropy && /opt/conda/bin/pip install -v . \
+    && cd ../rotary && /opt/conda/bin/pip install -v . \
+    && cd ../layer_norm && /opt/conda/bin/pip install -v . \
+    && cd ../../../../ \
+    && cd ./third_party/apex \
+    && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+    && /opt/conda/bin/pip cache purge \
+    && rm -rf ~/.cache/pip
--- a/experiment/Dockerfile-ubuntu
+++ b/experiment/Dockerfile-ubuntu
@ -0,0 +1,142 @@
+ARG BASE_IMAGE
+ARG https_proxy
+ARG http_proxy
+
+##############################################################################
+# Install the basic environment on ubuntu
+##############################################################################
+FROM ${BASE_IMAGE} as base
+ARG https_proxy
+ARG http_proxy
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        cmake \
+        curl \
+        git \
+        wget \
+        tar \
+        m4 \
+        ninja-build
+
+
+##############################################################################
+# Install the conda environment
+##############################################################################
+FROM base as conda
+ARG PYTHON_VERSION=3.10
+ARG TARGETPLATFORM
+ARG https_proxy
+ARG http_proxy
+# Download the Miniconda installer for the target architecture
+# (aarch64 for linux/arm64, x86_64 otherwise) to ~/miniconda.sh.
+RUN case ${TARGETPLATFORM} in \
+         "linux/arm64")  MINICONDA_ARCH=aarch64  ;; \
+         *)              MINICONDA_ARCH=x86_64   ;; \
+    esac && \
+    curl -fsSL -v -o ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
+
+RUN chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
+    /opt/conda/bin/conda clean -ya
+
+
+##############################################################################
+# Install environment dependencies
+##############################################################################
+FROM conda as dep
+WORKDIR /dep
+ARG https_proxy
+ARG http_proxy
+ARG GCC_VERSION
+ARG GMP_VERSION
+ARG MPFR_VERSION
+ARG MPC_VERSION
+# Build GMP, MPFR, MPC and finally GCC from source, each installed under its
+# own /usr/local/<name>-<version> prefix. Every source tarball is fetched over
+# HTTPS because the resulting compiler is used for all later builds.
+RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
+    && tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
+    && cd gmp-${GMP_VERSION}/ \
+    && ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
+    && tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
+    && cd mpfr-${MPFR_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
+    && tar -vxf mpc-${MPC_VERSION}.tar.gz \
+    && cd mpc-${MPC_VERSION}/ \
+    && ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
+    && make -j64 && make install \
+    && cd .. \
+    && wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
+    && tar -vxJf gcc-${GCC_VERSION}.tar.xz \
+    && mkdir build \
+    && cd build/ \
+    && ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
+       --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
+    && make -j64 && make install
+
+ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
+ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
+ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
+ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
+ENV CC=${GCC_HOME}/bin/gcc
+ENV CXX=${GCC_HOME}/bin/c++
+
+
+##############################################################################
+# Install InternLM development environment, including flash-attention and apex
+##############################################################################
+FROM dep as intrenlm-dev
+COPY . /InternLM
+WORKDIR /InternLM
+ARG https_proxy
+ARG http_proxy
+ARG PYTORCH_VERSION
+ARG TORCHVISION_VERSION
+ARG TORCHAUDIO_VERSION
+
+RUN /opt/conda/bin/pip --no-cache-dir install \
+    transformers==4.29.2 \
+    sentencepiece \
+    numpy \
+    tqdm \
+    psutil \
+    packaging \
+    pre-commit \
+    ninja \
+    gputil \
+    pytest \
+    packaging \
+    boto3 \
+    botocore \
+    torch-scatter \
+    pyecharts \
+    -f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
+    && /opt/conda/bin/pip --no-cache-dir install \
+    --extra-index-url https://download.pytorch.org/whl/cu117 \
+    torch==${PYTORCH_VERSION}+cu117 \
+    torchvision==${TORCHVISION_VERSION}+cu117 \
+    torchaudio==${TORCHAUDIO_VERSION}
+
+ARG https_proxy
+ARG http_proxy
+ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
+ARG FLASH_ATTEN_TAG
+
+RUN git submodule update --init --recursive \
+    && cd /InternLM/third_party/flash-attention \
+    && git checkout ${FLASH_ATTEN_TAG} \
+    && /opt/conda/bin/python setup.py install \
+    && cd ./csrc \
+    && cd fused_dense_lib && /opt/conda/bin/pip install -v . \
+    && cd ../xentropy && /opt/conda/bin/pip install -v . \
+    && cd ../rotary && /opt/conda/bin/pip install -v . \
+    && cd ../layer_norm && /opt/conda/bin/pip install -v . \
+    && cd ../../../../ \
+    && cd ./third_party/apex \
+    && /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
+    && /opt/conda/bin/pip cache purge \
+    && rm -rf ~/.cache/pip
--- a/experiment/README-CN.md
+++ b/experiment/README-CN.md
@ -0,0 +1,25 @@
+## 实验性环境镜像
+本模块用于测试新版本环境，默认测试新环境 torch=2.0.1，flash-attention=2.1.0。新环境可能具有不稳定性，标准环境安装请参考：[安装文档](../doc/install.md)
+
+### 镜像构建及拉取
+构建镜像时请于 InternLM 根目录下执行 docker.Makefile，该文件与标准环境镜像共用，所使用的 Dockerfile 位于 experiment 目录下。也可直接从 https://hub.docker.com/r/internlm/internlm 拉取镜像，命令如下：
+```bash
+# 构建镜像
+# ubuntu20.04
+make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+# centos7
+make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+
+# 拉取镜像
+# ubuntu20.04
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
+# centos7
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
+```
+
+### 容器启动
+对于使用 dockerfile 构建或拉取的本地标准镜像，使用如下命令启动并进入容器：
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
+```
+容器内默认目录即 `/InternLM`，根据[使用文档](../doc/usage.md)即可启动训练。
--- a/experiment/README-EN.md
+++ b/experiment/README-EN.md
@ -0,0 +1,25 @@
+## Experimental Environment Image
+This module is used to test a newer environment; by default it tests torch=2.0.1 and flash-attention=2.1.0. The new environment may be unstable. For the standard environment installation, please refer to the [installation guide](../doc/en/install.md).
+
+### Build and Pull Image
+To build the image, run `make -f docker.Makefile` from the InternLM root directory. This Makefile is shared with the standard environment image, and the Dockerfile used is located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm. The commands are as follows:
+```bash
+# Build Image
+# ubuntu20.04
+make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+# centos7
+make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
+
+# Pull Image
+# ubuntu20.04
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
+# centos7
+docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
+```
+
+### Run Container
+For a local standard image built from the Dockerfile or pulled from Docker Hub, use the following command to run and enter the container:
+```bash
+docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
+```
+The default directory in the container is `/InternLM`; start training by following the [Usage](../doc/en/usage.md) guide.
--- a/internlm/core/context/init.py
+++ b/internlm/core/context/init.py
@ -7,6 +7,7 @@ from .parallel_context import (
 from .process_group_initializer import (
    Initializer_Data,
    Initializer_Model,
+    Initializer_Nettest,
    Initializer_Pipeline,
    Initializer_Tensor,
    Initializer_Zero1,
@ -34,6 +35,7 @@ __all__ = [
    "Initializer_Pipeline",
    "Initializer_Data",
    "Initializer_Zero1",
+    "Initializer_Nettest",
    "ProcessGroupInitializer",
    "Initializer_Model",
    "seed",
--- a/internlm/core/context/parallel_context.py
+++ b/internlm/core/context/parallel_context.py
@ -18,6 +18,7 @@ import torch.distributed as dist

 from internlm.utils.common import SingletonMeta
 from internlm.utils.logger import get_logger
+from internlm.utils.timeout import LLM_NCCL_TIMEOUT

 from . import process_group_initializer as pgroup_initializer
 from .process_group_initializer import ParallelMode
@ -143,15 +144,21 @@ class ParallelContext(metaclass=SingletonMeta):
        self.pipeline_parallel_size = 1
        self.tensor_parallel_size = 1
        self.zero1_parallel_size = -1
-        self.expert_parallel_size = 1
+        self.nettest_parallel_size = 1
+        self.expert_parallel_size = -1
        self.num_processes_on_current_node = -1
        self.virtual_pipeline_parallel_size = None
        self.virtual_pipeline_parallel_rank = None
+        self._expert_parallel_group_names = []

    @property
    def config(self):
        return self._config

+    @property
+    def expert_parallel_group_names(self):
+        return self._expert_parallel_group_names
+
    def load_config(self, config: Union[dict, str]):
        """Loads the configuration from either a dict or a file.

@ -374,12 +381,22 @@ class ParallelContext(metaclass=SingletonMeta):
        """
        # initialize the default process group
        init_method = f"tcp://[{host}]:{port}"
-        dist.init_process_group(rank=rank, world_size=world_size, backend=backend, init_method=init_method)
+        dist.init_process_group(
+            rank=rank,
+            world_size=world_size,
+            backend=backend,
+            init_method=init_method,
+            timeout=LLM_NCCL_TIMEOUT,
+        )

        # None will give the default global process group for pytorch dist operations
        ranks = list(range(world_size))
        if use_cpu:
-            cpu_group = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else None
+            cpu_group = (
+                dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                if dist.get_backend() != "gloo"
+                else None
+            )
        else:
            cpu_group = None
        self._register_dist(rank, world_size, dist.GroupMember.WORLD, cpu_group, ranks, ParallelMode.GLOBAL)
@ -456,6 +473,7 @@ class ParallelContext(metaclass=SingletonMeta):
            self.pipeline_parallel_size,
            self.tensor_parallel_size,
            self.zero1_parallel_size,
+            self.nettest_parallel_size,
            self.expert_parallel_size,
        ]

@ -465,9 +483,10 @@ class ParallelContext(metaclass=SingletonMeta):
        initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
        initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
        initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
+        initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
        if self.pipeline_parallel_size > 1:
            initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
-        if self.config.model.num_experts > 1:
+        if self.config.model.get("num_experts", 1) > 1:
            initializers.append(pgroup_initializer.Initializer_Expert_Data(*initializer_args))
        for initializer in initializers:
            parallel_setting = initializer.init_dist_group()
@ -525,6 +544,7 @@ class ParallelContext(metaclass=SingletonMeta):
        if dpseed_with_tpoffset:
            dp_seed = seed + pipeline_offset * 1024
        add_seed(ParallelMode.DATA, dp_seed)
+        add_seed(ParallelMode.DUMMY, dp_seed)

        # model parallel seeds are different across ranks
        if self.is_initialized(ParallelMode.TENSOR):
@ -532,7 +552,11 @@ class ParallelContext(metaclass=SingletonMeta):
            tp_seed = seed + tp_rank + pipeline_offset * 1024
            add_seed(ParallelMode.TENSOR, tp_seed)

-        set_mode(ParallelMode.DATA)
+        # we do not set the random state mode to ParallelMode.DATA until model is built (instead, we use a dummy mode
+        # during model construction), this is because the random state will be different in different tensor parallel
+        # device of the same data parallel group. The underlying reason is that the device of tp_rank = 0 will perform
+        # additional random operations during the RowParallelLinear module building process.
+        set_mode(ParallelMode.DUMMY)

        seeds = get_seeds()
        seed_str = ", ".join([f"{k}: {v}" for k, v in seeds.items()])
--- a/internlm/core/context/process_group_initializer.py
+++ b/internlm/core/context/process_group_initializer.py
@ -3,11 +3,14 @@

 # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context

+import math
 from abc import ABC, abstractmethod
 from enum import Enum

 import torch.distributed as dist

+from internlm.utils.timeout import LLM_NCCL_TIMEOUT
+

 # parallel modes
 class ParallelMode(Enum):
@ -31,12 +34,18 @@ class ParallelMode(Enum):
    # zero1 parallel
    ZERO1 = "zero1"

+    # runtime network test
+    NETTEST = "nettest"
+
    # expert parallel
    EXPERT = "expert"

    # expert data parallel
    EXPERT_DATA = "expert_data"

+    # dummy mode, only used during model construction
+    DUMMY = "dummy"
+

 class ProcessGroupInitializer(ABC):
    """An object, knowing the parallelism configuration, that initializes parallel groups.
@ -59,6 +68,7 @@ class ProcessGroupInitializer(ABC):
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        zero1_parallel_size: int,
+        nettest_parallel_size: int,
        expert_parallel_size: int,
    ):
        self.rank = rank
@ -67,6 +77,7 @@ class ProcessGroupInitializer(ABC):
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.zero1_parallel_size = zero1_parallel_size
+        self.nettest_parallel_size = nettest_parallel_size
        self.expert_parallel_size = expert_parallel_size
        super().__init__()

@ -110,9 +121,13 @@ class Initializer_Data(ProcessGroupInitializer):

        for i in range(self.rank_num_per_dp_group):
            ranks = [i + j * self.rank_num_per_dp_group for j in range(self.data_parallel_size)]
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None

@ -163,9 +178,13 @@ class Initializer_Model(ProcessGroupInitializer):

        for i in range(self.num_group):
            ranks = [i * self.rank_num_per_group + j for j in range(self.rank_num_per_group)]
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None

@ -224,9 +243,13 @@ class Initializer_Pipeline(ProcessGroupInitializer):
                    )
                )
                pipe_group_size = len(ranks)
-                pipe_group = dist.new_group(ranks)
+                pipe_group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
                if use_cpu:
-                    group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else pipe_group
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else pipe_group
+                    )
                else:
                    group_cpu = None

@ -275,9 +298,13 @@ class Initializer_Tensor(ProcessGroupInitializer):

        for i in range(self.num_tensor_parallel_group):
            ranks = [i * self.tensor_parallel_size + j for j in range(self.tensor_parallel_size)]
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None

@ -332,9 +359,13 @@ class Initializer_Zero1(ProcessGroupInitializer):
                    i + (j * self.zero1_parallel_size + k) * self.rank_num_per_dp_group
                    for k in range(self.zero1_parallel_size)
                ]
-                group = dist.new_group(ranks)
+                group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
                if use_cpu:
-                    group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                    group_cpu = (
+                        dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                        if dist.get_backend() != "gloo"
+                        else group
+                    )
                else:
                    group_cpu = None

@ -348,8 +379,61 @@ class Initializer_Zero1(ProcessGroupInitializer):
        return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode


+class Initializer_Nettest(ProcessGroupInitializer):
+    """A ProcessGroupInitializer for network test, especially for NCCL.
+
+    Args:
+        rank (int): The rank of current process.
+        world_size (int): Size of whole communication world.
+        nettest_parallel_size (int): Size of a network test group.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # round up so that the trailing ranks still get a (smaller) group when
+        # world_size is not a multiple of nettest_parallel_size
+        self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
+
+    def init_dist_group(self, use_cpu: bool = False):
+        """Initialize network test groups, and assign local_ranks and groups to each gpu.
+
+        Args:
+            use_cpu (bool): If True, additionally create a gloo group for the same ranks
+                (the main group is reused when the default backend is already gloo).
+
+        Returns:
+            Tuple (local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode):
+                A network test group's information tuple.
+        """
+        local_rank = None
+        ranks_in_group = None
+        process_group = None
+        cpu_group = None
+        group_world_size = None
+        mode = ParallelMode.NETTEST
+
+        for i in range(self.num_nettest_group):
+            ranks = []
+            for j in range(self.nettest_parallel_size):
+                rank = i * self.nettest_parallel_size + j
+                # the last group may be partial: skip ranks beyond the world size
+                if rank < self.world_size:
+                    ranks.append(rank)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
+            if use_cpu:
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
+            else:
+                group_cpu = None
+
+            # every rank creates every group, but only records the one it belongs to
+            if self.rank in ranks:
+                local_rank = ranks.index(self.rank)
+                group_world_size = len(ranks)
+                process_group = group
+                cpu_group = group_cpu
+                ranks_in_group = ranks
+
+        return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
+
+
 class Initializer_Expert(ProcessGroupInitializer):
-    """A ProcessGroupInitializer for zero-1 parallelism.
+    """A ProcessGroupInitializer for expert parallelism.

    Args:
        rank (int): The rank of current process.
@ -390,9 +474,13 @@ class Initializer_Expert(ProcessGroupInitializer):

        for i in range(self.num_expert_parallel_group):
            ranks = list(range(i, self.world_size, self.num_expert_parallel_group))
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None
            if self.rank in ranks:
@ -406,7 +494,7 @@ class Initializer_Expert(ProcessGroupInitializer):


 class Initializer_Expert_Data(ProcessGroupInitializer):
-    """A ProcessGroupInitializer for zero-1 parallelism.
+    """A ProcessGroupInitializer for expert data parallelism.

    Args:
        rank (int): The rank of current process.
@ -466,9 +554,13 @@ class Initializer_Expert_Data(ProcessGroupInitializer):

        groups = []
        for ranks in expert_parallel_groups:
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None
            if self.rank in ranks:
@ -477,12 +569,18 @@ class Initializer_Expert_Data(ProcessGroupInitializer):
                process_group = group
                cpu_group = group_cpu
                ranks_in_group = ranks
-        groups.append((local_rank, group_world_size, process_group, cpu_group, ranks_in_group, ParallelMode.EXPERT))
+                groups.append(
+                    (local_rank, group_world_size, process_group, cpu_group, ranks_in_group, ParallelMode.EXPERT)
+                )

        for ranks in expert_data_parallel_groups:
-            group = dist.new_group(ranks)
+            group = dist.new_group(ranks, timeout=LLM_NCCL_TIMEOUT)
            if use_cpu:
-                group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
+                group_cpu = (
+                    dist.new_group(ranks, backend="gloo", timeout=LLM_NCCL_TIMEOUT)
+                    if dist.get_backend() != "gloo"
+                    else group
+                )
            else:
                group_cpu = None
            if self.rank in ranks:
@ -491,8 +589,8 @@ class Initializer_Expert_Data(ProcessGroupInitializer):
                process_group = group
                cpu_group = group_cpu
                ranks_in_group = ranks
-        groups.append(
-            (local_rank, group_world_size, process_group, cpu_group, ranks_in_group, ParallelMode.EXPERT_DATA)
-        )
+                groups.append(
+                    (local_rank, group_world_size, process_group, cpu_group, ranks_in_group, ParallelMode.EXPERT_DATA)
+                )

        return groups
--- a/internlm/core/scheduler/no_pipeline_scheduler.py
+++ b/internlm/core/scheduler/no_pipeline_scheduler.py
@ -7,8 +7,10 @@ from typing import Any, Callable, Iterable, List, Optional

 import torch

+from internlm.core.context import global_context as gpc
 from internlm.core.engine import Engine
 from internlm.utils.common import conditional_context
+from internlm.utils.timeout import llm_timeout

 from .base_scheduler import BaseScheduler, SchedulerHook

@ -25,13 +27,13 @@ class NonPipelineScheduler(BaseScheduler):
        gradient_accumulation_steps(int, optional): the steps of gradient accumulation, 1 for disable
            gradient accumulation.

-    Example:
-        # this shows an example of customized data_process_func
-        def data_process_func(dataloader_output):
-            item1, item2, item3 = dataloader_output
-            data = (item1, item2)
-            label = item3
-            return data, label
+    Examples:
+        >>> # this shows an example of customized data_process_func
+        >>> def data_process_func(dataloader_output):
+        >>>     item1, item2, item3 = dataloader_output
+        >>>     data = (item1, item2)
+        >>>     label = item3
+        >>>     return data, label
    """

    def __init__(
@ -88,7 +90,6 @@ class NonPipelineScheduler(BaseScheduler):
        forward_only: bool = False,
        return_loss: bool = True,
        scale_loss: int = 1,
-        moe_loss_coeff: float = 0.01,
    ):
        """Trains one batch of data.

@ -105,6 +106,7 @@ class NonPipelineScheduler(BaseScheduler):
        # forward
        with conditional_context(torch.no_grad(), enable=forward_only):
            self._call_hooks("before_forward", data)
+            # moe_losses contains the loss of each layer
            output, moe_losses = self._call_engine(engine, data)
            self._call_hooks("after_forward", output)

@ -114,7 +116,7 @@ class NonPipelineScheduler(BaseScheduler):
                self._call_hooks("before_criterion", output, label)
                loss = self._call_engine_criterion(engine, output, label)
                self._call_hooks("after_criterion", loss)
-                moe_loss = sum(moe_losses) * moe_loss_coeff
+                moe_loss = sum(moe_losses) * gpc.config.loss.moe_loss_coeff
                moe_loss /= scale_loss
                loss /= scale_loss
                loss += moe_loss
@ -130,6 +132,7 @@ class NonPipelineScheduler(BaseScheduler):

        return output, loss, moe_loss

+    @llm_timeout(func_name="nopp_forward_backward_step")
    def forward_backward_step(
        self,
        engine: Engine,
@ -137,7 +140,6 @@ class NonPipelineScheduler(BaseScheduler):
        forward_only: bool = False,
        return_loss: bool = True,
        return_output_label: bool = True,
-        moe_loss_coeff: float = 0.01,
    ):
        """The process function that loads a batch of dataset and feeds it to the model.
        The returned labels and loss will None if :attr:`return_loss` is False.
@ -183,7 +185,7 @@ class NonPipelineScheduler(BaseScheduler):
            _data, _label = self._load_accum_batch(data, label)

            _output, _loss, _moe_loss = self._train_one_batch(
-                _data, _label, engine, forward_only, return_loss, self._grad_accum_size, moe_loss_coeff
+                _data, _label, engine, forward_only, return_loss, self._grad_accum_size
            )

            if return_loss:
--- a/internlm/core/scheduler/pipeline_scheduler.py
+++ b/internlm/core/scheduler/pipeline_scheduler.py
@ -15,12 +15,10 @@ from internlm.core.context import global_context as gpc
 from internlm.core.engine import Engine
 from internlm.core.naive_amp import NaiveAMPModel
 from internlm.utils.common import get_current_device, move_to_device
-from internlm.utils.logger import get_logger
+from internlm.utils.timeout import llm_timeout

 from .base_scheduler import BaseScheduler, SchedulerHook

-logger = get_logger(__file__)
-

 def get_tensor_shape():
    if hasattr(gpc.config, "TENSOR_SHAPE"):
@ -31,7 +29,7 @@ def get_tensor_shape():

    if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
        if gpc.config.model.use_flash_attn:
-            if gpc.config.model.sequence_parallel:
+            if gpc.config.parallel.sequence_parallel:
                sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
                tensor_shape = (
                    gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
@ -141,7 +139,7 @@ class PipelineScheduler(BaseScheduler):
            and gpc.get_world_size(ParallelMode.TENSOR) > 1
        )

-        if gpc.config.model.sequence_parallel:
+        if gpc.config.parallel.sequence_parallel:
            self.scatter_gather_tensors = False

        # cache for the batch data
@ -256,7 +254,6 @@ class PipelineScheduler(BaseScheduler):
        return_output_label=True,
        accum_loss=None,
        accum_moe_loss=None,
-        moe_loss_coeff=1.0,
    ):
        """
        Forward step for passed-in model. If it is the first stage, the input tensor
@ -278,6 +275,7 @@ class PipelineScheduler(BaseScheduler):
        data, label = self._get_data_label_for_current_step(input_obj, micro_batch_data)

        self._call_hooks("before_forward", data)
+        # moe_losses contains the loss of each layer in current stage
        output_obj, moe_losses = self._call_engine(engine.model, data)
        self._call_hooks("after_forward", output_obj)

@ -294,7 +292,7 @@ class PipelineScheduler(BaseScheduler):
                accum_loss.add_(loss_reduced.detach())
                output_obj = loss_reduced

-        moe_loss = sum(moe_losses) * moe_loss_coeff
+        moe_loss = sum(moe_losses) * gpc.config.loss.moe_loss_coeff
        moe_loss /= self.num_microbatches
        accum_moe_loss.add_(moe_loss.detach())

@ -334,7 +332,7 @@ class PipelineScheduler(BaseScheduler):

        self._call_hooks("before_backward", output_obj, output_obj_grad)
        with switch_optimizer_grad_sync_skip_mode(engine.optimizer, skip_grad_sync):
-            if moe_loss is None:
+            if moe_loss is None or moe_loss.item() == 0.0:
                if output_obj_grad is None:
                    engine.backward(output_obj)
                else:
@ -345,6 +343,9 @@ class PipelineScheduler(BaseScheduler):
                else:
                    # scale the latent loss
                    moe_loss = moe_loss * engine.optimizer.loss_scale
+                    # We apply the chain rule here by projecting the grad onto the direction of
+                    # [output_obj_grad, 1]. Because moe_loss has no relation to subsequent
+                    # layers, we set its grad to None (which will be regarded as 1).
                    engine.backward_by_grad([output_obj, moe_loss], [output_obj_grad, None])

        # Collect the grad of the input_obj.
@ -360,7 +361,7 @@ class PipelineScheduler(BaseScheduler):

        return input_obj_grad

-    def _forward_only_step(self, engine, return_loss=True, return_output_label=True, moe_loss_coeff=1.0):
+    def _forward_only_step(self, engine, return_loss=True, return_output_label=True):
        """
        This function performs forward only computation process. The scheduling of microbatches is similar to the
        warmup phase, where each microbatch first receives the forward input from the previous stage, then performs
@ -415,7 +416,6 @@ class PipelineScheduler(BaseScheduler):
                return_output_label=return_output_label,
                accum_loss=accum_loss,
                accum_moe_loss=accum_moe_loss,
-                moe_loss_coeff=moe_loss_coeff,
            )

            if not gpc.is_last_rank(ParallelMode.PIPELINE):
@ -433,7 +433,7 @@ class PipelineScheduler(BaseScheduler):

        return output, label, accum_loss, accum_moe_loss

-    def _forward_backward_step(self, engine, return_loss=True, return_output_label=True, moe_loss_coeff=1.0):
+    def _forward_backward_step(self, engine, return_loss=True, return_output_label=True):
        """
        This function schedules the forward and backward computation of microbatches in the pipeline in a 1F1B manner.
        It consists of three stages: warmup, 1F1B, and cooldown.
@ -515,7 +515,6 @@ class PipelineScheduler(BaseScheduler):
                return_output_label=return_output_label,
                accum_loss=accum_loss,
                accum_moe_loss=accum_moe_loss,
-                moe_loss_coeff=moe_loss_coeff,
            )

            if not gpc.is_last_rank(ParallelMode.PIPELINE):
@ -536,7 +535,6 @@ class PipelineScheduler(BaseScheduler):
            input_objs.append(input_obj)
            output_objs.append(output_obj)
            moe_losses.append(moe_loss)
-
        # Before running 1F1B, need to receive first forward tensor.
        # If all microbatches are run in warmup / cooldown phase, then no need to
        # receive this tensor here.
@ -562,7 +560,6 @@ class PipelineScheduler(BaseScheduler):
                return_output_label=return_output_label,
                accum_loss=accum_loss,
                accum_moe_loss=accum_moe_loss,
-                moe_loss_coeff=moe_loss_coeff,
            )

            if gpc.is_last_rank(ParallelMode.PIPELINE):
@ -628,8 +625,6 @@ class PipelineScheduler(BaseScheduler):
            if not gpc.is_first_rank(ParallelMode.PIPELINE):
                comm.send_backward(input_obj_grad, scatter_gather_tensors=self.scatter_gather_tensors)

-        logger.info(f"{gpc.get_local_rank(ParallelMode.PIPELINE)}, moe_loss: {accum_moe_loss.item()}")
-
        output, label = pack_return_tensors(return_tensors) if len(return_tensors) > 0 else (None, None)
        dist.all_reduce(accum_moe_loss, group=gpc.get_group(ParallelMode.PIPELINE))

@ -638,9 +633,8 @@ class PipelineScheduler(BaseScheduler):

        return output, label, accum_loss, accum_moe_loss

-    def forward_backward_step(
-        self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True, moe_loss_coeff=1.0
-    ):
+    @llm_timeout(func_name="nointerleaved_forward_backward_step")
+    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
        """Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
        Returns a tuple with losses if the last stage, an empty tuple otherwise.

@ -663,9 +657,9 @@ class PipelineScheduler(BaseScheduler):
        self.load_batch(engine, data_iter)

        if forward_only:
-            return self._forward_only_step(engine, return_loss, return_output_label, moe_loss_coeff)
+            return self._forward_only_step(engine, return_loss, return_output_label)
        else:
-            return self._forward_backward_step(engine, return_loss, return_output_label, moe_loss_coeff)
+            return self._forward_backward_step(engine, return_loss, return_output_label)


 class InterleavedPipelineScheduler(PipelineScheduler):
@ -782,7 +776,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        self.microbatch_offset[model_chunk_id] += self.microbatch_size
        return move_to_device(micro_batch_data)

-    def _forward_step(self, engine, chunk_id, moe_loss_coeff=1.0):
+    def _forward_step(self, engine, chunk_id):
        """Forward step for passed-in model. If it is the first stage, the input tensor
        is obtained from data_iterator, otherwise the passed-in input_obj is used.
        Returns output tensor. This is a helper function and can be ignored by users.
@ -824,7 +818,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
                self._accum_loss.add_(loss_reduced.detach())
                output_obj = loss_reduced

-        moe_loss = sum(moe_losses) * moe_loss_coeff
+        moe_loss = sum(moe_losses) * gpc.config.loss.moe_loss_coeff
        moe_loss /= self.num_microbatches

        if self._accum_moe_loss is not None:
@ -891,7 +885,6 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        num_warmup_microsteps: int,
        receive_extra_backward: bool = False,
        forward_only: bool = False,
-        moe_loss_coeff: float = 1.0,
    ) -> None:
        """
        Run the warm-up loop and prepare data for the 1F1B stage.
@ -929,7 +922,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        for k in range(num_warmup_microsteps):
            chunk_id = self._get_chunk_by_microbatch(k)

-            output_obj = self._forward_step(engine, chunk_id, moe_loss_coeff)
+            output_obj = self._forward_step(engine, chunk_id)

            if forward_only:
                # when forward-only, no need to save tensors for a backward pass
@ -1011,7 +1004,6 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        num_warmup_microsteps: int,
        num_1f1b_micropairs: int,
        all_warmup_microsteps: bool = False,
-        moe_loss_coeff: float = 1.0,
    ) -> None:
        """
        Run the 1F1B loop with overlap.
@ -1041,7 +1033,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
            backward_chunk_id = self._get_chunk_by_microbatch(backward_microstep_id, backward=True)

            # 1. Forward pass.
-            output_obj = self._forward_step(engine, forward_chunk_id, moe_loss_coeff)
+            output_obj = self._forward_step(engine, forward_chunk_id)

            # 2. Check if the backward input is ready.
            if backward_async_communicator is not None:
@ -1126,7 +1118,6 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        num_warmup_microsteps: int,
        num_1f1b_micropairs: int,
        all_warmup_microsteps: bool = False,
-        moe_loss_coeff: float = 1.0,
    ) -> None:
        """
        Run the 1F1B loop without overlap.
@ -1135,8 +1126,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
        1. Perform the forward pass.
        2. Perform the backward pass.
        3. Send the forward output of this iteration to the next stage, and send the backward output of this iteration
-           to the previous stage,
-        and receive the forward and backward inputs for the next iteration.
+           to the previous stage, and receive the forward and backward inputs for the next iteration.

        Args:
            engine (Engine): The engine to use for computation.
@ -1148,7 +1138,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
            # Forward pass.
            forward_microstep_id = k + num_warmup_microsteps
            forward_chunk_id = self._get_chunk_by_microbatch(forward_microstep_id)
-            output_obj = self._forward_step(engine, forward_chunk_id, moe_loss_coeff)
+            output_obj = self._forward_step(engine, forward_chunk_id)

            # Backward pass.
            backward_microstep_id = k
@ -1253,7 +1243,7 @@ class InterleavedPipelineScheduler(PipelineScheduler):
                )
            )

-    def _forward_only_step(self, engine: Engine, moe_loss_coeff: float = 1.0):
+    def _forward_only_step(self, engine: Engine):
        num_microsteps = self.num_microbatches * self._num_chunks
        num_warmup_microsteps = num_microsteps

@ -1263,10 +1253,9 @@ class InterleavedPipelineScheduler(PipelineScheduler):
            num_warmup_microsteps,
            receive_extra_backward=False,
            forward_only=True,
-            moe_loss_coeff=moe_loss_coeff,
        )

-    def _forward_backward_step(self, engine: Engine, moe_loss_coeff: float = 1.0):
+    def _forward_backward_step(self, engine: Engine):
        # Compute number of warmup and remaining microbatches.
        all_warmup_microsteps = False
        num_microsteps = self.num_microbatches * self._num_chunks
@ -1300,7 +1289,6 @@ class InterleavedPipelineScheduler(PipelineScheduler):
            num_microsteps,
            num_warmup_steps,
            receive_extra_backward=receive_extra_backward,
-            moe_loss_coeff=moe_loss_coeff,
        )

        # 2. 1F1B
@ -1309,15 +1297,13 @@ class InterleavedPipelineScheduler(PipelineScheduler):
            num_warmup_steps,
            num_1f1b_micropairs=num_1f1b_micropairs,
            all_warmup_microsteps=all_warmup_microsteps,
-            moe_loss_coeff=moe_loss_coeff,
        )

        # 3. Cooldown
        self._run_cooldown_loop(engine, num_microsteps, num_1f1b_micropairs=num_1f1b_micropairs)

-    def forward_backward_step(
-        self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True, moe_loss_coeff=1.0
-    ):
+    @llm_timeout(func_name="interleaved_forward_backward_step")
+    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True, return_output_label=True):
        """Run interleaved 1F1B schedule (model split into model chunks), with
        communication between pipeline stages as needed.

@ -1349,17 +1335,15 @@ class InterleavedPipelineScheduler(PipelineScheduler):
            self._return_tensors = []

        if forward_only:
-            self._forward_only_step(engine, moe_loss_coeff)
+            self._forward_only_step(engine)
        else:
-            self._forward_backward_step(engine, moe_loss_coeff)
+            self._forward_backward_step(engine)

        if return_output_label and len(self._return_tensors) > 0:
            output, label = pack_return_tensors(self._return_tensors)
        else:
            output, label = (None, None)

-        logger.info(f"{gpc.get_local_rank(ParallelMode.PIPELINE)}, moe_loss: {self._accum_moe_loss.item()}")
-
        dist.all_reduce(self._accum_moe_loss, group=gpc.get_group(ParallelMode.PIPELINE))
        accum_moe_loss = self._accum_moe_loss

--- a/internlm/core/trainer.py
+++ b/internlm/core/trainer.py
@ -4,6 +4,7 @@
 # adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/engine

 import json
+from collections import deque
 from typing import Iterable, Optional

 from internlm.core.engine import Engine
@ -23,7 +24,15 @@ class TrainState:
        train_dl (DataLoader): The DataLoader object used for training.
    """

-    def __init__(self, config) -> None:
+    def __init__(self, config, batch_sampler) -> None:
+        """
+        Args:
+            config (Config): internlm config
+            batch_sampler (torch.utils.data.Sampler): Because the dataloader loading is
+            asynchronous and prefetched, the batch_sampler state maintained inside the
+            dataloader is ahead of the actual training progress, so we copy the
+            batch_sampler as the anchor point for ckpt reload.
+        """
        # The number of batches produced by the data iterator
        self.batch_count: int = 0
        # Used to store the number of samples consumed in the current epoch
@ -38,9 +47,43 @@ class TrainState:
        # Total step count
        self.total_steps: int = config.data.total_steps

-    def init_batch_sampler(self, train_dl):
-        # Copy of the batch sampler from the DataLoader
-        self.batch_sampler = train_dl.batch_sampler.copy()
+        # resume tensorboard folder, need load from checkpoint or set manually.
+        self.resume_tb_folder = config.resume_tb_folder
+
+        self.tensorboard_folder = config.tensorboard_folder
+
+        # learning rate
+        self.lr = config.adam.lr
+
+        # sampler state
+        if batch_sampler:
+            self.init_batch_sampler(batch_sampler)
+
+        # tgs statistic
+        self.tgs_statistic = {
+            "sum_step": 0,
+            "sum_tg": 0,
+            "sum_time": 0,
+            "sum_last_tg_10": 0,
+            "sum_last_time_10": 0,
+            "sum_last_tg_50": 0,
+            "sum_last_time_50": 0,
+            "SMA_tg_50": 0,
+            "SMA_time_50": 0,
+            "SMA_tg_50_list": deque(),
+            "SMA_time_50_list": deque(),
+            "sum_tgs": 0,
+            "last_tgs_10": 0,
+            "last_tgs_50": 0,
+        }
+
+    def init_batch_sampler(self, batch_sampler):
+        """
+        Args:
+            batch_sampler (torch.utils.data.Sampler): sampler.
+        """
+        # make a copy of batch_sampler.
+        self.batch_sampler = batch_sampler.copy()
        # Iterator for the batch sampler
        self.batch_sampler_iter = iter(self.batch_sampler)

@ -56,25 +99,25 @@ class TrainState:

        return json.dumps(info, indent=4, sort_keys=True)

-    def load_state_dict(self, other_stuffs, train_dl):
+    def load_state_dict(self, other_stuffs):
        """
        Resumes training from a checkpoint.

        Args:
            other_stuffs (dict): Other information needed to resume training.
-            train_dl (DataLoader): The DataLoader object used for training.
        """
-
-        self.batch_count = other_stuffs["batch_count"] + 1  # here you need to shift a batch backward
        self.num_consumed_samples_in_epoch = other_stuffs["num_consumed_samples_in_epoch"]
        self.num_consumed_tokens = other_stuffs["num_consumed_tokens"]
        self.inf_nan_skip_batches = other_stuffs["inf_nan_skip_batches"]
-        # compatible with previous checkpoints without this parameter
-        self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1

-        # track the actual updates of sampler when using weighted sampling
-        self.batch_sampler = train_dl.batch_sampler.copy()
-        self.batch_sampler_iter = iter(self.batch_sampler)
+        # Because the ckpt save occurs after updating 'step_count',
+        # there is no need to increment 'step_count' here (Does our step count start from 0?).
+        # However, 'batch_count' is updated before ckpt storage, so it needs to be incremented by 1 on resume.
+        self.batch_count = other_stuffs["batch_count"] + 1  # here you need to shift a batch backward
+        self.step_count = other_stuffs.get("step_count", self.batch_count)
+
+        # resume tensorboard from older tensorboard_folder
+        self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)

    def state_dict(self):
        return {
@ -83,6 +126,7 @@ class TrainState:
            "num_consumed_tokens": self.num_consumed_tokens,
            "inf_nan_skip_batches": self.inf_nan_skip_batches,
            "step_count": self.step_count,
+            "tensorboard_folder": self.tensorboard_folder,
        }


@ -121,10 +165,12 @@ class Trainer:

    @property
    def engine(self):
+        """Returns the engine that is responsible for managing the training and evaluation process."""
        return self._engine

    @property
    def schedule(self):
+        """Returns the runtime scheduler."""
        return self._schedule

    @property
@ -133,15 +179,19 @@ class Trainer:
        return isinstance(self._schedule, (PipelineScheduler, InterleavedPipelineScheduler))

    def train(self):
+        """Sets the model to training mode."""
        self._engine.train()

    def eval(self):
+        """Sets the model to evaluation mode."""
        self._engine.eval()

    def zero_grad(self):
+        """Sets the gradient of all parameters in the model to zero."""
        self._engine.zero_grad()

    def step(self):
+        """Executes the parameter update step."""
        return self._engine.step()

    def execute_schedule(self, data_iter: Iterable, **kwargs):
--- a/internlm/data/utils.py
+++ b/internlm/data/utils.py
@ -5,7 +5,7 @@ import torch

 from internlm.core.context import global_context as gpc

-DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}
+DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1, "code": 2}


 def get_dataset_type_id(path):
--- a/internlm/initialize/init.py
+++ b/internlm/initialize/init.py
@ -1,9 +1,15 @@
 from .initialize_trainer import initialize_trainer
-from .launch import get_default_parser, launch_from_slurm, launch_from_torch
+from .launch import (
+    get_default_parser,
+    initialize_distributed_env,
+    launch_from_slurm,
+    launch_from_torch,
+)

 __all__ = [
    "get_default_parser",
    "initialize_trainer",
    "launch_from_slurm",
    "launch_from_torch",
+    "initialize_distributed_env",
 ]
--- a/internlm/initialize/initialize_tensor.py
+++ b/internlm/initialize/initialize_tensor.py
@ -3,16 +3,15 @@

 import math

-import torch
 from torch import Tensor, nn


-def scaled_init_method_normal(sigma, num_layers):
+def scaled_init_method_normal(sigma: float = 1.0, num_layers: int = 1):
     """Init method based on N(0, sigma/sqrt(2*num_layers))."""
    std = sigma / math.sqrt(2.0 * num_layers)

    def init_(tensor):
-        return torch.nn.init.normal_(tensor, mean=0.0, std=std)
+        return nn.init.normal_(tensor, mean=0.0, std=std)

    return init_

@ -32,3 +31,33 @@ def normal_(mean: float = 0.0, std: float = 1.0):
        return nn.init.normal_(tensor, mean, std)

    return initializer
+
+
+def scaled_init_method_uniform(sigma: float = 1.0, num_layers: int = 1):
+    """Init method based on p(x)=Uniform(-a, a) where std(x)=sigma/sqrt(2*num_layers).
+
+    Uniform(-a, a) has variance a^2 / 3, so the bound is a = sqrt(3) * std.
+
+    Args:
+        sigma (float): base standard deviation before depth scaling. Defaults to 1.0.
+        num_layers (int): number of layers used in the 1/sqrt(2*num_layers) scaling. Defaults to 1.
+
+    Returns:
+        Callable: a function that fills the given tensor in place with ``nn.init.uniform_``
+        and returns it.
+
+    Raises:
+        ValueError: if the resulting standard deviation is negative.
+    """
+    std = sigma / math.sqrt(2.0 * num_layers)
+    if std < 0:
+        raise ValueError(f"standard deviation must be non-negative, got {std}")
+    # std of Uniform(-a, a) is a / sqrt(3). Note sqrt(3.0 * std) is NOT equivalent:
+    # it agrees with sqrt(3) * std only when std is 0 or 1.
+    a = math.sqrt(3.0) * std
+
+    def init_(tensor):
+        return nn.init.uniform_(tensor, -a, a)
+
+    return init_
+
+
+def uniform_(mean: float = 0.0, std: float = 1.0):
+    r"""Return the initializer filling the input Tensor with values drawn from the uniform distribution
+
+     .. math::
+        \mathcal{U}(mean-a, mean+a), \quad a = \sqrt{3} \cdot std,
+
+    so that the distribution has the requested mean and standard deviation
+    (the standard deviation of :math:`\mathcal{U}(-a, a)` is :math:`a / \sqrt{3}`).
+
+    Args:
+        mean (float): the mean of the uniform distribution. Defaults 0.0.
+        std (float): the standard deviation of the uniform distribution. Defaults 1.0.
+
+    Returns:
+        Callable: a function that fills the given tensor in place with ``nn.init.uniform_``
+        and returns it.
+
+    Raises:
+        ValueError: if ``std`` is negative.
+    """
+
+    if std < 0:
+        raise ValueError(f"std must be non-negative, got {std}")
+    # Half-width of the interval. sqrt(3.0 * std) would give the wrong spread
+    # for every std other than 0 or 1.
+    a = math.sqrt(3.0) * std
+
+    def initializer(tensor: Tensor):
+        return nn.init.uniform_(tensor, mean - a, mean + a)
+
+    return initializer
--- a/internlm/initialize/initialize_trainer.py
+++ b/internlm/initialize/initialize_trainer.py
@ -43,8 +43,8 @@ def initialize_trainer(
    loaded into gpc.config.

    Args:
-        model (:class:`torch.nn.Module` or Callbale): Your model instance or a function to build the model.
-        optimizer (:class:`BaseOptimizer`.
+        model (:class:`torch.nn.Module` or `Callable`): Your model instance or a function to build the model.
+        optimizer (:class:`BaseOptimizer`): Your optimizer for training.
        criterion (:class:`torch.nn.modules.loss._Loss`, optional): Your criterion instance.
        train_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for training.
        test_dataloader (:class:`torch.utils.data.DataLoader`, optional): Dataloader for testing.
--- a/internlm/initialize/launch.py
+++ b/internlm/initialize/launch.py
@ -10,8 +10,10 @@ import torch

 from internlm.core.context import Config
 from internlm.core.context import global_context as gpc
+from internlm.monitor import initialize_light_monitor
+from internlm.utils.common import get_master_node
 from internlm.utils.logger import get_logger
-from internlm.utils.storage_manager import init_storage_manager
+from internlm.utils.timeout import llm_timeout

 logger = get_logger(__file__)

@ -21,7 +23,7 @@ def get_default_parser():
    Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.

    Returns:
-       Namespace: Returns the parser with the default arguments, the user may add customized arguments into this parser.
+       Parser: Returns the parser with the default arguments, the user may add customized arguments into this parser.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", type=str, help="path to the config file")
@ -99,6 +101,13 @@ def args_sanity_check():
    if "valid_every" not in data:
        data._add_item("valid_every", 0)

+    if "empty_cache_and_diag_interval" not in data:
+        data._add_item("empty_cache_and_diag_interval", 50)
+
+    if "diag_outlier_ratio" not in data:
+        data._add_item("diag_outlier_ratio", 1.1)
+    data.diag_outlier_ratio = max(1, data.diag_outlier_ratio)
+
    if gpc.is_rank_for_log():
        logger.info("+" * 15 + " Data Info " + "+" * 15)  # pylint: disable=W1201
        logger.info(f"seq_len: {data.seq_len}")
@ -113,7 +122,7 @@ def args_sanity_check():
    # processing the checkpoint config
    ckpt = gpc.config.ckpt
    if "enable_save_ckpt" not in ckpt:
-        ckpt._add_item("enable_save_ckpt", False)
+        ckpt._add_item("enable_save_ckpt", True)

    # Saving checkpoint args.
    if ckpt.enable_save_ckpt:
@ -139,9 +148,6 @@ def args_sanity_check():
        if not ckpt.async_upload:
            ckpt._add_item("async_upload_tmp_folder", None)

-        if "snapshot_ckpt_folder" not in ckpt:
-            ckpt._add_item("snapshot_ckpt_folder", os.path.join(ckpt.save_ckpt_folder, "snapshot"))
-
        if "oss_snapshot_freq" not in ckpt:
            ckpt._add_item("oss_snapshot_freq", float("inf"))  # if oss_snapshot_freq not given, we disable.
    else:
@ -151,44 +157,23 @@ def args_sanity_check():
        ckpt._add_item("async_upload", False)
        ckpt._add_item("async_upload_tmp_folder", None)
        ckpt._add_item("snapshot_ckpt_folder", None)
-        ckpt._add_item("snapshot_ckpt_folder", None)
-
-    # Loading checkpoint args.
-    if "load_model_only_folder" not in ckpt:
-        ckpt._add_item("load_model_only_folder", None)

    if "load_ckpt_folder" not in ckpt:
        ckpt._add_item("load_ckpt_folder", None)

-    if "load_optimizer" not in ckpt:
-        ckpt._add_item("load_optimizer", True)
-
    if "stop_file_path" not in ckpt:
        ckpt._add_item("stop_file_path", None)

-    if "load_given_ckpt" not in ckpt:
-        # If 'load_given_ckpt' is not given, we set it to False, so internlm can have opportunity
+    if "auto_resume" not in ckpt:
+        # If 'auto_resume' is not given, we set it to True, so internlm can have opportunity
        # to auto-load latest checkpoint.
-        ckpt._add_item("load_given_ckpt", False)
-
-    if ckpt.load_given_ckpt:
-        # Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
-        if ckpt.load_ckpt_folder and ckpt.load_model_only_folder:
-            logger.warning(
-                "Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, \
-and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
-            )
-            ckpt.load_model_only_folder = None
+        ckpt._add_item("auto_resume", True)

    if gpc.is_rank_for_log():
        logger.info("+" * 15 + " Ckpt Info " + "+" * 15)  # pylint: disable=W1201
        logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
        logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
        logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
-        logger.info(f"load_given_ckpt: {ckpt.load_given_ckpt}")
-
-    # initialization storage manager
-    init_storage_manager(ckpt)

    # tensorboard writer config
    if "enable_tb" not in gpc.config:
@ -202,6 +187,10 @@ and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
            "resume_tb_folder", os.environ["resume_tb_folder"] if "resume_tb_folder" in os.environ else None
        )

+    if gpc.is_rank_for_log():
+        logger.info(f"tensorboard_folder: {gpc.config.tensorboard_folder}")
+        logger.info(f"resume_tb_folder: {gpc.config.resume_tb_folder}")
+
    # cudnn
    torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
    torch.backends.cudnn.deterministic = gpc.config.get("cudnn_deterministic", False)
@ -223,10 +212,8 @@ and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
        elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
            gpc.config.model.dtype = torch.float16
        elif gpc.config.model.dtype == "torch.float32":
-            assert gpc.config.model.use_flash_attn is False, "when using float32, the use_flash_attn must be False"
            gpc.config.model.dtype = torch.float32
        elif gpc.config.model.dtype == "torch.tf32":
-            assert gpc.config.model.use_flash_attn is False, "when using tf32, the use_flash_attn must be False"
            torch.backends.cudnn.allow_tf32 = True
            torch.backends.cuda.matmul.allow_tf32 = True
            gpc.config.model.dtype = torch.float32
@ -268,6 +255,8 @@ and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
    # process the model config
    if "use_flash_attn" not in gpc.config.model:
        gpc.config.model._add_item("use_flash_attn", True)
+    if "num_experts" not in model:
+        model._add_item("num_experts", 1)

    # process the parallel config
    if "sequence_parallel" not in gpc.config.parallel:
@ -277,9 +266,44 @@ and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
            gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
        ), "sequence parallel does not support use_flash_attn=False"

-    # feishu webhook address for alerting
-    if "alert_address" not in gpc.config:
-        gpc.config._add_item("alert_address", None)
+    # currently only interleaved pipeline scheduler with overlap can guarantee loss accuracy
+    if hasattr(gpc.config.model, "num_chunks") and gpc.config.model.num_chunks > 1:
+        assert (
+            gpc.config.parallel["pipeline"].get("interleaved_overlap", False) is True
+        ), "only support interleaved pipeline scheduler with overlap"
+
+    # monitoring default config
+    monitor_default_config = {
+        "alert_address": None,  # compatible with old alert config
+        "monitor": {  # new monitoring config
+            "alert": {"enable_feishu_alert": False, "feishu_alert_address": None, "light_monitor_address": None}
+        },
+    }
+
+    for key, value in monitor_default_config.items():
+        if key not in gpc.config:
+            gpc.config._add_item(key, value)
+
+    alert = gpc.config.monitor.alert
+
+    if alert.enable_feishu_alert and not alert.feishu_alert_address and gpc.is_rank_for_log():
+        logger.warning("alert is enable but alert_address is not set")
+
+    optim_ckpt = gpc.config.hybrid_zero_optimizer
+    if "zero_overlap_communication" in optim_ckpt:
+        # Compatible with the old interfaces.
+        optim_ckpt._add_item("overlap_sync_grad", optim_ckpt.zero_overlap_communication)
+    if "overlap_sync_grad" not in optim_ckpt:
+        optim_ckpt._add_item("overlap_sync_grad", False)
+    if "overlap_sync_param" not in optim_ckpt:
+        optim_ckpt._add_item("overlap_sync_param", False)
+    if gpc.is_rank_for_log():
+        logger.info(
+            f"overlap_sync_grad:{optim_ckpt.overlap_sync_grad}, overlap_sync_param:{optim_ckpt.overlap_sync_param}"
+        )
+
+    if "moe_loss_coeff" not in gpc.config.loss:
+        gpc.config.loss._add_item("moe_loss_coeff", 1.0)


 def launch(
@ -327,8 +351,6 @@ def launch(
    # init process groups for different parallel modes from config
    gpc.init_parallel_groups()

-    args_sanity_check()
-
    # set cuda device
    if torch.cuda.is_available():
        # if local rank is not given, calculate automatically
@ -381,7 +403,11 @@ def launch_from_slurm(
    )


-def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024):
+def launch_from_torch(
+    config: Union[str, Path, Config, Dict],
+    backend: str = "nccl",
+    seed: int = 1024,
+):
    """A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
    from the environment variables set by PyTorch

@ -409,3 +435,56 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nc
        backend=backend,
        seed=seed,
    )
+
+
+@llm_timeout(func_name="initialize_distributed_env")
+def initialize_distributed_env(
+    config: str,
+    launcher: str = "slurm",
+    master_port: int = 8888,
+    seed: int = 1024,
+    args_check=True,
+):
+    """
+    Initialize distributed environment for distributed training.
+
+    Args:
+        config (str): Config file path.
+        launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
+        master_port (int): The master port for distributed training. 8888 by default.
+        seed (int, optional): Specified random seed for every process. 1024 by default.
+        args_check (bool, optional): Whether to run args_sanity_check() after launching. True by default.
+
+    Raises:
+        AssertionError: If launcher is neither "slurm" nor "torch".
+    """
+    torch.cuda.empty_cache()
+
+    if launcher == "torch":
+        launch_from_torch(config=config, seed=seed)
+    elif launcher == "slurm":
+        launch_from_slurm(config=config, host=get_master_node(), port=master_port, seed=seed)
+    else:
+        # raise explicitly: a plain assert is stripped under `python -O` and would let the call fall through
+        raise AssertionError("launcher only support slurm or torch")
+
+    if args_check:
+        args_sanity_check()
+
+    # init light monitor client
+    # NOTE(review): monitor.alert defaults come from args_sanity_check(); confirm args_check=False callers set them.
+    alert_config = gpc.config.monitor.alert
+    if alert_config.enable_feishu_alert and gpc.is_rank_for_log():
+        light_monitor_address = alert_config.light_monitor_address
+        if light_monitor_address:
+            initialize_light_monitor(light_monitor_address)
+        else:
+            logger.warning("monitor address is none, monitor could not be used!")
+
+
+def get_config_value(config, key, defalut):
+    """Return config[key], falling back to defalut when the lookup raises KeyError."""
+    try:
+        return config[key]
+    except KeyError:
+        return defalut
--- a/internlm/initialize/legacy/init.py
+++ b/internlm/initialize/legacy/init.py
--- a/internlm/initialize/legacy/launch.py
+++ b/internlm/initialize/legacy/launch.py
@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from internlm.initialize.launch import get_config_value
+from internlm.utils.logger import get_logger
+
+logger = get_logger(__file__)
+
+
+def auto_resume_sanity_check(ckpt_config):
+    """Return the auto-resume flag implied by legacy 'load_given_ckpt': True when unset, else its negation."""
+    load_given_ckpt = get_config_value(ckpt_config, "load_given_ckpt", None)
+    if load_given_ckpt is None:
+        return True  # default value is True
+    return not load_given_ckpt
+
+
+def ckpt_info_sanity_check(ckpt_config):
+    """Convert the legacy ckpt load options into a load-info dict.
+
+    Returns dict(path, content, ckpt_type), or None when no load folder is configured. Raises
+    AssertionError if both folders are set or if 'load_ckpt_folder' is neither a str nor None.
+    """
+    load_ckpt_folder = get_config_value(ckpt_config, "load_ckpt_folder", None)
+    load_model_only_folder = get_config_value(ckpt_config, "load_model_only_folder", None)
+
+    if load_model_only_folder is not None:
+        assert (
+            load_ckpt_folder is None
+        ), "'load_ckpt_folder' and 'load_model_only_folder' can not be set at the same time"
+        return dict(path=load_model_only_folder, content=("model",), ckpt_type="internlm")
+
+    if load_ckpt_folder is None:
+        return None
+    if not isinstance(load_ckpt_folder, str):
+        # `assert "<non-empty string>"` never fails, so the error must be raised explicitly.
+        raise AssertionError(f"Unsupport data type:'{type(load_ckpt_folder)}' for config.ckpt arg: 'load_ckpt_folder'")
+    load_optimizer = get_config_value(ckpt_config, "load_optimizer", True)
+    content = ("model", "sampler", "optimizer") if load_optimizer else ("model", "sampler")
+    return dict(path=load_ckpt_folder, content=content, ckpt_type="internlm")
--- a/internlm/model/embedding.py
+++ b/internlm/model/embedding.py
@ -137,15 +137,13 @@ class RotaryEmbedding(torch.nn.Module):
        """ """
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
-        self.register_buffer("inv_freq", inv_freq)
+        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
        self.scale_base = scale_base
-        scale = (
+        self.scale = (
            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
            if scale_base > 0
            else None
        )
-        self.register_buffer("scale", scale)

        self._seq_len_cached = 0
        self._cos_cached = None
@ -220,3 +218,15 @@ class RotaryEmbedding(torch.nn.Module):
                self._cos_k_cached[seqlen_offset:],
                self._sin_k_cached[seqlen_offset:],
            )
+
+    def _single_forward(self, x, indexes=0):
+        """Apply the cached cos/sin selected by indexes to x; only valid while self.scale is None."""
+        assert self.scale is None
+        self._update_cos_sin_cache(x, indexes)
+        cos, sin = self._cos_cached[indexes], self._sin_cached[indexes]
+        return legacy_apply_rotary_embed(x[None, ...], cos, sin).squeeze(0)
+
+    def _single_eval_forward(self, x, seqlen_offset=0):  # applies cached cos/sin sliced from seqlen_offset
+        assert self.scale is None  # self.scale is only None when scale_base <= 0 (see __init__)
+        self._update_cos_sin_cache(x, seqlen_offset + x.shape[1])  # TODO confirm: refreshes cos/sin cache
+        return legacy_apply_rotary_embed(x, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:])
--- a/internlm/model/linear.py
+++ b/internlm/model/linear.py
@ -4,14 +4,13 @@
 from typing import Optional

 import torch
-import torch.nn.functional as F
 from flash_attn.ops.fused_dense import ColumnParallelLinear, RowParallelLinear
 from flash_attn.utils.distributed import all_reduce, reduce_scatter
 from torch import nn

-from internlm.core.context import IS_TENSOR_PARALLEL, ParallelMode
+from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.utils import fused_dense_func_torch
+from internlm.model.utils import Silu, fused_dense_func_torch


 class ScaleColumnParallelLinear(nn.Linear):
@ -195,13 +194,9 @@ class FeedForward(nn.Module):
            device=device,
            dtype=dtype,
        )
-        # need to assign tp attribute so that colossalai know it is tensor parallel module
-
-        if gpc.get_world_size(ParallelMode.TENSOR) > 1:
-            for name in ["w1", "w2", "w3"]:
-                for param in getattr(self, name).parameters():
-                    setattr(param, IS_TENSOR_PARALLEL, True)

    def forward(self, x):
-        out = self.w3(F.silu(self.w1(x)) * self.w2(x))
+        w1_o = self.w1(x)
+        w2_o = self.w2(x)
+        out = self.w3(Silu(w1_o, w2_o))
        return out
--- a/internlm/model/metrics.py
+++ b/internlm/model/metrics.py
@ -176,7 +176,7 @@ class AccPerplex:
            res.update(ds_acc)
            res.update(ds_tokens)

-        loss_res = self.loss_with_type_id.get_metric()
+        loss_res = self.loss_with_type_id.get_metric(reset)
        res.update(loss_res)

        return res
--- a/internlm/model/modeling_internlm.py
+++ b/internlm/model/modeling_internlm.py
@ -123,6 +123,11 @@ class PackedFlashBaseLayer1D(nn.Module):
            self.norm1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
            self.norm2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)

+        for param in self.norm1.parameters():
+            param.is_norm = True
+        for param in self.norm2.parameters():
+            param.is_norm = True
+
        self.num_experts = num_experts
        self.moe_gate_k = moe_gate_k
        self.moe_capacity_factor = moe_capacity_factor
@ -159,36 +164,13 @@ class PackedFlashBaseLayer1D(nn.Module):
                    device=device,
                    dtype=dtype,
                )
+            for _, param in self.mlp.named_parameters():
+                if gpc.get_world_size(ParallelMode.TENSOR) > 1:
+                    setattr(param, IS_TENSOR_PARALLEL, True)
        else:
-            experts = torch.nn.ModuleList(
-                [
-                    FeedForward(
-                        hidden_size,
-                        int(hidden_size * gpc.config.model.mlp_ratio),
-                        out_features=hidden_size,
-                        process_group=gpc.get_group(ParallelMode.TENSOR),
-                        bias=False,
-                        device=torch.device("cuda"),
-                        dtype=torch.float,
-                    )
-                    for i in range(num_experts // ep_size)
-                ]
-            )
-
-            if moe_use_residual:
-                residual_mlp = FeedForward(
-                    hidden_size,
-                    int(hidden_size * gpc.config.model.mlp_ratio),
-                    out_features=hidden_size,
-                    process_group=gpc.get_group(ParallelMode.TENSOR),
-                    bias=False,
-                    device=torch.device("cuda"),
-                    dtype=torch.float,
-                )
-
+            # replace mlp by MoE module. The expert in MoE is a FeedForward module.
            self.mlp = MoE(
                hidden_size=hidden_size,
-                experts=experts,
                num_experts=num_experts,
                ep_size=ep_size,
                k=moe_gate_k,
@ -199,8 +181,12 @@ class PackedFlashBaseLayer1D(nn.Module):
                drop_tokens=moe_drop_tokens,
                use_rts=moe_use_rts,
                use_residual=moe_use_residual,
-                residual_mlp=residual_mlp if moe_use_residual else None,
+                device=device,
+                dtype=dtype,
            )
+            for _, param in self.mlp.moe_layer.experts.named_parameters():
+                if gpc.get_world_size(ParallelMode.TENSOR) > 1:
+                    setattr(param, IS_TENSOR_PARALLEL, True)

        self.dropout2 = nn.Dropout(drop_rate)
        self.use_swiglu = use_swiglu
@ -291,9 +277,9 @@ class PackedFlashBaseLayer1D(nn.Module):

        # MLP.
        moe_loss = torch.tensor(0.0, device=hidden_states.device, dtype=hidden_states.dtype)
-        if self.num_experts <= 1:
+        if self.num_experts <= 1:  # dense mlp output
            hidden_states = self.mlp(hidden_states)
-        else:
+        else:  # MoE output
            hidden_states, moe_loss, _ = self.mlp(hidden_states)

        return hidden_states + residual, moe_loss
@ -456,7 +442,8 @@ class PackedFlashInternLm1D(nn.Module):

    def forward(self, hidden_states=None, cu_seqlens=None, input_ids=None, indexes=None, inference_params=None):
        # attention_mask: compute attention on the places where the value is 1
-        if hasattr(self, "embedding"):
+        # old condition may fail when use shared embedding
+        if gpc.is_pipeline_first_stage():
            hidden_states = self.embedding(input_ids)
            if self.embed_grad_scale != 1:
                hidden_states = (
@ -514,7 +501,7 @@ def _build_generic_model_1d(num_layers, num_chunks, device=torch.device("cuda"),
    all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
    parts = all_parts[pipeline_rank]
    if gpc.is_rank_for_log():
-        logger.info(f"The layer sharding is {all_parts}.")  # pylint: disable=W1203
+        logger.info(f"The layer sharding is {all_parts}.")

    models = []

@ -561,7 +548,6 @@ def build_model_with_cfg(
    use_scaled_init: bool = True,
    use_swiglu: bool = True,
    use_flash_attn: bool = True,
-    sequence_parallel: bool = False,  # pylint: disable=W0613
    num_experts: int = 1,
    moe_gate_k: int = 1,
    moe_capacity_factor: float = 1.0,
@ -573,7 +559,7 @@ def build_model_with_cfg(
    moe_use_residual: bool = False,
 ):
    """
-    Builde model with config
+    Build model with config.

    Args:
        num_chunks (int): The number of partitions in pipeline parallel. 1 by default.
--- a/internlm/model/moe.py
+++ b/internlm/model/moe.py
@ -1,10 +1,10 @@
 import typing
-from typing import Dict, Tuple

 import torch

 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.model.linear import FeedForward
 from internlm.moe.experts import Experts
 from internlm.moe.sharded_moe import MOELayer, TopKGate
 from internlm.utils.logger import get_logger
@ -19,24 +19,6 @@ from internlm.utils.logger import get_logger
 logger = get_logger(__file__)


-def has_moe_layers(m):
-    has_moe = False
-    num_experts = 0
-
-    for _, module in m.named_modules():
-        if isinstance(module, MoE):
-            has_moe = True
-            num_experts = module.num_experts
-            break
-    return has_moe, num_experts
-
-
-def is_moe_param(param: torch.Tensor) -> bool:
-    if hasattr(param, "all_reduce") and not param.all_reduce:
-        return True
-    return False
-
-
 class MoE(torch.nn.Module):
    """Initialize an MoE layer.

@ -63,7 +45,6 @@ class MoE(torch.nn.Module):
    def __init__(
        self,
        hidden_size,
-        experts,
        num_experts=1,
        ep_size=1,
        k=1,
@ -75,7 +56,8 @@ class MoE(torch.nn.Module):
        use_rts: bool = True,
        using_default_moe: bool = True,
        use_residual=False,
-        residual_mlp=None,
+        device=None,
+        dtype=None,
    ):

        super().__init__()
@ -87,16 +69,34 @@ class MoE(torch.nn.Module):
        self.num_experts = num_experts
        self.num_local_experts = num_experts // self.ep_size

-        logger.info(  # pylint: disable=W1203
-            f"Creating MoE layer with num_experts: {num_experts} | num_local_experts:"
-            f"{self.num_local_experts} | expert_parallel_size: {self.ep_size}"
-        )
-
+        if gpc.is_rank_for_log():
+            logger.info(  # pylint: disable=W1203
+                f"Creating MoE layer with num_experts: {num_experts} | num_local_experts:"
+                f"{self.num_local_experts} | expert_parallel_size: {self.ep_size}"
+            )
        assert noisy_gate_policy is None or noisy_gate_policy in ["None", "Jitter", "RSample"], (
            "Unsupported noisy_gate_policy: " + noisy_gate_policy
        )

-        expert_group_name = f"ep_size_{self.ep_size}"
+        # for elastic expert parallel, experts may have multiple groups
+        expert_group_name = f"moe_ep_size_{self.ep_size}"
+        if expert_group_name not in gpc.expert_parallel_group_names:
+            gpc.expert_parallel_group_names.append(expert_group_name)
+        experts = torch.nn.ModuleList(
+            [
+                # TODO have trouble when use internlm.model.linear.FeedForward
+                FeedForward(
+                    hidden_size,
+                    int(hidden_size * gpc.config.model.mlp_ratio),
+                    out_features=hidden_size,
+                    process_group=gpc.get_group(ParallelMode.TENSOR),
+                    bias=False,
+                    device=device,
+                    dtype=dtype,
+                )
+                for _ in range(self.num_local_experts)
+            ]
+        )
        experts = Experts(experts, self.num_local_experts, expert_group_name)

        if using_default_moe:
@ -118,10 +118,19 @@ class MoE(torch.nn.Module):
                self.num_local_experts,
            )

+        # residual network, see https://arxiv.org/pdf/2201.05596.pdf, seems useful for convergence
        self.use_residual = use_residual
        if use_residual:
-            self.residual_mlp = residual_mlp
-            # coefficient is used for weighted sum of the output of expert and mlp
+            self.residual_mlp = FeedForward(
+                hidden_size,
+                int(hidden_size * gpc.config.model.mlp_ratio),
+                out_features=hidden_size,
+                process_group=gpc.get_group(ParallelMode.TENSOR),
+                bias=False,
+                device=device,
+                dtype=dtype,
+            )
+            # coefficient is used for weighted sum of the output of expert and residual mlp
            self.coefficient = torch.nn.Linear(hidden_size, 2)

    def forward(self, hidden_states, used_token=None):
@ -150,94 +159,3 @@ class MoE(torch.nn.Module):
            coef = torch.nn.functional.softmax(coef, dim=-1)
            output = output * coef[..., 0:1] + output_mlp * coef[..., 1:]
        return output, self.moe_layer.l_aux, self.moe_layer.exp_counts
-
-
-def split_params_into_different_moe_groups_for_optimizer(
-    param_groups: Tuple[Dict], max_group_size=178956971
-) -> Tuple[Dict]:
-    """Split parameters into different MoE groups for optimizer
-    Compatiable with muiltiple param groups, each should have a name
-
-    Args:
-        param_groups (Tuple[Dict]):
-            The list of parameter groups to split
-
-    Returns:
-        Tuple[Dict]:
-        list of MoE/non-MoE groups for optimizer
-    """
-    if isinstance(param_groups, tuple):
-        param_groups = list(param_groups)  # Tuple cannot be modified
-    elif isinstance(param_groups, dict):
-        param_groups = [param_groups]
-    elif not isinstance(param_groups, list):
-        raise ValueError(f"Unknown param group type of {type(param_groups)}")
-
-    # gather all data parallel group names
-    data_parallel_group_names = set()
-    for param_group in param_groups:
-        for param in param_group["params"]:
-            if is_moe_param(param):
-                data_parallel_group_names.add(param.group_name)
-    data_parallel_group_names = list(data_parallel_group_names)
-    group_moe = {}
-    # Create the param MoE groups, leave param assign to next step
-    for param_group in param_groups:
-        group_moe[param_group["name"]] = {}
-        for key in data_parallel_group_names:
-            group_moe[param_group["name"]][key] = {}
-            group_moe[param_group["name"]][key]["name"] = key
-            group_moe[param_group["name"]][key]["moe"] = True
-            for ori_key in param_group.keys():
-                if ori_key != "name":
-                    if ori_key == "params":
-                        group_moe[param_group["name"]][key][ori_key] = []
-                    else:
-                        group_moe[param_group["name"]][key][ori_key] = param_group[ori_key]
-    # Assign param
-    for param_group in param_groups:
-        new_params = []
-        for param in param_group["params"]:
-            if is_moe_param(param):
-                group_moe[param_group["name"]][param.group_name]["params"].append(param)
-                # param_group['params'].remove(param)
-            else:
-                new_params.append(param)
-        param_group["params"] = new_params
-
-    # Flatten the moe groups
-    if max_group_size is not None:
-        for _, v in group_moe.items():
-            for _, v1 in v.items():
-                cur_group = []
-                all_groups = []
-                size_of_cur_group = 0
-                for param in v1["params"]:
-                    if size_of_cur_group + param.numel() <= max_group_size:
-                        cur_group.append(param)
-                        size_of_cur_group += param.numel()
-                    else:
-                        all_groups.append(cur_group)
-                        cur_group = [param]
-                        size_of_cur_group = param.numel()
-                if cur_group:
-                    all_groups.append(cur_group)
-                for group in all_groups:
-                    new_dict = {}
-                    for key, val in v1.items():
-                        if key != "params":
-                            new_dict[key] = val
-                    new_dict["params"] = group
-                    param_groups.append(new_dict)
-    else:
-        for _, v in group_moe.items():
-            for _, v1 in v.items():
-                param_groups.append(v1)
-
-    return tuple(param_groups)
-
-
-def create_moe_param_groups(model, weight_decay):
-    parameters = {"params": list(model.parameters()), "name": "default", "weight_decay": weight_decay}
-
-    return split_params_into_different_moe_groups_for_optimizer(parameters)
--- a/internlm/model/multi_head_attention.py
+++ b/internlm/model/multi_head_attention.py
@ -132,7 +132,13 @@ class MHA(nn.Module):
            qkv = self.rotary_emb(qkv, **kwargs)

        if inference_params is None:
-            context = self.inner_attn(qkv)
+            if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
+                with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                    if qkv.dtype not in [torch.float16, torch.bfloat16]:
+                        qkv = qkv.to(torch.bfloat16)
+                    context = self.inner_attn(qkv).to(x.dtype)
+            else:
+                context = self.inner_attn(qkv)
        else:
            q = qkv[:, :, 0]
            assert self.layer_idx is not None, "Generation requires layer_idx in the constructor"
@ -164,7 +170,14 @@ class MHA(nn.Module):
        kwargs.pop("indexes")

        if inference_params is None:
-            context = self.inner_attn(qkv, **kwargs)
+            if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
+                with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                    if qkv.dtype not in [torch.float16, torch.bfloat16]:
+                        qkv = qkv.to(torch.bfloat16)
+                    context = self.inner_attn(qkv, **kwargs).to(x.dtype)
+            else:
+                context = self.inner_attn(qkv, **kwargs)
+
        else:
            raise RuntimeError("Not support this right now")

--- a/internlm/model/utils.py
+++ b/internlm/model/utils.py
@ -207,3 +207,28 @@ def try_import_RMSNorm():
        from internlm.model.norm import RMSNormTorch as RMSNorm

        return RMSNorm
+
+
+def is_moe_param(param: torch.Tensor) -> bool:
+    """Tell whether param carries a truthy ``is_expert`` attribute."""
+    # bool() keeps the result a strict True/False even if the tag is some other truthy object
+    return bool(getattr(param, "is_expert", False))
+
+
+def is_gate_param(param: torch.Tensor) -> bool:
+    """Tell whether param carries a truthy ``is_gate`` attribute."""
+    # bool() keeps the result a strict True/False even if the tag is some other truthy object
+    return bool(getattr(param, "is_gate", False))
+
+
+def is_norm_param(param: torch.Tensor) -> bool:
+    """Tell whether param carries a truthy ``is_norm`` attribute."""
+    # bool() keeps the result a strict True/False even if the tag is some other truthy object
+    return bool(getattr(param, "is_norm", False))
+
+
+# Gate used by FeedForward.forward; compiled once at import time via TorchScript.
+@torch.jit.script
+def Silu(w1_o, w2_o):
+    """Return F.silu(w1_o) multiplied by w2_o."""
+    return F.silu(w1_o) * w2_o
--- a/internlm/moe/experts.py
+++ b/internlm/moe/experts.py
@ -37,7 +37,7 @@ class Experts(torch.nn.Module):
        for expert in self.experts:
            # TODO: Create param groups to handle expert + data case (e.g. param.group = moe_group)
            for _, param in expert.named_parameters():
-                param.all_reduce = False
+                param.is_expert = True
                param.group_name = expert_group_name

    def forward(self, inputs):
--- a/internlm/moe/sharded_moe.py
+++ b/internlm/moe/sharded_moe.py
@ -69,10 +69,6 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor:
    return gumbel(shape)


-# einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity
-# See https://arxiv.org/pdf/2006.16668.pdf for details.
-
-
 # Based on https://github.com/pytorch/pytorch/pull/40762
 class _AllToAll(torch.autograd.Function):
    """
@ -167,11 +163,6 @@ def _top_idx(source, k):
    return torch.topk(source, k=k, dim=0)[1]


-@torch.jit.script
-def _one_hot_to_float(x, num_classes):
-    return F.one_hot(x, num_classes=num_classes).float()
-
-
 def top1gating(
    logits: Tensor,
    capacity_factor: float,
@ -210,7 +201,7 @@ def top1gating(

    # Compute l_aux
    me = torch.mean(gates, dim=0)
-    ce = torch.mean(mask1.float(), dim=0)
+    ce = torch.mean(mask1.type_as(logits), dim=0)
    l_aux = torch.sum(me * ce) * num_experts

    # Random Token Selection
@ -244,10 +235,10 @@ def top1gating(
    locations1_s = torch.sum(locations1 * mask1, dim=1)

    # Normalize gate probabilities
-    mask1_float = mask1.float()
+    mask1_float = mask1.type_as(logits)
    gates = gates * mask1_float

-    locations1_sc = _one_hot_to_float(locations1_s, capacity)
+    locations1_sc = F.one_hot(locations1_s, num_classes=capacity).type_as(logits)
    combine_weights = einsum("se,sc->sec", gates, locations1_sc)

    dispatch_mask = combine_weights.bool()
@ -271,7 +262,7 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup
    # https://timvieira.github.io/blog/post/2014/07/31/gumbel-max-trick/
    logits_w_noise = logits + gumbel_rsample(logits.shape, device=logits.device)
    # Replace top-expert with min value
-    logits_except1 = logits_w_noise.masked_fill(mask1.bool(), float("-inf"))
+    logits_except1 = logits_w_noise.masked_fill(mask1.bool(), torch.finfo(logits.dtype).min)
    indices2_s = torch.argmax(logits_except1, dim=1)
    mask2 = F.one_hot(indices2_s, num_classes=num_experts)

@ -286,7 +277,7 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup

    # Compute l_aux
    me = torch.mean(gates, dim=0)
-    ce = torch.mean(mask1.float(), dim=0)
+    ce = torch.mean(mask1.type_as(logits), dim=0)
    l_aux = torch.mean(me * ce) * num_experts * num_experts

    # Remove locations outside capacity from mask
@ -298,8 +289,8 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup
    locations2_s = torch.sum(locations2 * mask2, dim=1)

    # Normalize gate probabilities
-    mask1_float = mask1.float()
-    mask2_float = mask2.float()
+    mask1_float = mask1.type_as(logits)
+    mask2_float = mask2.type_as(logits)
    gates1_s = einsum("se,se->s", gates, mask1_float)
    gates2_s = einsum("se,se->s", gates, mask2_float)
    denom_s = gates1_s + gates2_s
@ -311,8 +302,8 @@ def top2gating(logits: Tensor, capacity_factor: float, min_capacity: int) -> Tup
    # Calculate combine_weights and dispatch_mask
    gates1 = einsum("s,se->se", gates1_s, mask1_float)
    gates2 = einsum("s,se->se", gates2_s, mask2_float)
-    locations1_sc = _one_hot_to_float(locations1_s, capacity)
-    locations2_sc = _one_hot_to_float(locations2_s, capacity)
+    locations1_sc = F.one_hot(locations1_s, num_classes=capacity).type_as(logits)
+    locations2_sc = F.one_hot(locations2_s, num_classes=capacity).type_as(logits)
    combine1_sec = einsum("se,sc->sec", gates1, locations1_sc)
    combine2_sec = einsum("se,sc->sec", gates2, locations2_sc)
    combine_weights = combine1_sec + combine2_sec
@ -356,9 +347,8 @@ class TopKGate(Module):
        # Only top-1 and top-2 are supported at the moment.
        if k not in (1, 2):
            raise ValueError("Only top-1 and top-2 gatings are supported.")
-        # TODO: can we use tensor parallel here?
        # Deepspeed's mechisms, alway use fp32
-        self.wg = torch.nn.Linear(model_dim, num_experts, bias=False).float()
+        self.wg = torch.nn.Linear(model_dim, num_experts, bias=False)
        self.k = k
        self.capacity_factor = capacity_factor
        self.eval_capacity_factor = eval_capacity_factor
@ -369,6 +359,9 @@ class TopKGate(Module):
        self.drop_tokens = drop_tokens
        self.use_rts = use_rts

+        for param in self.wg.parameters():
+            param.is_gate = True
+
    def forward(
        self, inputs: torch.Tensor, used_token: torch.Tensor = None
    ) -> Tuple[Tensor, Tensor, Tensor]:  # type: ignore
@ -376,13 +369,10 @@ class TopKGate(Module):
        if self.wall_clock_breakdown:
            timer("TopKGate").start()

-        if self.wg.weight.dtype != torch.float32:
-            self.wg = self.wg.float()
-        inputs_fp32 = inputs.float()
        # input jittering
        if self.noisy_gate_policy == "Jitter" and self.training:
-            inputs_fp32 = multiplicative_jitter(inputs_fp32, device=inputs.device)
-        logits = self.wg(inputs_fp32)
+            inputs = multiplicative_jitter(inputs, device=inputs.device)
+        logits = self.wg(inputs)

        if self.k == 1:
            gate_output = top1gating(
@ -437,9 +427,6 @@ class MOELayer(Base):
        self.time_moe = 0.0
        self.wall_clock_breakdown = False

-    def _set_ep_group(self, ep_group):
-        self.ep_group = ep_group
-
    def forward(self, *inputs: Tensor) -> Tensor:

        if self.wall_clock_breakdown:
@ -486,10 +473,10 @@ class MOELayer(Base):

        combined_output = einsum("sec,ecm->sm", combine_weights.type_as(inputs[0]), expert_output)

-        a = combined_output.reshape(inputs[0].shape)
+        out = combined_output.reshape(inputs[0].shape)

        if self.wall_clock_breakdown:
            timer("moe").stop()
            self.time_moe = timer("moe").elapsed(reset=False)

-        return a
+        return out
--- a/internlm/monitor/init.py
+++ b/internlm/monitor/init.py
@ -1,4 +1,11 @@
+from .alert import initialize_light_monitor, send_heartbeat
 from .monitor import initialize_monitor_manager, send_alert_message
 from .utils import set_env_var

-__all__ = ["send_alert_message", "initialize_monitor_manager", "set_env_var"]
+__all__ = [
+    "send_alert_message",
+    "initialize_monitor_manager",
+    "set_env_var",
+    "initialize_light_monitor",
+    "send_heartbeat",
+]
--- a/internlm/monitor/alert.py
+++ b/internlm/monitor/alert.py
@ -1,8 +1,59 @@
 import json
+import math
+import os
+import re
 import time
+from typing import Dict

 import requests

+from internlm.utils.logger import get_logger
+
+logger = get_logger(__file__)
+
+
+def initialize_light_monitor(monitor_address: str = None):
+    """
+    Start the light monitor client on a best-effort basis.
+
+    Any exception raised while importing or initializing ``uniscale_monitoring``
+    is logged as a warning and is not propagated to the caller.
+
+    Args:
+        monitor_address (str): The address passed to ``uniscale_monitoring.init_monitor``.
+    """
+    try:
+        # imported lazily so that a missing package only produces a warning
+        from uniscale_monitoring import init_monitor
+
+        init_monitor(monitor_address)
+    except Exception as err:
+        logger.warning(f"init monitor meet error: {err}")
+
+
+def send_heartbeat(msg_type: str, msg: Dict):
+    """
+    Flatten a message and send it through ``uniscale_monitoring.send_meta`` on a best-effort basis.
+
+    Values that are dicts are flattened one level, using ``"{key}_{sub_key}"`` as the new key.
+    Every key is cut at its first space and each character outside ``[a-zA-Z0-9_]`` is replaced
+    by ``"_"``. Float NaN values are replaced by ``None``. Any exception is logged as a warning
+    and is not propagated to the caller.
+
+    Args:
+        msg_type (str): The message type. It is recorded under the "msg_type" key only when it is
+            one of "train_metrics", "init_time" or "stage_time".
+        msg (Dict): The message to send.
+    """
+
+    def _clean_key(raw_key):
+        # keep the text before the first space, then mask every unsupported character
+        return re.sub(r"[^a-zA-Z0-9_]", "_", raw_key.split(" ")[0])
+
+    def _drop_nan(value):
+        return None if isinstance(value, float) and math.isnan(value) else value
+
+    try:
+        from uniscale_monitoring import send_meta
+
+        payload = {}
+        for key, value in msg.items():
+            if isinstance(value, dict):
+                for sub_key, sub_value in value.items():
+                    payload[_clean_key(f"{key}_{sub_key}")] = _drop_nan(sub_value)
+            else:
+                payload[_clean_key(key)] = _drop_nan(value)
+
+        # the cluster name is attached only when the environment variable is set and non-empty
+        cluster_name = os.getenv("CLUSTER_NAME")
+        if cluster_name:
+            payload["cluster"] = cluster_name
+        if msg_type in ("train_metrics", "init_time", "stage_time"):
+            payload["msg_type"] = msg_type
+        send_meta(payload, timeout=0.1)
+    except Exception as err:
+        logger.warning(f"send heartbeat meet error: {err}")
+

 def send_feishu_msg_with_webhook(webhook: str, title: str, message: str):
    """
--- a/internlm/monitor/monitor.py
+++ b/internlm/monitor/monitor.py
@ -211,6 +211,14 @@ monitor_manager = MonitorManager()

@contextmanager
 def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
+    """
+    Initialize monitor manager for monitoring training lifetime and alerting exception info to Feishu.
+
+    Args:
+        job_name (str): The training job name.
+        alert_address (str): The Feishu webhook address for sending alert messages.
+    """
+
    if alert_address is not None:
        try:
            monitor_manager.start_monitor(job_name=job_name, alert_address=alert_address)
@ -218,9 +226,7 @@ def initialize_monitor_manager(job_name: str = None, alert_address: str = None):
            send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} is starting.")
            yield
        finally:
-            send_alert_message(
-                address=gpc.config.alert_address, message=f"Training in {socket.gethostname()} completed."
-            )
+            send_alert_message(address=alert_address, message=f"Training in {socket.gethostname()} completed.")
            monitor_manager.stop_monitor()
    else:
        yield
--- a/internlm/solver/optimizer/init.py
+++ b/internlm/solver/optimizer/init.py
@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # -*- encoding: utf-8 -*-

-from .hybrid_zero_optim import HybridZeroOptimizer
+from .hybrid_zero_optim import HybridZeroOptimizer, reload_zero_fp32_buff

-__all__ = ["HybridZeroOptimizer"]
+__all__ = ["HybridZeroOptimizer", "reload_zero_fp32_buff"]
--- a/internlm/solver/optimizer/hybrid_zero_optim.py
+++ b/internlm/solver/optimizer/hybrid_zero_optim.py
@ -11,7 +11,7 @@ from torch.optim import Optimizer

 from internlm.core.context import Config, ParallelMode
 from internlm.core.context import global_context as gpc
-from internlm.model.moe import is_moe_param
+from internlm.model.utils import is_moe_param
 from internlm.monitor import send_alert_message
 from internlm.solver.optimizer.store import (
    BucketStore,
@ -33,6 +33,7 @@ from internlm.solver.optimizer.utils import (
 from internlm.utils.common import get_current_device
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
+from internlm.utils.timeout import llm_timeout

 from .utils import compute_norm

@ -92,7 +93,6 @@ class HybridZeroOptimizer(BaseOptimizer):
        cpu_offload=False,
        grad_scal_cfg: Config = None,
        zero_cfg: Config = None,
-        has_moe: bool = False,
        param_bcast_sync_handler: ParamBcastSyncHandler = None,
    ):
        # DynamicGradScaler related args
@ -108,14 +108,13 @@ class HybridZeroOptimizer(BaseOptimizer):
        max_scale = grad_scal_cfg.max_scale

        # Zero related args
-        overlap_communication = zero_cfg.zero_overlap_communication
        reduce_bucket_size = zero_cfg.reduce_bucket_size
        clip_grad_norm = zero_cfg.clip_grad_norm
+        self._overlap_sync_grad = zero_cfg.overlap_sync_grad
+        self._overlap_sync_param = zero_cfg.overlap_sync_param

        super().__init__(optim=optimizer)

-        self.has_moe = has_moe
-
        self._dtype = self.optim.param_groups[0]["params"][0].dtype
        self._cpu_offload = cpu_offload
        self._zero_local_rank = gpc.get_local_rank(ParallelMode.ZERO1)
@ -128,15 +127,18 @@ class HybridZeroOptimizer(BaseOptimizer):
        self._grad_store = GradientStore(ParallelMode.DATA)
        self._non_moe_bucket_store = BucketStore(ParallelMode.DATA)
        self._moe_bucket_store = BucketStore(ParallelMode.EXPERT_DATA)
+        self._bucket_in_progress = []

        # fp16 and fp32 params for mixed precision training
        self._fp16_param_groups = dict()
        self._fp32_flat_param_groups_of_current_rank = dict()

        # communication params
-        self._overlap_communication = overlap_communication
+        # self._overlap_communication = overlap_communication
        self._reduce_bucket_size = reduce_bucket_size

+        self._comm_bcast_stream = torch.cuda.Stream()
+
        # gradient scaler
        self.grad_scaler = DynamicGradScaler(
            initial_scale=initial_scale,
@ -166,7 +168,7 @@ class HybridZeroOptimizer(BaseOptimizer):
        )
        self.params_per_rank_id_dict = []
        self._param_bcast_sync_handler = param_bcast_sync_handler
-        if self._overlap_communication:
+        if self._overlap_sync_param:
            assert self._param_bcast_sync_handler is not None

        # iterate over the param group in the optimizer
@ -238,16 +240,9 @@ class HybridZeroOptimizer(BaseOptimizer):
        # flag used to skip unnecessary gradient reduce operation when gradient accumulation is enabled.
        self.skip_grad_reduce = False

-        # initialize communication stream for
-        # communication-computation overlapping
-        if self._overlap_communication:
-            self._comm_stream = torch.cuda.Stream()
-        else:
-            self._comm_stream = torch.cuda.current_stream()
-
        # reduction hook is only used if overlapping communication
        # if it is stage 1 without overlapping, no hook will be attached
-        if self._overlap_communication:
+        if self._overlap_sync_grad:
            self._attach_reduction_hook()

    @property
@ -278,7 +273,8 @@ class HybridZeroOptimizer(BaseOptimizer):
        param_list = param_group["params"]

        if self._is_moe_group(param_group):
-            # just add current params to params_per_rank[_zero_local_rank]
+            # for moe group, we do not need to partition the params, just add current
+            # params to params_per_rank[_zero_local_rank]
            params_per_rank[self._zero_local_rank] = list(param_list)
            self.params_per_rank_id_dict[-1][self._zero_local_rank].append(None)
            no_params_ranks = list(range(self._zero_world_size))
@ -290,7 +286,7 @@ class HybridZeroOptimizer(BaseOptimizer):
                global_id = str(i)
                for j in range(len(param.size())):
                    global_id = "_".join([global_id, str(param.size()[j])])
-                if self._overlap_communication:
+                if self._overlap_sync_param:
                    rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param)
                else:
                    rank_to_go = numel_per_rank.index(min(numel_per_rank))
@ -313,6 +309,12 @@ class HybridZeroOptimizer(BaseOptimizer):
    def _is_moe_group(self, param_group):
        return "moe" in param_group.keys() and param_group["moe"]

+    def _is_norm_group(self, param_group):
+        """Return the "norm" entry of the param group, or False when the key is absent."""
+        return param_group.get("norm", False)
+
+    def _is_gate_group(self, param_group):
+        """Return the "gate" entry of the param group, or False when the key is absent."""
+        return param_group.get("gate", False)
+
    def _attach_reduction_hook(self):
        # we iterate over the fp16 params
        # on each param, we register a hook to its AccumulateGrad object
@ -411,36 +413,41 @@ class HybridZeroOptimizer(BaseOptimizer):

     def _reduce_grads_by_rank(self, reduce_rank, grads, bucket_size, dp_parallel_mode):
+        """
+        Launch the reduction of ``grads`` and finish the reductions launched by the previous call.
+
+        The gradients are split by dtype, each dtype list is packed into one ``TensorBucket`` and
+        handed to ``_reduce_and_copy``. The buckets kept in ``self._bucket_in_progress`` are then
+        waited on, copied back and emptied before the new buckets take their place.
+
+        Args:
+            reduce_rank: The destination rank passed on to ``_reduce_and_copy``.
+            grads: The gradients to be reduced.
+            bucket_size: The size used to construct each ``TensorBucket``.
+            dp_parallel_mode: The parallel mode passed on to ``_reduce_and_copy``.
+        """
         grad_buckets_by_dtype = split_half_float_double(grads)
-
+        next_bucket_list = []
+        # add the gradients into one bucket per dtype and launch its reduction
         for tensor_list in grad_buckets_by_dtype:
             param_bucket = TensorBucket(size=bucket_size)
             for tensor in tensor_list:
                 param_bucket.add_to_bucket(tensor, allow_oversize=True)
-                if param_bucket.is_full_or_oversized():
-                    self._reduce_and_copy(
-                        bucket=param_bucket, reduce_rank=reduce_rank, dp_parallel_mode=dp_parallel_mode
-                    )
-                    param_bucket.empty()
             if not param_bucket.is_empty():
                 self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank, dp_parallel_mode=dp_parallel_mode)
+            # NOTE(review): the bucket is appended even when it is empty, in which case its commu_handle
+            # stays None and the wait() below would fail on the next call; presumably
+            # split_half_float_double never yields an empty list -- confirm.
+            next_bucket_list.append(param_bucket)
+
+        # wait for the completion of the previous bucket list reduction, and do unflatten_and_copy();
+        # here we can also overlap the communication with some memcpy operation caused by bucket.flatten()
+        for bucket in self._bucket_in_progress:
+            bucket.commu_handle.wait()
+            bucket.unflatten_and_copy()
+            bucket.empty()
+        self._bucket_in_progress = []
+        self._param_store.clear_grads_of_previous_reduced_params()
+
+        # the previous bucket list is finished, so the new buckets become the ones in progress
+        self._bucket_in_progress = next_bucket_list.copy()

     def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank, dp_parallel_mode):
+        """
+        Flatten the bucket and launch its reduction without waiting for the result.
+
+        The handle returned by ``reduce_tensor`` is stored in ``bucket.commu_handle``; the caller
+        is expected to wait on it before calling ``bucket.unflatten_and_copy()``.
+
+        Args:
+            bucket (TensorBucket): The bucket holding the tensors to be reduced.
+            reduce_rank: The destination rank, or None to all-reduce.
+            dp_parallel_mode: The parallel mode passed on to ``reduce_tensor``.
+        """
-        if self._overlap_communication:
-            self._comm_stream.synchronize()
-            self._param_store.clear_grads_of_previous_reduced_params()
+        # flatten the tensors and launch the reduce (all-reduce when reduce_rank is None)
+        bucket.flatten()
+        bucket.commu_handle = reduce_tensor(
+            tensor=bucket.get_flat_tensor(),
+            dtype=None,
+            dst_rank=reduce_rank,
+            parallel_mode=dp_parallel_mode,
+        )

-        with torch.cuda.stream(self._comm_stream):
-            flat = bucket.flatten()
-            reduced_flat = reduce_tensor(
-                tensor=flat,
-                dtype=self.dtype,
-                dst_rank=reduce_rank,
-                parallel_mode=dp_parallel_mode,
-            )
-
-            # update the reduced tensor
-            if reduce_rank is None or reduce_rank == self._zero_local_rank:
-                bucket.unflatten_and_copy(reduced_flat)
+        # only mark the bucket for copy-back when this rank receives the reduced result;
+        # the copy itself is done later by bucket.unflatten_and_copy()
+        if reduce_rank is None or reduce_rank == self._zero_local_rank:
+            bucket.set_unflatten_and_copy_flag(flag=True)

    def _has_inf_or_nan(self, tensor):
        try:
@ -524,6 +531,7 @@ class HybridZeroOptimizer(BaseOptimizer):
            grads = [self.padding_grad]
            params = [self.padding_tensor]

+        norm = 0
        if self._clip_grad_norm > 0:
            # this norm is before scaling, it will be very large
            norm = compute_norm(
@ -536,15 +544,23 @@ class HybridZeroOptimizer(BaseOptimizer):
        return norm

    def _compute_norm_with_moe_group(self, group_id):
-        parameters = self._param_store.get_fp16_params_by_rank_group(group_id=group_id, rank=self._zero_local_rank)
-        # wo do not get the average grad for moe parameters, so we have to constuct
-        # the gradients list hear. Maybe this can be optimized.
-        gradients = [p.grad for p in parameters]
-        norm = compute_norm(
-            gradients=gradients,
-            parameters=parameters,
-            last_stage=True,
-        )
+        params = self._param_store.get_fp16_params_by_rank_group(group_id=group_id, rank=self._zero_local_rank)
+        # we do not get the average grad for moe parameters, so we have to construct the gradients list here.
+        # Maybe this can be optimized.
+        grads = [p.grad for p in params]
+
+        if len(params) == 0:
+            grads = [self.padding_grad]
+            params = [self.padding_tensor]
+
+        norm = 0
+        if self._clip_grad_norm > 0:
+            norm = compute_norm(
+                gradients=grads,
+                parameters=params,
+                last_stage=True,
+                is_moe_group=True,
+            )

        # Need to allreduce(avg) the norms across different ranks because moe params will not be synced during allreduce
        # model and zero have been reduced!!!
@ -555,6 +571,7 @@ class HybridZeroOptimizer(BaseOptimizer):
        all_groups_norm = scaled_norm_tensor.item()
        return all_groups_norm

+    @llm_timeout(func_name="optim_step")
    def step(self, closure=None):
        """Performs a single optimization step.

@ -568,7 +585,7 @@ class HybridZeroOptimizer(BaseOptimizer):

        # if not overlapping communication (no reduction hook is attached)
        # we need to manually reduce these gradients
-        if not self._overlap_communication:
+        if not self._overlap_sync_grad:
            for group_id in range(len(self._fp16_param_groups)):
                for param in self._fp16_param_groups[group_id]:
                    # we should not reduce the param in moe
@ -582,25 +599,35 @@ class HybridZeroOptimizer(BaseOptimizer):
        # compute norm for gradients in the before bucket
        groups_norms = []
        for group_id in range(self.num_param_groups):
-            groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
+            if self._is_moe_group(self.optim.param_groups[group_id]):
+                groups_norms.append(None)
+            else:
+                groups_norms.append(self._compute_norm_with_stage(group_id=group_id))

        # clear reduced grads
-        if self._overlap_communication:
-            # grads in the last bucket is reduced
-            self._comm_stream.synchronize()
-            self._param_store.clear_grads_of_previous_reduced_params()
+        # grads in the last bucket is reduced
+        for bucket in self._bucket_in_progress:
+            bucket.commu_handle.wait()
+            bucket.unflatten_and_copy()
+            bucket.empty()
+        self._bucket_in_progress = []
+        self._param_store.clear_grads_of_previous_reduced_params()

        # compute norm for gradients in the last bucket
-        total_norms = []
+        total_norms = {}
        for group_id in range(self.num_param_groups):
+            group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
+            group_name = f"{group_id}_{group_name}"
            if self._is_moe_group(self.optim.param_groups[group_id]):
-                total_norms.append(self._compute_norm_with_moe_group(group_id=group_id))
+                total_norms[group_name] = self._compute_norm_with_moe_group(group_id=group_id)
            else:
-                total_norms.append(
-                    self._compute_norm_with_stage(
-                        group_id=group_id, last_bucket=True, last_stage=True, previous_norm=groups_norms[group_id]
-                    )
+                total_norms[group_name] = self._compute_norm_with_stage(
+                    group_id=group_id,
+                    last_bucket=True,
+                    last_stage=True,
+                    previous_norm=groups_norms[group_id],
                )
+
        timer("sync_grad").start()
        self._sync_grad()
        timer("sync_grad").stop()
@ -612,27 +639,44 @@ class HybridZeroOptimizer(BaseOptimizer):

        # check for overflow
        found_inf = False
+        found_nan = False
        # if there is INF values in grades, compute_norm func would also returns -1
        # thus, we try to avoid call _check_overflow here
        # found_inf = self._check_overflow()
        # Because you may encounter inf when computing norm

-        if -1 in norms:
+        if -1 in norms.values():
            found_inf = True
+
+        if -2 in norms.values():
+            found_nan = True
+
        loss_scale = float(self.loss_scale.item())  # backup
        if gpc.config.model.dtype is not torch.float32:
            self.grad_scaler.update(found_inf)
+
        # update loss scale if overflow occurs
        if found_inf:
            if gpc.is_rank_for_log():
                logger.warning("Overflow occurs, please check it.")
                send_alert_message(
-                    address=gpc.config.alert_address,
+                    address=gpc.config.monitor.alert.feishu_alert_address,
                    message="Overflow occurs, please check it.",
                )
            self._grad_store._averaged_gradients = dict()
            self.zero_grad()
-            return False, None
+            return False, norms
+
+        if found_nan:
+            if gpc.is_rank_for_log():
+                logger.warning("Nan grad norm occurs, please check it.")
+                send_alert_message(
+                    address=gpc.config.monitor.alert.feishu_alert_address,
+                    message="Nan grad norm  occurs, please check it.",
+                )
+            self._grad_store._averaged_gradients = dict()
+            self.zero_grad()
+            return False, norms

        # copy the grad of fp16 param to fp32 param
        single_grad_partition_groups = []
@ -658,21 +702,42 @@ class HybridZeroOptimizer(BaseOptimizer):
                param_shape == flat_fp32_avg_grads.shape
            ), f"fp32 param and grad have different shape {param_shape} vs {flat_fp32_avg_grads.shape}"

+            # Parameters shared within a TP group, such as norm and moe gate, have precision inconsistency in gradients.
+            # Therefore, it is recommended to synchronize gradients within the TP group to eliminate accumulated errors.
+            if self._is_norm_group(self.optim.param_groups[group_id]):
+                dist.all_reduce(
+                    flat_fp32_avg_grads,
+                    op=dist.ReduceOp.AVG,
+                    group=gpc.get_group(ParallelMode.TENSOR),
+                )
+
+            if self._is_gate_group(self.optim.param_groups[group_id]):
+                dist.all_reduce(
+                    flat_fp32_avg_grads,
+                    op=dist.ReduceOp.AVG,
+                    group=gpc.get_group(ParallelMode.TENSOR),
+                )
+
            single_grad_partition_groups.append(flat_fp32_avg_grads)
            device = self._fp32_flat_param_groups_of_current_rank[group_id].device
            self._fp32_flat_param_groups_of_current_rank[group_id].grad = flat_fp32_avg_grads.to(device)

        # unscale and clip grads
        # get the global norm
-        global_norm_groups = []
+        global_norm_groups = {}
        if self._clip_grad_norm > 0:
-            for group_id in range(self.num_param_groups):
-                global_norm_groups.append(norms[group_id] ** 0.5)
+            for group_name, norm in norms.items():
+                global_norm_groups[group_name] = norm**0.5

        # the following operations are performed only on the rank to which parameters are assigned.
        if gpc.config.model.dtype is not torch.float32:
-            if len(single_grad_partition_groups) != 0:
-                self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale)
+            if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
+                self._unscale_and_clip_grads(
+                    single_grad_partition_groups,
+                    list(global_norm_groups.values()),
+                    loss_scale,
+                )
+
        # update the parameters
        timer("step").start()

@ -691,14 +756,17 @@ class HybridZeroOptimizer(BaseOptimizer):
                    fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
                    fp16_param.data.copy_(fp32_param)

-        with torch.cuda.stream(self._comm_stream):
+        torch.cuda.synchronize()
+        with torch.cuda.stream(self._comm_bcast_stream):
            self.broadcast_params()

        timer("step").stop()

        # update gradients may not be needed here, because the sync_params function is used in initialization,
        # so synchronization is maintained
-        return True, [global_norm / loss_scale for global_norm in global_norm_groups]
+        for group_name, global_norm in global_norm_groups.items():
+            global_norm_groups[group_name] = global_norm / loss_scale
+        return True, global_norm_groups

    def broadcast_params(self):
        handles = []
@ -720,7 +788,7 @@ class HybridZeroOptimizer(BaseOptimizer):
                async_op=True,
            )

-            if self._overlap_communication:
+            if self._overlap_sync_param:
                self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
            else:
                handles.append(handle)
@ -818,3 +886,17 @@ class HybridZeroOptimizer(BaseOptimizer):

        if "zero_devide_optim_plan" in states:
            self.params_per_rank_id_dict = states["zero_devide_optim_plan"]
+
+
+def reload_zero_fp32_buff(optimizer):
+    """
+    Copy the flat fp16 parameters of the current zero rank into the optimizer's fp32 buffers.
+
+    If we use the AMP optimizer, its fp32 buffer must be updated with the newly loaded weights,
+    unless the model weights are loaded before zero is initialized. Nothing is done when
+    ``optimizer`` is not a ``HybridZeroOptimizer``.
+
+    Args:
+        optimizer: The optimizer whose fp32 buffers should be refreshed.
+    """
+    if not isinstance(optimizer, HybridZeroOptimizer):
+        return
+
+    for group_id, param_group in enumerate(optimizer.optim.param_groups):
+        if not optimizer.param_group_has_params[group_id]:
+            continue
+
+        # the flat fp16 params have already been updated by 'load_model_checkpoint'
+        fp16_flat = optimizer._param_store.get_flat_fp16_param_by_rank_group(optimizer._zero_local_rank, group_id)
+        # param_group["params"] holds the flat fp32 optimizer states of this zero rank
+        fp32_flat = param_group["params"][0]
+        fp32_flat.data.copy_(fp16_flat.float())
--- a/internlm/solver/optimizer/store.py
+++ b/internlm/solver/optimizer/store.py
@ -253,11 +253,17 @@ class ParameterStore(BaseStore):
        if not last_bucket:
            if group_id not in self._former_bucket_reduced_param:
                return [], []
-            return self._former_bucket_reduced_param[group_id], self._former_bucket_reduced_grad[group_id]
+            return (
+                self._former_bucket_reduced_param[group_id],
+                self._former_bucket_reduced_grad[group_id],
+            )
        else:
            if group_id not in self._last_bucket_reduced_param:
                return [], []
-            return self._last_bucket_reduced_param[group_id], self._last_bucket_reduced_grad[group_id]
+            return (
+                self._last_bucket_reduced_param[group_id],
+                self._last_bucket_reduced_grad[group_id],
+            )

    def reset_reduced_data_for_compute_norm(self):
        self._former_bucket_reduced_param = {}
@ -281,6 +287,9 @@ class TensorBucket:
        self._max_size = size
        self._current_size = 0
        self._bucket = []
+        self._flat_tensor = None
+        self._unflatten_and_copy_flag = False
+        self.commu_handle = None

    @property
    def max_size(self):
@ -296,6 +305,15 @@ class TensorBucket:
    def is_empty(self):
        return len(self._bucket) == 0

+    def set_unflatten_and_copy_flag(self, flag):
+        """Set the flag that unflatten_and_copy() checks before copying the flat tensor back."""
+        self._unflatten_and_copy_flag = flag
+
+    def get_unflatten_and_copy_flag(self):
+        """Return the flag that unflatten_and_copy() checks before copying the flat tensor back."""
+        return self._unflatten_and_copy_flag
+
+    def get_flat_tensor(self):
+        """Return the tensor stored by flatten(), or None if flatten() has not run since the last empty()."""
+        return self._flat_tensor
+
    def add_to_bucket(self, tensor, allow_oversize=False):
        tensor_size = tensor.numel()

@ -316,11 +334,14 @@ class TensorBucket:
    def empty(self):
        self._bucket = []
        self._size = 0
+        self._flat_tensor = None
+        self.commu_handle = None

     def flatten(self):
+        """Flatten the held tensors and keep the result for get_flat_tensor() instead of returning it."""
-        return _flatten_dense_tensors(self._bucket)
+        self._flat_tensor = _flatten_dense_tensors(self._bucket)

-    def unflatten_and_copy(self, flat_tensor):
-        unflattened_tensor_list = _unflatten_dense_tensors(flat_tensor, self._bucket)
-        for old, new in zip(self._bucket, unflattened_tensor_list):
-            old.copy_(new)
+    def unflatten_and_copy(self):
+        """
+        Copy the stored flat tensor back into the held tensors.
+
+        Nothing is done unless the flag set by set_unflatten_and_copy_flag() is truthy.
+        """
+        if not self._unflatten_and_copy_flag:
+            return
+
+        reduced_views = _unflatten_dense_tensors(self._flat_tensor, self._bucket)
+        for target, reduced in zip(self._bucket, reduced_views):
+            target.copy_(reduced)
--- a/internlm/solver/optimizer/utils.py
+++ b/internlm/solver/optimizer/utils.py
@ -95,37 +95,34 @@ def reduce_tensor(tensor, dtype=None, dst_rank=None, parallel_mode=ParallelMode.
    :type parallel_mode: ParallelMode, optional
    """
    # use the original dtype
-    if dtype is None:
-        dtype = tensor.dtype
+    # if dtype is None:
+    assert dtype is None
+    dtype = tensor.dtype

    # cast the data to specified dtype for reduce/all-reduce
-    if tensor.dtype != dtype:
-        tensor_to_reduce = tensor.to(dtype)
-    else:
-        tensor_to_reduce = tensor
+    # if tensor.dtype != dtype:
+    #     tensor_to_reduce = tensor.to(dtype)
+    # else:
+    #     tensor_to_reduce = tensor

-    world_size = gpc.get_world_size(parallel_mode)
+    # world_size = gpc.get_world_size(parallel_mode)
+    # tensor.div_(world_size)
    group = gpc.get_group(parallel_mode)
-    tensor_to_reduce.div_(world_size)

    # if rank is None, all reduce will be used
    # else, reduce is used
    use_all_reduce = dst_rank is None

    if use_all_reduce:
-        dist.all_reduce(tensor_to_reduce, group=group)
+        handle = dist.all_reduce(tensor=tensor, group=group, op=torch.distributed.ReduceOp.AVG, async_op=True)
    else:
        ranks_in_group = gpc.get_ranks_in_group(parallel_mode)
        global_rank = ranks_in_group[dst_rank]
-        dist.reduce(tensor=tensor_to_reduce, dst=global_rank, group=group)
+        handle = dist.reduce(
+            tensor=tensor, dst=global_rank, group=group, op=torch.distributed.ReduceOp.AVG, async_op=True
+        )

-    # recover the original dtype
-    if tensor.dtype != dtype and tensor is not tensor_to_reduce:
-        local_rank = gpc.get_local_rank(parallel_mode)
-        if use_all_reduce or dst_rank == local_rank:
-            tensor.copy_(tensor_to_reduce)
-
-    return tensor
+    return handle


 def has_inf_or_nan(tensor):
@ -212,7 +209,7 @@ def calc_lp(grads, norm_type):
    return norm


-def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, norm_type=2):
+def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, norm_type=2, is_moe_group=False):
    """Get the norm
    Arguments:
        gradients (Iterable[Tensor]): The gradient value.
@ -305,7 +302,8 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no

        # This is because we use zero1, so we need to use this reduction.
        # TODO: Check zero group to be a subset of dp group.
-        dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.ZERO1))
+        if not is_moe_group:
+            dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.ZERO1))

        if torch.is_tensor(total_norm):
            total_norm = total_norm.item()
@ -314,6 +312,9 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
    if total_norm == float("inf") or total_norm == -float("inf"):
        total_norm = -1

+    if math.isnan(total_norm):
+        total_norm = -2
+
    return total_norm


--- a/internlm/train/init.py
+++ b/internlm/train/init.py
@ -1,7 +1,6 @@
 from .training_internlm import (
    get_train_data_loader,
    get_validation_data_loader,
-    initialize_distributed_env,
    initialize_llm_profile,
    initialize_model,
    initialize_optimizer,
@ -12,7 +11,6 @@ from .training_internlm import (
 __all__ = [
    "get_train_data_loader",
    "get_validation_data_loader",
-    "initialize_distributed_env",
    "initialize_llm_profile",
    "initialize_model",
    "initialize_optimizer",
--- a/internlm/train/training_internlm.py
+++ b/internlm/train/training_internlm.py
@ -10,9 +10,9 @@ import torch.distributed as dist
 from torch import nn
 from torch.utils.data import ConcatDataset, DataLoader

-import internlm
 from internlm.core.context import ParallelMode
 from internlm.core.context import global_context as gpc
+from internlm.core.context.random import set_mode
 from internlm.core.naive_amp import NaiveAMPModel
 from internlm.core.trainer import TrainState
 from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
@ -25,14 +25,14 @@ from internlm.data.packed_dataset import (
    get_packed_dataset_without_short_length,
 )
 from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
-from internlm.model.moe import create_moe_param_groups, has_moe_layers
-from internlm.monitor import set_env_var
+from internlm.monitor import send_heartbeat, set_env_var
 from internlm.monitor.monitor import monitor_manager as mm
 from internlm.solver.beta2_scheduler import Beta2Scheduler
 from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
 from internlm.solver.optimizer import HybridZeroOptimizer
 from internlm.solver.optimizer.utils import ParamBcastSyncHandler
-from internlm.utils.common import DummyProfile, get_master_node
+from internlm.train.utils import create_param_groups
+from internlm.utils.common import DummyProfile
 from internlm.utils.logger import get_logger
 from internlm.utils.megatron_timers import megatron_timer as timer
 from internlm.utils.parallel import (
@ -41,41 +41,19 @@ from internlm.utils.parallel import (
    sync_model_param_within_tp,
 )
 from internlm.utils.registry import MODEL_INITIALIZER
+from internlm.utils.timeout import llm_timeout

 logger = get_logger(__file__)


-def initialize_distributed_env(config: str, launcher: str = "slurm", master_port: int = 8888, seed: int = 1024):
-    """
-    Initialize distributed environment for distributed training.
-
-    Args:
-        config (str): Config file path.
-        launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
-        master_port (str): The master port for distributed training. 8888 by default.
-        seed (int, optional): Specified random seed for every process. 1024 by default.
-    """
-
-    torch.cuda.empty_cache()
-
-    if launcher == "torch":
-        internlm.launch_from_torch(config=config, seed=seed)
-    elif launcher == "slurm":
-        internlm.launch_from_slurm(
-            config=config,
-            host=get_master_node(),
-            port=master_port,
-            seed=seed,
-        )
-    else:
-        assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
-
-
+@llm_timeout(func_name="initialize_model")
 def initialize_model():
    """
-    Initialize model.
+    Initialize model with Automatic Mixed Precision.

-    Returns: The neural network model to be trained or evaluated.
+    Returns:
+        torch.nn.Module:
+            The neural network model to be trained or evaluated.
    """

    model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
@ -108,22 +86,33 @@ def initialize_model():
    # the same across tensor parallelism.
    sync_model_param_within_tp(model)

+    # Change the random state mode to ParallelMode.DATA after the model is built, guaranteeing that
+    # the random states within the same dp group are all the same.
+    set_mode(ParallelMode.DATA)
+
    return model


+@llm_timeout(func_name="initialize_optimizer")
 def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
    """
    Initialize optimizer.

    Args:
-        model (torch.nn.Module): Your model instance to be trained or evaluated.
+        model (:class:`torch.nn.Module`): Your model instance to be trained or evaluated.

-    Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
+    Returns:
+        A tuple of (optimizer, beta2_scheduler, lr_scheduler).
    """
-    param_bcast_sync_handler = ParamBcastSyncHandler(model)
+    if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
+        param_bcast_sync_handler = ParamBcastSyncHandler(model)
+    else:
+        param_bcast_sync_handler = None
+
    adam_cfg = gpc.config.adam
+    # split the moe parameters into different groups
    if gpc.config.model.num_experts > 1:
-        params = create_moe_param_groups(model, adam_cfg.weight_decay)
+        params = create_param_groups(model, adam_cfg.weight_decay)
    else:
        params = [{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}]
    naive_optimizer = torch.optim.AdamW(
@ -133,12 +122,10 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
        eps=adam_cfg.adam_eps,
    )

-    has_moe = has_moe_layers(model)
    optimizer = HybridZeroOptimizer(
        naive_optimizer,
        grad_scal_cfg=gpc.config.grad_scaler,
        zero_cfg=gpc.config.hybrid_zero_optimizer,
-        has_moe=has_moe,
        param_bcast_sync_handler=param_bcast_sync_handler,
    )

@ -149,13 +136,21 @@ def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
    return optimizer, beta2_scheduler, lr_scheduler


+@llm_timeout(func_name="get_train_data_loader")
 def get_train_data_loader(
    num_worker: int = 0, dataset_generate_func: Callable = None, train_sampler=None, train_collate_fn=None
 ):
    """
    Generate and return the training data loader.

-    Returns: A tuple of (train_dl, dataset_types).
+    Args:
+        num_worker (:class:`int`): number of subprocesses used for dataloader.
+        dataset_generate_func (:class:`Callable`, optional): function used to generate the dataset.
+        train_sampler (:class:`torch.utils.data.sampler`, optional): dataset sampler for training dataloader.
+        train_collate_fn (:class:`Callable`, optional): collate function for training dataloader.
+
+    Returns:
+        A tuple of (train_dl, dataset_types).
    """

    # Get the dataset types
@ -221,6 +216,7 @@ def get_train_data_loader(
    return train_dl, dataset_types


+@llm_timeout(func_name="get_validation_data_loader")
 def get_validation_data_loader(
    num_worker: int = 0, dataset_generate_func: Callable = None, val_collate_fn=None, dataloader_func=None
 ):
@ -282,6 +278,7 @@ def get_validation_data_loader(
    return val_dls


+@llm_timeout(func_name="load_new_batch")
 def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
    """
    Load and return the new batch data based on training data loader.
@ -339,6 +336,7 @@ def initialize_llm_profile(profiling: bool = False, start_time: str = None):
    )


+@llm_timeout(func_name="record_current_batch_training_metrics")
 def record_current_batch_training_metrics(
    get_tflops_func,
    logger,
@ -363,6 +361,7 @@ def record_current_batch_training_metrics(

    set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))

+    timer.store_last_timers()
    if success_update in (0, True):
        train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
    if is_no_pp_or_last_stage():
@ -380,9 +379,52 @@ def record_current_batch_training_metrics(
        max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
        max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
        min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
-
-        tk_per_gpu = 0
+        time_cost = time.time() - start_time
        tk_per_gpu = round(
+            num_tokens_in_batch * gpc.get_world_size(ParallelMode.DATA) / gpc.get_world_size(ParallelMode.GLOBAL),
+            4,
+        )
+        tgs_statistic = train_state.tgs_statistic
+        tgs_statistic["sum_step"] += 1
+        tgs_statistic["sum_tg"] += tk_per_gpu
+        tgs_statistic["sum_time"] += time_cost
+        tgs_statistic["sum_last_tg_10"] += tk_per_gpu
+        tgs_statistic["sum_last_time_10"] += time_cost
+        tgs_statistic["sum_last_tg_50"] += tk_per_gpu
+        tgs_statistic["sum_last_time_50"] += time_cost
+        tgs_statistic["SMA_tg_50"] += tk_per_gpu
+        tgs_statistic["SMA_time_50"] += time_cost
+        tgs_statistic["SMA_tg_50_list"].append(tk_per_gpu)
+        tgs_statistic["SMA_time_50_list"].append(time_cost)
+        if tgs_statistic["sum_step"] > 50:
+            tgs_statistic["SMA_tg_50"] -= tgs_statistic["SMA_tg_50_list"][0]
+            tgs_statistic["SMA_time_50"] -= tgs_statistic["SMA_time_50_list"][0]
+            tgs_statistic["SMA_tg_50_list"].popleft()
+            tgs_statistic["SMA_time_50_list"].popleft()
+
+        last_tgs_1 = round(tk_per_gpu / time_cost, 2)
+        tgs_statistic["sum_tgs"] += last_tgs_1
+
+        if tgs_statistic["sum_step"] % 10 == 0:
+            tgs_statistic["last_tgs_10"] = round(tgs_statistic["sum_last_tg_10"] / tgs_statistic["sum_last_time_10"], 2)
+            tgs_statistic["sum_last_tg_10"] = 0
+            tgs_statistic["sum_last_time_10"] = 0
+
+        if tgs_statistic["sum_step"] % 50 == 0:
+            tgs_statistic["last_tgs_50"] = round(tgs_statistic["sum_last_tg_50"] / tgs_statistic["sum_last_time_50"], 2)
+            tgs_statistic["sum_last_tg_50"] = 0
+            tgs_statistic["sum_last_time_50"] = 0
+
+        last_tgs_10 = tgs_statistic["last_tgs_10"]
+        last_tgs_50 = tgs_statistic["last_tgs_50"]
+
+        tgs_all = round(tgs_statistic["sum_tg"] / tgs_statistic["sum_time"], 2)
+        tgs_avg = round(tgs_statistic["sum_tgs"] / tgs_statistic["sum_step"], 2)
+        tgs_SMA = round(tgs_statistic["SMA_tg_50"] / tgs_statistic["SMA_time_50"], 2)
+
+        tflops = get_tflops_func((time.time() - start_time))
+
+        tgs_origin = round(
            num_tokens_in_batch
            * gpc.get_world_size(ParallelMode.DATA)
            / gpc.get_world_size(ParallelMode.GLOBAL)
@ -390,14 +432,18 @@ def record_current_batch_training_metrics(
            2,
        )

-        tflops = get_tflops_func((time.time() - start_time))
-
        infos = {
            "tflops": tflops,
            "step": batch_count,
-            "loss": loss.item(),
+            "loss": loss.item() - moe_loss.item(),
            "moe_loss": moe_loss.item(),
-            "tgs (tokens/gpu/second)": tk_per_gpu,
+            "tgs (tokens/gpu/second)": tgs_origin,
+            "tgs/last_tgs_1": last_tgs_1,
+            "tgs/tgs_all": tgs_all,
+            "tgs/tgs_avg": tgs_avg,
+            "tgs/tgs_SMA": tgs_SMA,
+            "tgs/last_tgs_10": last_tgs_10,
+            "tgs/last_tgs_50": last_tgs_50,
            "lr": lr,
            "loss_scale": scaler,
            "grad_norm": grad_norm,
@ -421,27 +467,41 @@ def record_current_batch_training_metrics(
        line = ""
        for key, value in infos.items():
            line += f"{key}={value} "
-            writer.add_scalar(key=key, value=value, step=train_state.step_count)
+            if isinstance(value, dict):
+                writer.add_scalars(key=key, value=value, step=train_state.step_count)
+            else:
+                writer.add_scalar(key=key, value=value, step=train_state.step_count)
+
+        if gpc.config.monitor.alert.get("light_monitor_address", None) and batch_count % 50 == 0:
+            send_heartbeat("train_metrics", infos)

        if update_panel:
+            # metrics shown with dashboard panels
+            panel_metrics = {
+                "step": batch_count,
+                "lr": lr,
+                "num_consumed_tokens": train_state.num_consumed_tokens,
+                "loss": loss.item() - moe_loss.item(),
+                "flops": tflops,
+                "tgs": last_tgs_1,
+                "acc": acc_perplex["acc"],
+                "perplexity": acc_perplex["perplexity"],
+                "fwd_bwd_time": fwd_bwd_time,
+            }
+            for norm_key, norm_value in grad_norm.items():
+                panel_metrics[norm_key] = norm_value
+
            logger.info(
-                line,
-                extra={
-                    "step": batch_count,
-                    "lr": lr,
-                    "num_consumed_tokens": train_state.num_consumed_tokens,
-                    "grad_norm": grad_norm,
-                    "loss": loss.item(),
-                    "moe_loss": moe_loss.item(),
-                    "flops": tflops,
-                    "tgs": tk_per_gpu,
-                    "acc": acc_perplex["acc"],
-                    "perplexity": acc_perplex["perplexity"],
-                    "fwd_bwd_time": fwd_bwd_time,
-                },
+                "{line}",
+                line=line,
+                extra=panel_metrics,
            )
        else:
            logger.info(line)

        # if loss spike occurs, send alert info to feishu
-        mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
+        mm.monitor_loss_spike(
+            alert_address=gpc.config.monitor.alert.feishu_alert_address,
+            step_count=batch_count,
+            cur_step_loss=loss.item(),
+        )
--- a/Show More
+++ b/Show More