Merge branch 'main' into develop

pull/275/head^2
yingtongxiong 2023-09-05 17:45:26 +08:00
commit 0e62d41137
71 changed files with 3116 additions and 898 deletions


@ -66,9 +66,23 @@ jobs:
- uses: actions/checkout@v3
- name: slurm-train
id: basic_train
run: |
source activate internlm-env-test
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
- name: load_new_ckpt
run: |
source activate internlm-env-test
export PYTHONPATH=$PWD:$PYTHONPATH
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
- name: torchrun-train
@ -96,7 +110,6 @@ jobs:
srun -p ${SLURM_PARTITION} --job-name=${GITHUB_RUN_ID}-${GITHUB_JOB} --gpus-per-task=2 python ../ci_scripts/model/loaded_as_transformer.py
cd ..
rm -rf $GITHUB_WORKSPACE/hf_ckpt
load-chat-model-in-hf:
if: ${{ always() }}
needs: check-requirements

.readthedocs.yml (new file, 28 lines)

@ -0,0 +1,28 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# Required
version: 2
# Set the OS, Python version and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.8"
# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: doc/code-docs/source/conf.py
fail_on_warning: false
# Optionally build your docs in additional formats such as PDF
formats:
- pdf
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: doc/code-docs/requirements.txt


@ -40,6 +40,10 @@ InternLM has open-sourced a base model with 7 billion parameters and a practical
Additionally, a lightweight training framework is provided that supports model pre-training without extensive dependencies. With a single codebase, it supports pre-training on large clusters with thousands of GPUs and fine-tuning on a single GPU, while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency when training on 1024 GPUs.
## News
InternLM-7B-Chat v1.1 has been released with code interpreter and function calling capabilities. You can try it with [Lagent](https://github.com/InternLM/lagent).
## InternLM-7B
### Performance Evaluation
@ -80,8 +84,8 @@ To load the InternLM 7B Chat model using Transformers
```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "こんにちは", history=[])
>>> print(response)


@ -45,6 +45,10 @@ InternLM (书生·浦语) includes 7-billion-parameter models for practical scenarios
A lightweight training framework is provided to support model pre-training without installing a large number of dependencies. A single codebase supports pre-training on thousand-GPU clusters and human-preference alignment training on a single GPU, with extreme performance optimization achieving nearly 90% acceleration efficiency for thousand-GPU training.
## News
We have open-sourced InternLM-Chat-7B v1.1. The model can invoke a code interpreter and tool plugins. You can try these new features in [Lagent](https://github.com/InternLM/lagent).
## InternLM-7B
### Performance Evaluation
@ -74,6 +78,7 @@ InternLM (书生·浦语) includes 7-billion-parameter models for practical scenarios
| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------ |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |
@ -85,8 +90,8 @@ InternLM (书生·浦语) includes 7-billion-parameter models for practical scenarios
```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "你好", history=[])
>>> print(response)
@ -117,26 +122,44 @@ streamlit run web_demo.py
We use [LMDeploy](https://github.com/InternLM/LMDeploy) for one-click deployment of InternLM.
1. First, install LMDeploy:
```bash
python3 -m pip install lmdeploy
```
2. Use the following command for quick deployment:
Run the following commands to chat interactively with the `internlm-chat-7b` model in the terminal, or chat with it through the WebUI.
```bash
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-7b/model
```
# convert the weight layout
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
3. After exporting the model, you can start a service directly with the following commands and chat with the AI assistant on the client:
# interactive chat in the terminal
python3 -m lmdeploy.turbomind.chat ./workspace
# start the gradio service
python3 -m lmdeploy.serve.gradio.app ./workspace
```
In the steps above, LMDeploy uses FP16 computation precision.
Besides FP16, LMDeploy also supports inference with the 4-bit quantized `internlm-chat-7b` model. It not only reduces the model's GPU memory footprint to 6 GB, roughly 40% of FP16, but, more importantly, with heavily optimized kernels its inference performance reaches more than 2.4x that of FP16 on an A100-80G.
The deployment steps for the 4-bit `internlm-chat-7b` model are as follows. For inference speed benchmarks, please refer to [here](https://github.com/InternLM/lmdeploy/blob/main/docs/zh_cn/w4a16.md#%E6%8E%A8%E7%90%86%E9%80%9F%E5%BA%A6).
```bash
bash workspace/service_docker_up.sh
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
```
# download the pre-quantized internlm-chat-7b model from huggingface
git-lfs install
git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
# Convert the model's layout and store it in the default path, ./workspace.
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
# inference lmdeploy's turbomind engine
python3 -m lmdeploy.turbomind.chat ./workspace
# serving with gradio
python3 -m lmdeploy.serve.gradio.app ./workspace
```
LMDeploy is a full toolkit for compressing, deploying, and serving LLMs. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
[LMDeploy](https://github.com/InternLM/LMDeploy) supports the complete InternLM deployment workflow. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
## Fine-tuning & Training


@ -45,6 +45,10 @@ InternLM has open-sourced a 7 billion parameter base model and a chat model tail
Additionally, a lightweight training framework is offered to support model pre-training without the need for extensive dependencies. With a single codebase, it supports pre-training on large-scale clusters with thousands of GPUs, and fine-tuning on a single GPU while achieving remarkable performance optimizations. InternLM achieves nearly 90% acceleration efficiency during training on 1024 GPUs.
## News
InternLM-7B-Chat v1.1 is released with code interpreter and function calling capability. You can try it with [Lagent](https://github.com/InternLM/lagent).
## InternLM-7B
### Performance Evaluation
@ -74,6 +78,7 @@ InternLM 7B and InternLM 7B Chat, trained using InternLM, have been open-sourced
| Model | InternLM Format Weight Download Link | Transformers Format Weight Download Link |
| ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------- |
| **InternLM 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-7b) | [🤗internlm/intern-7b](https://huggingface.co/internlm/internlm-7b) |
| **InternLM Chat 7B v1.1** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-v1.1) | [🤗internlm/intern-chat-7b-v1.1](https://huggingface.co/internlm/internlm-chat-7b-v1.1) |
| **InternLM Chat 7B** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b) | [🤗internlm/intern-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) |
| **InternLM Chat 7B 8k** | [![Open in OpenXLab](https://cdn-static.openxlab.org.cn/header/openxlab_models.svg)](https://openxlab.org.cn/models/detail/OpenLMLab/InternLM-chat-7b-8k) | [🤗internlm/intern-chat-7b-8k](https://huggingface.co/internlm/internlm-chat-7b-8k) |
@ -85,8 +90,8 @@ To load the InternLM 7B Chat model using Transformers, use the following code:
```python
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()
>>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True)
>>> model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b-v1_1", trust_remote_code=True).cuda()
>>> model = model.eval()
>>> response, history = model.chat(tokenizer, "hello", history=[])
>>> print(response)
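>>> # Illustrative follow-up turn (the prompt text is hypothetical): the returned
>>> # `history` can be passed back in to continue the multi-turn dialogue.
>>> response, history = model.chat(tokenizer, "please give three tips for staying focused", history=history)
>>> print(response)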
@ -118,28 +123,45 @@ The effect is as follows
### Deployment
We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the one-click deployment of InternLM.
1. First, install LMDeploy:
We use [LMDeploy](https://github.com/InternLM/LMDeploy) to complete the full workflow of InternLM deployment.
```bash
python3 -m pip install lmdeploy
```
2. Use the following command for quick deployment:
You can use the following commands to run `internlm-chat-7b` FP16 inference, serve it, and interact with the AI assistant via the WebUI:
```bash
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b /path/to/internlm-chat-7b/model
# convert weight layout
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b
# inference lmdeploy's turbomind engine
python3 -m lmdeploy.turbomind.chat ./workspace
# serving with gradio
python3 -m lmdeploy.serve.gradio.app ./workspace
```
3. After exporting the model, you can start a server and have a conversation with the deployed model using the following command:
You can also deploy the 4-bit quantized `internlm-chat-7b` model via LMDeploy. It trims the model's memory overhead down to 6 GB, just 40% of what FP16 inference would take. More importantly, with extremely optimized kernels, its inference performance reaches more than 2.4x that of FP16 inference on an A100-80G.
Try the following to run 4-bit `internlm-chat-7b` on a GeForce RTX 30-series GPU. You can find the inference benchmark [here](https://github.com/InternLM/lmdeploy/blob/main/docs/en/w4a16.md#inference-performance).
```bash
bash workspace/service_docker_up.sh
python3 -m lmdeploy.serve.client {server_ip_addresss}:33337
# download the pre-quantized internlm-chat-7b model from huggingface
git-lfs install
git clone https://huggingface.co/lmdeploy/llama2-chat-7b-w4
# Convert the model's layout and store it in the default path, ./workspace.
python3 -m lmdeploy.serve.turbomind.deploy internlm-chat-7b ./llama2-chat-7b-w4 awq --group-size 128
# inference lmdeploy's turbomind engine
python3 -m lmdeploy.turbomind.chat ./workspace
# serving with gradio
python3 -m lmdeploy.serve.gradio.app ./workspace
```
[LMDeploy](https://github.com/InternLM/LMDeploy) provides a complete workflow for deploying InternLM. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
LMDeploy is an efficient toolkit for compressing, deploying, and serving LLMs. Please refer to the [deployment tutorial](https://github.com/InternLM/LMDeploy) for more details on deploying InternLM.
## Fine-tuning & Training


@ -0,0 +1,29 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
def merge_dicts(dict_a: dict, dict_b: dict):
for key in dict_b.keys():
if isinstance(dict_b[key], dict):
dict_b[key] = {**dict_a[key], **dict_b[key]}
merge_dicts(dict_a[key], dict_b[key])
dict_c = {**dict_a, **dict_b}
return dict_c
def format_dict_to_py_string(data: dict, indent=0, is_nested=False):
result = ""
for key, value in data.items():
if isinstance(value, dict):
result += f"{' ' * indent}{key} = dict(\n"
result += format_dict_to_py_string(value, indent + 4, is_nested=True)
result += f"{' ' * indent})"
else:
result += f"{' ' * indent}{key} = {repr(value)}"
if is_nested:
result += ","
result += "\n"
result = f"""\
{result}
"""
return result
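Taken together, these helpers deep-merge a test-case override into a base config dict and render the result back as Python source. A minimal usage sketch (the dictionaries below are made up for illustration; the import assumes the repository root is on `PYTHONPATH`, as the CI workflow sets):
```python
from ci_scripts.common.com_func import format_dict_to_py_string, merge_dicts

base = {"model": {"dtype": "torch.float16", "num_layers": 32}, "data": {"seq_len": 2048}}
override = {"model": {"num_layers": 16}}

merged = merge_dicts(base, override)
# Nested dicts are merged key by key instead of being replaced wholesale:
# merged == {"model": {"dtype": "torch.float16", "num_layers": 16}, "data": {"seq_len": 2048}}

# Rendered roughly as:
#   model = dict(
#       dtype = 'torch.float16',
#       num_layers = 16,
#   )
#   data = dict(
#       seq_len = 2048,
#   )
print(format_dict_to_py_string(merged))
```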


@ -1,6 +1,6 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("internlm/internlm-chat-7b", trust_remote_code=True).cuda()


@ -10,7 +10,7 @@ VOCAB_SIZE = 103168
# Ckpt folder format:
# fs: 'local:/mnt/nfs/XXX'
# oss: 'boto3:s3://model_weights/XXX'
MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# MODEL_ONLY_FOLDER = "local:llm_ckpts/xxxx"
# SAVE_CKPT_FOLDER = "local:llm_ckpts"
SAVE_CKPT_FOLDER = "local:llm_ckpts"
# LOAD_CKPT_FOLDER = "local:llm_ckpts/49"
@ -27,7 +27,7 @@ ckpt = dict(
load_optimizer=True,
)
TRAIN_FOLDER = "/mnt/petrelfs/qa-caif-cicd/data/lm_data/alpaca_data/train/en"
TRAIN_FOLDER = "local:../lm_data/alpaca_data/train/en"
data = dict(
seq_len=SEQ_LEN,
# micro_num means the number of micro_batch contained in one gradient update
@ -120,8 +120,8 @@ zero1 parallel:
2. if zero1 == 1, zero is not used, and all dp groups retain the full amount of model parameters.
3. zero1 > 1 and zero1 <= dp world size, the world size of zero is a subset of dp world size.
For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8.
pipeline parallel: pipeline parallel size.
tensor parallel: tensor parallel size, usually the number of GPUs per node.
pipeline parallel: pipeline parallel size, only 1 is accepted currently.
tensor parallel: tensor parallel size, usually the number of GPUs per node, only 1 is accepted currently.
"""
parallel = dict(
zero1=8,


@ -0,0 +1,49 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import argparse
import json
import os
from ci_scripts.common import com_func
from internlm.core.context import Config
def generate_new_config(config_py_file, test_config_json, case_name):
# generate path of the new config py
config_path = os.path.split(config_py_file)
new_config_py_file = os.path.join(config_path[0], case_name + ".py")
# merge dict
origin_config = Config.from_file(config_py_file)
with open(test_config_json) as f:
test_config = json.load(f)
if test_config:
if case_name not in test_config.keys():
raise KeyError(f"the {case_name} doesn't exist.Please check {test_config} again!")
new_config = com_func.merge_dicts(origin_config, test_config[case_name])
print(f"new config is:\n{new_config}")
# write new config to py file
file_content = com_func.format_dict_to_py_string(new_config)
with open(new_config_py_file, "w") as f:
f.write(file_content)
print(f"The new test train config file is {new_config_py_file}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--origin_config",
type=str,
default="./ci_scripts/train/ci_7B_sft.py",
help="path to the origin train config file",
)
parser.add_argument(
"--test_config",
type=str,
default="./ci_scripts/train/test_config.json",
help="path to the test train config file",
)
parser.add_argument("--case_name", type=str, help="name of the case which will be runned ")
args = parser.parse_args()
generate_new_config(args.origin_config, args.test_config, args.case_name)
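For reference, the same generation can be driven directly from Python rather than via the CLI defaults above. A minimal sketch, assuming the repository root is on `PYTHONPATH` (as the CI workflow sets), that `ci_scripts/train` is importable as a package, and that the case name is one of the keys in `test_config.json`:
```python
from ci_scripts.train.generate_config import generate_new_config

# Writes ./ci_scripts/train/7B_load_new_ckpt.py next to the origin config file.
generate_new_config(
    config_py_file="./ci_scripts/train/ci_7B_sft.py",
    test_config_json="./ci_scripts/train/test_config.json",
    case_name="7B_load_new_ckpt",
)
```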


@ -0,0 +1,38 @@
#!/bin/bash
set -x
[[ -n ${GITHUB_WORKSPACE} ]] || { echo "should set GITHUB_WORKSPACE first before ci, exit."; exit 1; }
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS40_PATH="$GITHUB_WORKSPACE/llm_ckpts/40"
readonly CKPTS40_OUTPUT="${CKPTS40_PATH}/*.pt"
expected_num=22
exit_code=0
source ./ci_scripts/common/basic_func.sh
echo "start to test slurm training with loading checkpoint."
python ./ci_scripts/train/generate_config.py --case_name $1
file="./ci_scripts/train/$1.py"
if [[ ! -f ${file} ]]; then
echo "expect: ${file} exists, actual: not exist."
exit_code=$(($exit_code + 1))
fi
srun -p ${SLURM_PARTITION} --exclusive --job-name=$2 -n 8 --ntasks-per-node=8 --gpus-per-task=1 python train.py --config ${file}
[[ $? -ne 0 ]] && { echo "test slurm training failed."; exit_code=$(($exit_code + 1)); }
num=$(num_files "${CKPTS40_OUTPUT}")
if [[ ${num} -ne ${expected_num} ]]; then
echo "expect: ${expected_num} files, actual: ${num} files."
exit_code=$(($exit_code + 1))
fi
# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1))
fi
exit $exit_code


@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS20_OUTPUT="${CKPTS20_PATH}/*.pt"
expected_num=21
expected_num=22
exit_code=0
source ./ci_scripts/common/basic_func.sh
@ -28,10 +28,4 @@ if [[ ${num} -ne ${expected_num} ]]; then
exit_code=$(($exit_code + 1))
fi
# clean the test files.
if ! rm -rf ${CKPTS_PATH}/*; then
echo "cleaning cached file in ${CKPTS_PATH} failed."
exit_code=$(($exit_code + 1))
fi
exit $exit_code


@ -0,0 +1,45 @@
{
"7B_basic_train": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"ckpt": {
"checkpoint_every": 20
},
"data": {
"total_steps": 20
}
},
"7B_load_new_ckpt": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"LOAD_CKPT_FOLDER": "local:llm_ckpts/20",
"ckpt": {
"load_ckpt_folder": "local:llm_ckpts/20",
"checkpoint_every": 20
},
"data": {
"total_steps": 40
}
},
"7B_load_preset_ckpt": {
"SEQ_LEN": 1024,
"HIDDEN_SIZE": 2048,
"NUM_ATTENTION_HEAD": 16,
"NUM_LAYER": 16,
"TRAIN_FOLDER":"local:../lm_data/alpaca_data/train/en",
"LOAD_CKPT_FOLDER": "local:../lm_data/alpaca_data/llm_ckpts/20",
"ckpt": {
"load_ckpt_folder": "local:../lm_data/alpaca_data/llm_ckpts/20",
"checkpoint_every": 20
},
"data": {
"total_steps": 40
}
}
}


@ -5,7 +5,7 @@ set -x
readonly CKPTS_PATH="$GITHUB_WORKSPACE/llm_ckpts"
readonly CKPTS20_PATH="$GITHUB_WORKSPACE/llm_ckpts/20"
readonly CKPTS_OUTPUT="${CKPTS20_PATH}/*.pt"
expected_num=21
expected_num=22
exit_code=0
source ./ci_scripts/common/basic_func.sh


@ -75,7 +75,8 @@ grad_scaler = dict(
hybrid_zero_optimizer = dict(
# Enable low_level_optimizer overlap_communication
zero_overlap_communication=True,
overlap_sync_grad=True,
overlap_sync_param=True,
# bucket size for nccl communication params
reduce_bucket_size=512 * 1024 * 1024,
# grad clipping
@ -120,12 +121,11 @@ model = dict(
num_layers=NUM_LAYER,
mlp_ratio=MLP_RATIO,
apply_post_layer_norm=False,
dtype="torch.bfloat16",
dtype="torch.float16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32"
norm_type="rmsnorm",
layer_norm_epsilon=1e-5,
use_flash_attn=True,
num_chunks=1, # if num_chunks > 1, interleaved pipeline scheduler is used.
sequence_parallel=False,
)
"""
zero1 parallel:
@ -142,6 +142,7 @@ tensor parallel: tensor parallel size, usually the number of GPUs per node.
parallel = dict(
zero1=8,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
cudnn_deterministic = False

doc/code-docs/Makefile (new file, 20 lines)

@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

doc/code-docs/make.bat (new file, 35 lines)

@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.https://www.sphinx-doc.org/
exit /b 1
)
if "%1" == "" goto help
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd


@ -0,0 +1,10 @@
Sphinx
sphinx-autobuild
recommonmark
sphinx_rtd_theme
sphinx_markdown_tables
autodoc_pydantic==1.9
enum_tools
numpy
torch
tqdm


@ -0,0 +1,2 @@
Model Checkpointing
===================


@ -0,0 +1,91 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
import os
import sys
project = "InternLM"
copyright = "2023, InternLM Team"
author = "InternLM Team"
release = "v0.2.0"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"recommonmark",
"sphinx_rtd_theme",
"sphinx.ext.viewcode",
"sphinx.ext.autodoc",
"sphinxcontrib.autodoc_pydantic",
"sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon",
]
pygments_style = "sphinx"
# autodoc_pydantic config
autodoc_pydantic_model_show_field_summary = False
autodoc_pydantic_field_signature_prefix = " "
autodoc_pydantic_model_signature_prefix = "class"
autodoc_pydantic_model_show_json = False
autodoc_pydantic_model_show_config_summary = False
autodoc_pydantic_model_show_config_member = False
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_summary_list_order = "bysource"
autodoc_pydantic_model_member_order = "bysource"
autodoc_pydantic_field_list_validators = False
# Napoleon settings
napoleon_google_docstring = True
napoleon_numpy_docstring = True
napoleon_include_init_with_doc = False
napoleon_include_private_with_doc = False
napoleon_include_special_with_doc = True
napoleon_use_admonition_for_examples = False
napoleon_use_admonition_for_notes = False
napoleon_use_admonition_for_references = False
napoleon_use_ivar = False
napoleon_use_param = True
napoleon_use_rtype = True
napoleon_preprocess_types = False
napoleon_type_aliases = None
napoleon_attr_annotations = True
templates_path = ["_templates"]
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
# GitHub integration
html_context = {
"display_github": True,
"github_user": "pjlab",
"github_repo": "InternLM",
"github_version": "master",
"conf_py_path": "/doc/code-docs/source/",
}
sys.path.insert(0, os.path.abspath("../../../"))
# Prepend module names to class descriptions
add_module_names = True
autoclass_content = "class"
autodoc_mock_imports = [
"apex",
"torch",
"numpy",
]


@ -0,0 +1,70 @@
.. InternLM documentation master file, created by
sphinx-quickstart on Mon Aug 28 17:33:28 2023.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
InternLM
========
Environment Setup
-------------------
.. toctree::
:maxdepth: 2
install
Model Setup
-------------------
.. toctree::
:maxdepth: 2
initialize
Training API
-------------------
.. toctree::
:maxdepth: 2
training
Parallel Training
-------------------
.. toctree::
:maxdepth: 2
parallel
Model Checkpointing
-------------------
.. toctree::
:maxdepth: 2
checkpoint
Profiler
-------------------
.. toctree::
:maxdepth: 2
profiler
Monitor
-------------------
.. toctree::
:maxdepth: 2
monitor
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`


@ -0,0 +1,35 @@
Training Setup
==============
.. _InternLM-args:
Argument Parsing
----------------
InternLM uses the `argparse <https://docs.python.org/3/library/argparse.html>`_ library to supply command-line
configuration to the InternLM runtime. Use ``internlm.initialize.get_default_parser()`` to get InternLM's default
parser with some builtin arguments; users can add custom parameters to this parser.
.. code-block:: python
# Get InternLM default parser
parser = internlm.initialize.get_default_parser()
# Add new argument
parser.add_argument("--user_arg", type=int, default=-1, help="arguments add by user.")
cmd_args = parser.parse_args()
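The parsed namespace then carries both the builtin options and any custom ones; a minimal sketch continuing the snippet above (``--user_arg`` is just the illustrative argument added there):

.. code-block:: python

    # Read the custom argument back after parsing
    if cmd_args.user_arg >= 0:
        print(f"user_arg = {cmd_args.user_arg}")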
.. autofunction:: internlm.initialize.get_default_parser
.. _InternLM-init:
Model Initialization
-------------------------
Optimizer Initialization
-------------------------
Dataloader Initialization
-------------------------
Trainer Initialization
-------------------------


@ -0,0 +1,70 @@
## Installation
### Environment Preparation
The required packages and corresponding versions are shown as follows:
- Python == 3.10
- GCC == 10.2.0
- MPFR == 4.1.0
- CUDA >= 11.7
- Pytorch >= 1.13.1
- Transformers >= 4.28.0
- Flash-Attention >= v1.0.5
- Apex == 23.05
- GPU with Ampere or Hopper architecture (such as H100, A100)
- Linux OS
After installing the above dependencies, some system environment variables need to be updated:
```bash
export CUDA_PATH={path_of_cuda_11.7}
export GCC_HOME={path_of_gcc_10.2.0}
export MPFR_HOME={path_of_mpfr_4.1.0}
export LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
export PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
export CC=${GCC_HOME}/bin/gcc
export CXX=${GCC_HOME}/bin/c++
```
### Environment Installation
Clone the project `internlm` and its dependent submodules from the github repository, as follows:
```bash
git clone git@github.com:InternLM/InternLM.git --recurse-submodules
```
It is recommended to build a Python-3.10 virtual environment using conda and install the required dependencies based on the `requirements/` files:
```bash
conda create --name internlm-env python=3.10 -y
conda activate internlm-env
cd internlm
pip install -r requirements/torch.txt
pip install -r requirements/runtime.txt
```
Install flash-attention (version v1.0.5):
```bash
cd ./third_party/flash-attention
python setup.py install
cd ./csrc
cd fused_dense_lib && pip install -v .
cd ../xentropy && pip install -v .
cd ../rotary && pip install -v .
cd ../layer_norm && pip install -v .
cd ../../../../
```
Install Apex (version 23.05):
```bash
cd ./third_party/apex
pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
```


@ -0,0 +1,10 @@
Monitor and Alert
=================
Monitoring
-----------------
Alerting
-----------------


@ -0,0 +1,23 @@
Parallel Training
=================
.. Give an overall description of how the parallel configuration is used, then explain each module in detail
Tensor Parallel
-----------------
Pipeline Parallel
-----------------
Sequence Parallel
-----------------
Data Parallel
-----------------
ZeRO1.5
-----------------


@ -0,0 +1,11 @@
Profiler
========
.. The usage of the torch profiler and memory profiler can be introduced here
Torch Profiler
-----------------
Memory Profiler
-----------------


@ -0,0 +1,2 @@
Training API
============


@ -59,12 +59,28 @@ cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
Users can use the provided dockerfile combined with docker.Makefile to build their own images, or obtain images with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
#### Image Configuration and Build
The configuration and build of the Dockerfile are handled through docker.Makefile. To build the image, execute the following command in the root directory of InternLM:
``` bash
make -f docker.Makefile BASE_OS=centos7
```
In docker.Makefile, you can customize the base image, environment versions, and so on; the corresponding parameters can be passed directly on the command line. BASE_OS supports ubuntu20.04 and centos7.
#### Pull Standard Image
Standard images based on ubuntu and centos have already been built and can be pulled directly:
```bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
# ubuntu20.04
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
# centos7
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
#### Run Container
For a local standard image, whether built with the dockerfile or pulled, use the following command to run and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
```
The default directory in the container is `/InternLM`. Please start training according to the [Usage](./usage.md).


@ -6,11 +6,14 @@ The system code file structure is shown below:
├── internlm # Main directory of the system code
│ ├── apis # Interface module, containing some interface functions related to inference, etc.
│ ├── core # Core module, managing parallel context and training scheduling engine for training and inference
│ │ ├── communication # Communication module, responsible for p2p communication in pipeline parallel scheduling
│ │ ├── context # Context module, mainly responsible for initializing parallel process groups and managing parallel context
│ │ │ ├── parallel_context.py
│ │ │ └── process_group_initializer.py
│ │ ├── scheduler # Scheduling module, which manages schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
│ │ │ ├── no_pipeline_scheduler.py
│ │ │ └── pipeline_scheduler.py
│ │ ├── engine.py # Responsible for managing the training and evaluation process of the model
│ │ ├── no_pipeline_scheduler.py # Scheduler for parallel training
│ │ └── trainer.py # Responsible for managing the training engine and scheduler
│ ├── data # Data module, responsible for managing dataset generation and processing
│ ├── initialize # Initialization module, responsible for managing distributed environment startup and trainer initialization


@ -165,8 +165,9 @@ Training parallel configuration example:
```python
parallel = dict(
zero1=8,
pipeline=1,
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
```
@ -174,8 +175,11 @@ parallel = dict(
- When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range.
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters.
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group.
- pipeline: pipeline parallel size, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default is 1
- pipeline: pipeline parallel strategy
- size: pipeline parallel size, the default value is 1
- interleaved_overlap: bool type; when using interleaved scheduling, enables or disables communication optimization, the default value is False
- sequence_parallel: Whether to enable sequence parallelism, the default value is False
Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`
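As a concrete illustration (the GPU count below is assumed for the example, not taken from the configuration above):

```python
# Hypothetical cluster of 32 GPUs with the parallel settings shown above.
total_gpus = 32
pipeline_size = 1   # parallel["pipeline"]["size"]
tensor_size = 1     # tensor parallel size, defaults to 1
zero1_size = 8      # parallel["zero1"]

data_parallel_size = total_gpus // pipeline_size // tensor_size   # 32
# Optimizer states are sharded across groups of 8 ranks inside the
# 32-way data-parallel group; zero1 must not exceed the dp world size.
assert 1 < zero1_size <= data_parallel_size
```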


@ -59,11 +59,28 @@ cd ../../
```
### Environment Image
Users can obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/sunpengsdu/internlm. The commands for pulling the image and starting the container are as follows:
Users can build their own images using the provided dockerfile together with docker.Makefile, or obtain an image with the InternLM runtime environment installed from https://hub.docker.com/r/internlm/internlm.
#### Image Configuration and Build
The configuration and build of the dockerfile are handled through docker.Makefile. Run the following command in the InternLM root directory to build the image:
``` bash
# pull image
docker pull sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
# start container
docker run --gpus all -d -it --shm-size=2gb --name myinternlm sunpengsdu/internlm:torch1.13-cuda11.7-flashatten1.0.5-centos
docker exec -it myinternlm bash
make -f docker.Makefile BASE_OS=centos7
```
In docker.Makefile you can customize the base image, environment versions, and so on; the corresponding parameters can be passed directly on the command line. BASE_OS supports ubuntu20.04 and centos7.
#### Pull Standard Image
Standard images based on ubuntu and centos have already been built and can be pulled directly:
```bash
# ubuntu20.04
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-ubuntu20.04
# centos7
docker pull internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7
```
#### Run Container
For a local standard image, whether built with the dockerfile or pulled, use the following command to start and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:torch1.13.1-cuda11.7.1-flashatten1.0.5-centos7 bash
```
The default directory in the container is `/InternLM`. Start training according to the [usage documentation](./usage.md).


@ -6,11 +6,14 @@
├── internlm # Main directory of the system code
│ ├── apis # Interface module, containing interface functions related to inference, etc.
│ ├── core # Core module, managing the parallel context and training scheduling engine for training and inference
│ │ ├── communication # Communication module, responsible for p2p communication in pipeline parallel scheduling
│ │ ├── context # Context module, mainly responsible for initializing parallel process groups and managing the parallel context
│ │ │ ├── parallel_context.py
│ │ │ └── process_group_initializer.py
│ │ ├── scheduler # Scheduling module, managing schedulers for parallel training, including non-pipeline and pipeline parallel schedulers
│ │ │ ├── no_pipeline_scheduler.py
│ │ │ └── pipeline_scheduler.py
│ │ ├── engine.py # Responsible for managing the training and evaluation process of the model
│ │ ├── no_pipeline_scheduler.py # Scheduler for parallel training
│ │ └── trainer.py # Responsible for managing the training engine and scheduler
│ ├── data # Data module, responsible for managing dataset generation and processing
│ ├── initialize # Initialization module, responsible for managing distributed environment startup and trainer initialization


@ -151,16 +151,20 @@ model = dict(
```python
parallel = dict(
zero1=8,
pipeline=1,
tensor=1,
pipeline=dict(size=1, interleaved_overlap=True),
sequence_parallel=False,
)
```
- zero1: zero parallel strategy, with the following three cases, default value is -1
- When `size <= 0`, the size of the zero1 process group is equal to the size of the data parallel process group, so the optimizer state parameters will be split within the data parallel range
- When `size == 1`, zero1 is not used, and all data parallel groups retain the complete optimizer state parameters
- When `size > 1` and `size <= data_parallel_world_size`, the zero1 process group is a subset of the data parallel process group
- pipeline: pipeline parallel size, default value is 1
- tensor: tensor parallel size, usually the number of GPUs per node, default value is 1
- pipeline: pipeline parallel strategy
- size: pipeline parallel size, the default value is 1
- interleaved_overlap: bool type; when using interleaved scheduling, enables or disables communication optimization, the default value is False
- sequence_parallel: whether to enable sequence parallelism, the default value is False
Note: `Data parallel size = Total number of GPUs / Pipeline parallel size / Tensor parallel size`

docker.Makefile (new file, 107 lines)

@ -0,0 +1,107 @@
DOCKER_REGISTRY ?= docker.io
DOCKER_ORG ?= my
DOCKER_IMAGE ?= internlm
DOCKER_FULL_NAME = $(DOCKER_REGISTRY)/$(DOCKER_ORG)/$(DOCKER_IMAGE)
CUDA_VERSION = 11.7.1
GCC_VERSION = 10.2.0
CUDNN_VERSION = 8
BASE_RUNTIME =
# ubuntu20.04 centos7
BASE_OS = centos7
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-cudnn$(CUDNN_VERSION)-devel-${BASE_OS}
# The conda channel to use to install cudatoolkit
CUDA_CHANNEL = nvidia
# The conda channel to use to install pytorch / torchvision
INSTALL_CHANNEL ?= pytorch
PYTHON_VERSION ?= 3.10
PYTORCH_VERSION ?= 1.13.1
TORCHVISION_VERSION ?= 0.14.1
TORCHAUDIO_VERSION ?= 0.13.1
BUILD_PROGRESS ?= auto
TRITON_VERSION ?=
GMP_VERSION ?= 6.2.1
MPFR_VERSION ?= 4.1.0
MPC_VERSION ?= 1.2.1
GCC_VERSION ?= 10.2.0
HTTPS_PROXY_I ?=
HTTP_PROXY_I ?=
FLASH_ATTEN_VERSION ?= 1.0.5
FLASH_ATTEN_TAG ?= v${FLASH_ATTEN_VERSION}
BUILD_ARGS = --build-arg BASE_IMAGE=$(BASE_IMAGE) \
--build-arg PYTHON_VERSION=$(PYTHON_VERSION) \
--build-arg CUDA_VERSION=$(CUDA_VERSION) \
--build-arg CUDA_CHANNEL=$(CUDA_CHANNEL) \
--build-arg PYTORCH_VERSION=$(PYTORCH_VERSION) \
--build-arg TORCHVISION_VERSION=$(TORCHVISION_VERSION) \
--build-arg TORCHAUDIO_VERSION=$(TORCHAUDIO_VERSION) \
--build-arg INSTALL_CHANNEL=$(INSTALL_CHANNEL) \
--build-arg TRITON_VERSION=$(TRITON_VERSION) \
--build-arg GMP_VERSION=$(GMP_VERSION) \
--build-arg MPFR_VERSION=$(MPFR_VERSION) \
--build-arg MPC_VERSION=$(MPC_VERSION) \
--build-arg GCC_VERSION=$(GCC_VERSION) \
--build-arg https_proxy=$(HTTPS_PROXY_I) \
--build-arg http_proxy=$(HTTP_PROXY_I) \
--build-arg FLASH_ATTEN_TAG=$(FLASH_ATTEN_TAG)
EXTRA_DOCKER_BUILD_FLAGS ?=
BUILD ?= build
# Intentionally left blank
PLATFORMS_FLAG ?=
PUSH_FLAG ?=
USE_BUILDX ?=1
BUILD_PLATFORMS ?=
WITH_PUSH ?= false
BUILD_TYPE ?= intrenlm-dev
# Setup buildx flags
ifneq ("$(USE_BUILDX)","")
BUILD = buildx build
ifneq ("$(BUILD_PLATFORMS)","")
PLATFORMS_FLAG = --platform="$(BUILD_PLATFORMS)"
endif
endif
# endif
# # Only set platforms flags if using buildx
# ifeq ("$(WITH_PUSH)","true")
# PUSH_FLAG = --push
# endif
# endif
ifeq ($(findstring centos,$(BASE_OS)),centos)
DOCKERFILE_PATH ?= ./docker/Dockerfile-centos
else
DOCKERFILE_PATH ?= ./docker/Dockerfile-ubuntu
endif
#use -f to specify dockerfile
DOCKER_BUILD = DOCKER_BUILDKIT=1 \
docker $(BUILD) \
--progress=$(BUILD_PROGRESS) \
$(EXTRA_DOCKER_BUILD_FLAGS) \
$(PLATFORMS_FLAG) \
$(PUSH_FLAG) \
-f $(DOCKERFILE_PATH) \
-t $(DOCKER_FULL_NAME):$(DOCKER_TAG) \
$(BUILD_ARGS) .
# --target $(BUILD_TYPE)
.PHONY: all
all: devel-image
.PHONY: devel-image
devel-image: BASE_IMAGE := $(BASE_DEVEL)
devel-image: DOCKER_TAG := torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}-flashatten${FLASH_ATTEN_VERSION}-${BASE_OS}
devel-image:
$(DOCKER_BUILD)
.PHONY: clean
clean:
-docker rmi -f $(shell docker images -q $(DOCKER_FULL_NAME))

docker/Dockerfile-centos (new file, 131 lines)

@ -0,0 +1,131 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on centos
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN yum install deltarpm -y && yum update -y \
&& yum install -y \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
bzip2 \
gcc \
gcc-c++ \
file \
texinfo \
which
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& git clone https://github.com/ninja-build/ninja.git \
&& cd ninja \
&& git checkout release \
&& ./configure.py --bootstrap \
&& mv ./ninja /usr/bin \
&& cd ..
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
RUN git submodule update --init --recursive \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
&& cd /InternLM/third_party/flash-attention \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip

docker/Dockerfile-ubuntu (new file, 112 lines)

@ -0,0 +1,112 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on ubuntu
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
ninja-build
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
RUN git submodule update --init --recursive \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/torch.txt \
&& /opt/conda/bin/pip --no-cache-dir install -r requirements/runtime.txt \
&& cd /InternLM/third_party/flash-attention \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip


@ -0,0 +1,161 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on centos
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN yum install deltarpm -y && yum update -y \
&& yum install -y \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
bzip2 \
gcc \
gcc-c++ \
file \
texinfo \
which
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& git clone https://github.com/ninja-build/ninja.git \
&& cd ninja \
&& git checkout release \
&& ./configure.py --bootstrap \
&& mv ./ninja /usr/bin \
&& cd ..
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${MPFR_HOME}/lib:$LD_LIBRARY_PATH
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-threads=posix --disable-checking --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG PYTORCH_VERSION
ARG TORCHVISION_VERSION
ARG TORCHAUDIO_VERSION
RUN /opt/conda/bin/pip --no-cache-dir install \
transformers==4.29.2 \
sentencepiece \
numpy \
tqdm \
psutil \
packaging \
pre-commit \
ninja \
gputil \
pytest \
packaging \
boto3 \
botocore \
torch-scatter \
pyecharts \
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
&& /opt/conda/bin/pip --no-cache-dir install \
--extra-index-url https://download.pytorch.org/whl/cu117 \
torch==${PYTORCH_VERSION}+cu117 \
torchvision==${TORCHVISION_VERSION}+cu117 \
torchaudio==${TORCHAUDIO_VERSION}
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
ARG FLASH_ATTEN_TAG
RUN git submodule update --init --recursive \
&& cd /InternLM/third_party/flash-attention \
&& git checkout ${FLASH_ATTEN_TAG} \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip


@ -0,0 +1,142 @@
ARG BASE_IMAGE
ARG https_proxy
ARG http_proxy
##############################################################################
# Install the basic environment on ubuntu
##############################################################################
FROM ${BASE_IMAGE} as base
ARG https_proxy
ARG http_proxy
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
cmake \
curl \
git \
wget \
tar \
m4 \
ninja-build
##############################################################################
# Install the conda environment
##############################################################################
FROM base as conda
ARG PYTHON_VERSION=3.10
ARG TARGETPLATFORM
ARG https_proxy
ARG http_proxy
RUN case ${TARGETPLATFORM} in \
"linux/arm64") MINICONDA_ARCH=aarch64 ;; \
*) MINICONDA_ARCH=x86_64 ;; \
esac && \
curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"
RUN chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh && \
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython && \
/opt/conda/bin/conda clean -ya
##############################################################################
# Install environment dependencies
##############################################################################
FROM conda as dep
WORKDIR /dep
ARG https_proxy
ARG http_proxy
ARG GCC_VERSION
ARG GMP_VERSION
ARG MPFR_VERSION
ARG MPC_VERSION
RUN wget https://ftp.gnu.org/gnu/gmp/gmp-${GMP_VERSION}.tar.bz2 \
&& tar -vxf gmp-${GMP_VERSION}.tar.bz2 \
&& cd gmp-${GMP_VERSION}/ \
&& ./configure --prefix=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/mpfr/mpfr-${MPFR_VERSION}.tar.gz \
&& tar -vxf mpfr-${MPFR_VERSION}.tar.gz \
&& cd mpfr-${MPFR_VERSION}/ \
&& ./configure --prefix=/usr/local/mpfr-${MPFR_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget http://www.multiprecision.org/downloads/mpc-${MPC_VERSION}.tar.gz \
&& tar -vxf mpc-${MPC_VERSION}.tar.gz \
&& cd mpc-${MPC_VERSION}/ \
&& ./configure --prefix=/usr/local/mpc-${MPC_VERSION} --with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} \
&& make -j64 && make install \
&& cd .. \
&& wget https://ftp.gnu.org/gnu/gcc/gcc-${GCC_VERSION}/gcc-${GCC_VERSION}.tar.xz \
&& tar -vxJf gcc-${GCC_VERSION}.tar.xz \
&& mkdir build \
&& cd build/ \
&& ../gcc-${GCC_VERSION}/configure --prefix=/usr/local/gcc-${GCC_VERSION}/ --enable-checking=release --enable-languages=c,c++ --disable-multilib \
--with-gmp=/usr/local/gmp-${GMP_VERSION} --with-mpfr=/usr/local/mpfr-${MPFR_VERSION} --with-mpc=/usr/local/mpc-${MPC_VERSION} \
&& make -j64 && make install
ENV GCC_HOME=/usr/local/gcc-${GCC_VERSION}
ENV MPFR_HOME=/usr/local/mpfr-${MPFR_VERSION}
ENV LD_LIBRARY_PATH=${GCC_HOME}/lib64:${MPFR_HOME}/lib:${CUDA_PATH}/lib64:$LD_LIBRARY_PATH
ENV PATH=${GCC_HOME}/bin:${CUDA_PATH}/bin:$PATH
ENV CC=${GCC_HOME}/bin/gcc
ENV CXX=${GCC_HOME}/bin/c++
##############################################################################
# Install InternLM development environment, including flash-attention and apex
##############################################################################
FROM dep as intrenlm-dev
COPY . /InternLM
WORKDIR /InternLM
ARG https_proxy
ARG http_proxy
ARG PYTORCH_VERSION
ARG TORCHVISION_VERSION
ARG TORCHAUDIO_VERSION
RUN /opt/conda/bin/pip --no-cache-dir install \
transformers==4.29.2 \
sentencepiece \
numpy \
tqdm \
psutil \
packaging \
pre-commit \
ninja \
gputil \
pytest \
packaging \
boto3 \
botocore \
torch-scatter \
pyecharts \
-f https://data.pyg.org/whl/torch-${PYTORCH_VERSION}+cu117.html \
&& /opt/conda/bin/pip --no-cache-dir install \
--extra-index-url https://download.pytorch.org/whl/cu117 \
torch==${PYTORCH_VERSION}+cu117 \
torchvision==${TORCHVISION_VERSION}+cu117 \
torchaudio==${TORCHAUDIO_VERSION}
ARG https_proxy
ARG http_proxy
ARG TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX"
ARG FLASH_ATTEN_TAG
RUN git submodule update --init --recursive \
&& cd /InternLM/third_party/flash-attention \
&& git checkout ${FLASH_ATTEN_TAG} \
&& /opt/conda/bin/python setup.py install \
&& cd ./csrc \
&& cd fused_dense_lib && /opt/conda/bin/pip install -v . \
&& cd ../xentropy && /opt/conda/bin/pip install -v . \
&& cd ../rotary && /opt/conda/bin/pip install -v . \
&& cd ../layer_norm && /opt/conda/bin/pip install -v . \
&& cd ../../../../ \
&& cd ./third_party/apex \
&& /opt/conda/bin/pip --no-cache-dir install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \
&& /opt/conda/bin/pip cache purge \
&& rm -rf ~/.cache/pip

25
experiment/README-CN.md Normal file
View File

@ -0,0 +1,25 @@
## Experimental Environment Image
This module is used to test new environment versions; by default it tests torch=2.0.1 and flash-attention=2.1.0. The new environment may be unstable; for the standard environment installation, please refer to the [installation guide](../doc/install.md).
### Build and Pull Image
To build the image, run docker.Makefile from the InternLM root directory. The Makefile is shared with the standard environment image, while the Dockerfiles it uses are located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:
```bash
# Build the image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# Pull the image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```
### Run Container
For a local image built with the Dockerfile or pulled from the registry, use the following command to start and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```
The default working directory inside the container is `/InternLM`; start training according to the [usage guide](../doc/usage.md).

25
experiment/README-EN.md Normal file
View File

@ -0,0 +1,25 @@
## Experimental Environment Image
This module is used to test new environment versions; by default it tests torch=2.0.1 and flash-attention=2.1.0. The new environment may be unstable; for the standard environment installation, please refer to the [installation guide](../doc/en/install.md).
### Build and Pull Image
To build the image, run docker.Makefile from the InternLM root directory. The Makefile is shared with the standard environment image, while the Dockerfiles it uses are located in the experiment directory. You can also pull the image directly from https://hub.docker.com/r/internlm/internlm with the following commands:
```bash
# Build Image
# ubuntu20.04
make -f docker.Makefile BASE_OS=ubuntu20.04 DOCKERFILE_PATH=./experiment/Dockerfile-ubuntu PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# centos7
make -f docker.Makefile BASE_OS=centos7 DOCKERFILE_PATH=./experiment/Dockerfile-centos PYTORCH_VERSION=2.0.1 TORCHVISION_VERSION=0.15.2 TORCHAUDIO_VERSION=2.0.2 FLASH_ATTEN_VERSION=2.1.0
# Pull Image
# ubuntu20.04
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-ubuntu20.04
# centos7
docker pull internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7
```
### Run Container
For a local image built with the Dockerfile or pulled from the registry, use the following command to start and enter the container:
```bash
docker run --gpus all -it -m 500g --cap-add=SYS_PTRACE --cap-add=IPC_LOCK --shm-size 20g --network=host --name myinternlm internlm/internlm:experiment-torch2.0.1-flashatten2.1.0-centos7 bash
```
The default working directory inside the container is `/InternLM`; start training according to the [Usage](../doc/en/usage.md).

View File

@ -7,6 +7,7 @@ from .parallel_context import (
from .process_group_initializer import (
Initializer_Data,
Initializer_Model,
Initializer_Nettest,
Initializer_Pipeline,
Initializer_Tensor,
Initializer_Zero1,
@ -34,6 +35,7 @@ __all__ = [
"Initializer_Pipeline",
"Initializer_Data",
"Initializer_Zero1",
"Initializer_Nettest",
"ProcessGroupInitializer",
"Initializer_Model",
"seed",

View File

@ -143,6 +143,7 @@ class ParallelContext(metaclass=SingletonMeta):
self.pipeline_parallel_size = 1
self.tensor_parallel_size = 1
self.zero1_parallel_size = -1
self.nettest_parallel_size = 1
self.num_processes_on_current_node = -1
self.virtual_pipeline_parallel_size = None
self.virtual_pipeline_parallel_rank = None
@ -442,6 +443,9 @@ class ParallelContext(metaclass=SingletonMeta):
# instead, it should be calculated based on other parallel config
self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
# the recommended nettest_parallel_size is 32 GPUs
self.nettest_parallel_size = 32
if self.zero1_parallel_size <= 0:
self.zero1_parallel_size = self.data_parallel_size
@ -454,6 +458,7 @@ class ParallelContext(metaclass=SingletonMeta):
self.pipeline_parallel_size,
self.tensor_parallel_size,
self.zero1_parallel_size,
self.nettest_parallel_size,
]
# run initialization of different process groups
@ -462,6 +467,7 @@ class ParallelContext(metaclass=SingletonMeta):
initializers.append(pgroup_initializer.Initializer_Model(*initializer_args))
initializers.append(pgroup_initializer.Initializer_Tensor(*initializer_args))
initializers.append(pgroup_initializer.Initializer_Zero1(*initializer_args))
initializers.append(pgroup_initializer.Initializer_Nettest(*initializer_args))
if self.pipeline_parallel_size > 1:
initializers.append(pgroup_initializer.Initializer_Pipeline(*initializer_args))
for initializer in initializers:

View File

@ -3,6 +3,7 @@
# adopted from https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context
import math
from abc import ABC, abstractmethod
from enum import Enum
@ -31,6 +32,9 @@ class ParallelMode(Enum):
# zero1 parallel
ZERO1 = "zero1"
# runtime network test
NETTEST = "nettest"
class ProcessGroupInitializer(ABC):
"""An object, knowing the parallelism configuration, that initializes parallel groups.
@ -52,6 +56,7 @@ class ProcessGroupInitializer(ABC):
pipeline_parallel_size: int,
tensor_parallel_size: int,
zero1_parallel_size: int,
nettest_parallel_size: int,
):
self.rank = rank
self.world_size = world_size
@ -59,6 +64,7 @@ class ProcessGroupInitializer(ABC):
self.pipeline_parallel_size = pipeline_parallel_size
self.tensor_parallel_size = tensor_parallel_size
self.zero1_parallel_size = zero1_parallel_size
self.nettest_parallel_size = nettest_parallel_size
super().__init__()
@abstractmethod
@ -332,3 +338,52 @@ class Initializer_Zero1(ProcessGroupInitializer):
ranks_in_group = ranks
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
class Initializer_Nettest(ProcessGroupInitializer):
"""A ProcessGroupInitializer for network test, especailly for NCCL.
Args:
rank (int): The rank of current process.
world_size (int): Size of whole communication world.
nettest_parallel_size (int): Size of a network test group.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.num_nettest_group = math.ceil(self.world_size / self.nettest_parallel_size)
def init_dist_group(self, use_cpu: bool = False):
"""Initialize tensor parallel groups, and assign local_ranks and groups to each gpu.
Returns:
Tuple (local_rank, group_world_size, process_group, ranks_in_group, mode):
A network test group's information tuple.
"""
local_rank = None
ranks_in_group = None
process_group = None
cpu_group = None
group_world_size = None
mode = ParallelMode.NETTEST
for i in range(self.num_nettest_group):
ranks = []
for j in range(self.nettest_parallel_size):
rank = i * self.nettest_parallel_size + j
if rank < self.world_size:
ranks.append(rank)
group = dist.new_group(ranks)
if use_cpu:
group_cpu = dist.new_group(ranks, backend="gloo") if dist.get_backend() != "gloo" else group
else:
group_cpu = None
if self.rank in ranks:
local_rank = ranks.index(self.rank)
group_world_size = len(ranks)
process_group = group
cpu_group = group_cpu
ranks_in_group = ranks
return local_rank, group_world_size, process_group, cpu_group, ranks_in_group, mode
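A minimal standalone sketch of how this initializer partitions ranks into network-test groups (the sizes here are purely illustrative; in training they come from gpc): each chunk of `nettest_parallel_size` consecutive ranks forms one group, and the last group simply holds the remainder.
```python
import math

# Illustrative sizes only.
world_size = 70
nettest_parallel_size = 32

num_nettest_group = math.ceil(world_size / nettest_parallel_size)  # -> 3
for i in range(num_nettest_group):
    ranks = [
        i * nettest_parallel_size + j
        for j in range(nettest_parallel_size)
        if i * nettest_parallel_size + j < world_size
    ]
    print(f"nettest group {i}: ranks {ranks[0]}..{ranks[-1]} ({len(ranks)} ranks)")
# nettest group 0: ranks 0..31 (32 ranks)
# nettest group 1: ranks 32..63 (32 ranks)
# nettest group 2: ranks 64..69 (6 ranks)
```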

View File

@ -30,7 +30,7 @@ def get_tensor_shape():
if hasattr(gpc.config, "SEQ_LEN") and hasattr(gpc.config.data, "micro_bsz") and hasattr(gpc.config, "HIDDEN_SIZE"):
if gpc.config.model.use_flash_attn:
if gpc.config.model.sequence_parallel:
if gpc.config.parallel.sequence_parallel:
sequence_world_size = gpc.get_world_size(ParallelMode.TENSOR)
tensor_shape = (
gpc.config.SEQ_LEN * gpc.config.data["micro_bsz"] // sequence_world_size,
@ -140,7 +140,7 @@ class PipelineScheduler(BaseScheduler):
and gpc.get_world_size(ParallelMode.TENSOR) > 1
)
if gpc.config.model.sequence_parallel:
if gpc.config.parallel.sequence_parallel:
self.scatter_gather_tensors = False
# cache for the batch data

View File

@ -38,6 +38,11 @@ class TrainState:
# Total step count
self.total_steps: int = config.data.total_steps
# resume tensorboard folder, need load from checkpoint or set manually.
self.resume_tb_folder = config.resume_tb_folder
self.tensorboard_folder = config.tensorboard_folder
def init_batch_sampler(self, train_dl):
# Copy of the batch sampler from the DataLoader
self.batch_sampler = train_dl.batch_sampler.copy()
@ -73,9 +78,13 @@ class TrainState:
self.step_count = other_stuffs.get("step_count", other_stuffs["batch_count"]) + 1
# track the actual updates of sampler when using weighted sampling
if hasattr(self, "batch_sampler"):
self.batch_sampler = train_dl.batch_sampler.copy()
self.batch_sampler_iter = iter(self.batch_sampler)
# resume tensorboard from older tensorboard_folder
self.resume_tb_folder = other_stuffs.get("tensorboard_folder", None)
def state_dict(self):
return {
"batch_count": self.batch_count,
@ -83,6 +92,7 @@ class TrainState:
"num_consumed_tokens": self.num_consumed_tokens,
"inf_nan_skip_batches": self.inf_nan_skip_batches,
"step_count": self.step_count,
"tensorboard_folder": self.tensorboard_folder,
}
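A small mock of the round trip added above (a plain dict stands in for the full `TrainState`, whose constructor needs the training config): the folder used by the current run is written out by `state_dict()` and, on resume, read back as the folder to continue logging into.
```python
# Hypothetical saved state; only the key relevant to tensorboard resume is shown.
saved_state = {"tensorboard_folder": "tb_logs/run_2023_09_05"}

# Mirrors the load path above: the previously used folder becomes resume_tb_folder,
# so the resumed run appends events to the same location.
resume_tb_folder = saved_state.get("tensorboard_folder", None)
print(resume_tb_folder)  # tb_logs/run_2023_09_05
```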

View File

@ -5,7 +5,7 @@ import torch
from internlm.core.context import global_context as gpc
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1}
DATASET_TYPE_IDS_MAP = {"en": 0, "cn": 1, "code": 2}
def get_dataset_type_id(path):

View File

@ -1,9 +1,15 @@
from .initialize_trainer import initialize_trainer
from .launch import get_default_parser, launch_from_slurm, launch_from_torch
from .launch import (
get_default_parser,
initialize_distributed_env,
launch_from_slurm,
launch_from_torch,
)
__all__ = [
"get_default_parser",
"initialize_trainer",
"launch_from_slurm",
"launch_from_torch",
"initialize_distributed_env",
]

View File

@ -3,16 +3,15 @@
import math
import torch
from torch import Tensor, nn
def scaled_init_method_normal(sigma, num_layers):
def scaled_init_method_normal(sigma: float = 1.0, num_layers: int = 1):
"""Init method based on N(0, sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
def init_(tensor):
return torch.nn.init.normal_(tensor, mean=0.0, std=std)
return nn.init.normal_(tensor, mean=0.0, std=std)
return init_
@ -32,3 +31,33 @@ def normal_(mean: float = 0.0, std: float = 1.0):
return nn.init.normal_(tensor, mean, std)
return initializer
def scaled_init_method_uniform(sigma: float = 1.0, num_layers: int = 1):
"""Init method based on p(x)=Uniform(-a, a) where std(x)=sigma/sqrt(2*num_layers)."""
std = sigma / math.sqrt(2.0 * num_layers)
a = math.sqrt(3.0) * std
def init_(tensor):
return nn.init.uniform_(tensor, -a, a)
return init_
def uniform_(mean: float = 0.0, std: float = 1.0):
r"""Return the initializer filling the input Tensor with values drawn from the uniform distribution
.. math::
\mathcal{U}(mean-a, mean+a), where a satisfies \mathcal{U}_{std}=std.
Args:
mean (float): the mean of the uniform distribution. Defaults 0.0.
std (float): the standard deviation of the uniform distribution. Defaults 1.0.
"""
a = math.sqrt(3.0) * std
def initializer(tensor: Tensor):
return nn.init.uniform_(tensor, mean - a, mean + a)
return initializer
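For reference, the bound used in both uniform initializers follows from the standard variance of a uniform distribution (a textbook identity, not code from this repo):
```latex
\operatorname{Var}\!\left[\,\mathcal{U}(-a,\,a)\,\right] = \frac{(2a)^2}{12} = \frac{a^2}{3}
\;\;\Longrightarrow\;\;
\operatorname{std} = \frac{a}{\sqrt{3}}
\;\;\Longrightarrow\;\;
a = \sqrt{3}\cdot\operatorname{std}.
```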

View File

@ -10,6 +10,7 @@ import torch
from internlm.core.context import Config
from internlm.core.context import global_context as gpc
from internlm.utils.common import get_master_node
from internlm.utils.logger import get_logger
from internlm.utils.storage_manager import init_storage_manager
@ -108,67 +109,100 @@ def args_sanity_check():
logger.info(f"valid_every: {data.valid_every}")
# processing the checkpoint config
if "enable_save_ckpt" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("enable_save_ckpt", False)
ckpt = gpc.config.ckpt
if "enable_save_ckpt" not in ckpt:
ckpt._add_item("enable_save_ckpt", False)
if "checkpoint_every" not in gpc.config.ckpt or gpc.config.ckpt.checkpoint_every <= 0:
gpc.config.ckpt._add_item("checkpoint_every", float("inf"))
# Saving checkpoint args.
if ckpt.enable_save_ckpt:
assert "checkpoint_every" in ckpt, "If enable save checkpoint, must give checkpoint_every in config.data!"
assert ckpt.checkpoint_every > 0
assert "save_ckpt_folder" in ckpt, "If enable save checkpoint, must give save_ckpt_folder in config.data!"
if "load_optimizer" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_optimizer", True)
if "save_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("save_ckpt_folder", None)
if "load_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_ckpt_folder", None)
if "load_model_only_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("load_model_only_folder", None)
if "async_upload" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload", False)
if "async_upload_tmp_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
if gpc.config.ckpt.async_upload:
assert "save_ckpt_folder" in gpc.config.ckpt
if "boto3:" not in gpc.config.ckpt.save_ckpt_folder:
if "async_upload" not in ckpt:
ckpt._add_item("async_upload", False) # async defalut is False.
else:
if ckpt.async_upload:
assert "save_ckpt_folder" in ckpt
if "boto3:" not in ckpt.save_ckpt_folder:
if gpc.is_rank_for_log():
logger.warning("Storing ckpt on file system does not support asynchronous storage, will use sync save!")
gpc.config.ckpt.async_upload = False
logger.warning(
"Storing ckpt on file system does not support asynchronous storage, will use sync save!"
)
ckpt.async_upload = False
else:
if "async_upload_tmp_folder" not in ckpt:
ckpt._add_item("async_upload_tmp_folder", "/dev/shm/internlm_tmp_ckpt/")
if "snapshot_ckpt_folder" not in gpc.config.ckpt:
gpc.config.ckpt._add_item("snapshot_ckpt_folder", os.path.join(gpc.config.ckpt.save_ckpt_folder, "snapshot"))
if not ckpt.async_upload:
ckpt._add_item("async_upload_tmp_folder", None)
if "oss_snapshot_freq" not in gpc.config.ckpt and gpc.config.ckpt.checkpoint_every != float("inf"):
gpc.config.ckpt._add_item("oss_snapshot_freq", gpc.config.ckpt.checkpoint_every / 2)
assert gpc.config.ckpt.oss_snapshot_freq > 0
if "snapshot_ckpt_folder" not in ckpt:
ckpt._add_item("snapshot_ckpt_folder", os.path.join(ckpt.save_ckpt_folder, "snapshot"))
assert not (
gpc.config.ckpt.load_ckpt_folder is not None and gpc.config.ckpt.load_model_only_folder is not None
), "'load_ckpt_folder' and 'load_model_only_folder' cannot be set at the same time."
if "oss_snapshot_freq" not in ckpt:
ckpt._add_item("oss_snapshot_freq", float("inf")) # if oss_snapshot_freq not given, we disable.
else:
ckpt._add_item("checkpoint_every", float("inf"))
ckpt._add_item("oss_snapshot_freq", float("inf"))
ckpt._add_item("save_ckpt_folder", None)
ckpt._add_item("async_upload", False)
ckpt._add_item("async_upload_tmp_folder", None)
ckpt._add_item("snapshot_ckpt_folder", None)
ckpt._add_item("snapshot_ckpt_folder", None)
# Loading checkpoint args.
if "load_model_only_folder" not in ckpt:
ckpt._add_item("load_model_only_folder", None)
if "load_ckpt_folder" not in ckpt:
ckpt._add_item("load_ckpt_folder", None)
if "load_optimizer" not in ckpt:
ckpt._add_item("load_optimizer", True)
if "stop_file_path" not in ckpt:
ckpt._add_item("stop_file_path", None)
if "load_given_ckpt" not in ckpt:
# If 'load_given_ckpt' is not given, we set it to False, so internlm has the opportunity
# to auto-load the latest checkpoint.
ckpt._add_item("load_given_ckpt", False)
if ckpt.load_given_ckpt:
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
if ckpt.load_ckpt_folder and ckpt.load_model_only_folder:
logger.warning(
"Detect 'load_ckpt_folder' and 'load_model_only_folder' set at the same time, \
and 'load_given_ckpt' is True, so internlm will load from 'load_ckpt_folder'"
)
ckpt.load_model_only_folder = None
if gpc.is_rank_for_log():
logger.info("+" * 15 + " Ckpt Info " + "+" * 15) # pylint: disable=W1201
logger.info(f"is enable save ckpt: {gpc.config.ckpt.enable_save_ckpt}")
logger.info(f"save_ckpt_folder: {gpc.config.ckpt.save_ckpt_folder}")
logger.info(f"checkpoint_every: {gpc.config.ckpt.checkpoint_every}")
logger.info(f"async_upload: {gpc.config.ckpt.async_upload}")
if gpc.config.ckpt.async_upload:
logger.info(f"async_upload_tmp_folder: {gpc.config.ckpt.async_upload_tmp_folder}")
logger.info(f"is enable save ckpt: {ckpt.enable_save_ckpt}")
logger.info(f"save_ckpt_folder: {ckpt.save_ckpt_folder}")
logger.info(f"checkpoint_every: {ckpt.checkpoint_every}")
logger.info(f"load_given_ckpt: {ckpt.load_given_ckpt}")
# initialization storage manager
init_storage_manager(gpc.config.ckpt)
init_storage_manager(ckpt)
# tensorboard writer config
if "enable_tb" not in gpc.config:
gpc.config._add_item("enable_tb", True)
if "tensorboard_folder" not in gpc.config:
gpc.config._add_item("tensorboard_folder", None)
gpc.config._add_item(
"tensorboard_folder", os.environ["tensorboard_folder"] if "tensorboard_folder" in os.environ else None
)
if "resume_tb_folder" not in gpc.config:
gpc.config._add_item("resume_tb_folder", None)
gpc.config._add_item(
"resume_tb_folder", os.environ["resume_tb_folder"] if "resume_tb_folder" in os.environ else None
)
if gpc.is_rank_for_log():
logger.info(f"tensorboard_folder: {gpc.config.tensorboard_folder}")
logger.info(f"resume_tb_folder: {gpc.config.resume_tb_folder}")
# cudnn
torch.backends.cudnn.benchmark = gpc.config.get("cudnn_benchmark", False)
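A hedged example of a `ckpt` config block consistent with the defaults filled in above (field names are taken from the checks in this function; the folder value and frequencies are illustrative only):
```python
# Illustrative ckpt section of a training config; any omitted field falls back
# to the defaults added by args_sanity_check().
ckpt = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",   # hypothetical save location
    checkpoint_every=50,
    oss_snapshot_freq=float("inf"),        # snapshots disabled unless explicitly set
    async_upload=False,                    # file-system saves fall back to sync anyway
    load_given_ckpt=False,                 # allow auto-loading the latest checkpoint
    load_ckpt_folder=None,
    load_model_only_folder=None,
    load_optimizer=True,
    stop_file_path=None,
)
```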
@ -191,10 +225,8 @@ def args_sanity_check():
elif gpc.config.model.dtype in ("torch.float16", "torch.half"):
gpc.config.model.dtype = torch.float16
elif gpc.config.model.dtype == "torch.float32":
assert gpc.config.model.use_flash_attn is False, "when using float32, the use_flash_attn must be False"
gpc.config.model.dtype = torch.float32
elif gpc.config.model.dtype == "torch.tf32":
assert gpc.config.model.use_flash_attn is False, "when using tf32, the use_flash_attn must be False"
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True
gpc.config.model.dtype = torch.float32
@ -236,17 +268,32 @@ def args_sanity_check():
# process the model config
if "use_flash_attn" not in gpc.config.model:
gpc.config.model._add_item("use_flash_attn", True)
if "sequence_parallel" not in gpc.config.model:
gpc.config.model._add_item("sequence_parallel", False)
# process the parallel config
if "sequence_parallel" not in gpc.config.parallel:
gpc.config.parallel._add_item("sequence_parallel", False)
else:
assert not (
gpc.config.model.sequence_parallel is True and gpc.config.model.use_flash_attn is False
gpc.config.parallel.sequence_parallel is True and gpc.config.model.use_flash_attn is False
), "sequence parallel does not support use_flash_attn=False"
# feishu webhook address for alerting
if "alert_address" not in gpc.config:
gpc.config._add_item("alert_address", None)
optim_ckpt = gpc.config.hybrid_zero_optimizer
if "zero_overlap_communication" in optim_ckpt:
# Compatible with the old interfaces.
optim_ckpt._add_item("overlap_sync_grad", optim_ckpt.zero_overlap_communication)
if "overlap_sync_grad" not in optim_ckpt:
optim_ckpt._add_item("overlap_sync_grad", False)
if "overlap_sync_param" not in optim_ckpt:
optim_ckpt._add_item("overlap_sync_param", False)
if gpc.is_rank_for_log():
logger.info(
f"overlap_sync_grad:{optim_ckpt.overlap_sync_grad}, overlap_sync_param:{optim_ckpt.overlap_sync_param}"
)
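A small sketch of the compatibility shim above: an old-style config that only sets `zero_overlap_communication` ends up with the new `overlap_sync_grad`/`overlap_sync_param` keys (a plain dict stands in for the Config object here).
```python
# Old-style optimizer config using the deprecated key.
optim_ckpt = {"zero_overlap_communication": True}

# Equivalent of the compatibility branch in args_sanity_check().
if "zero_overlap_communication" in optim_ckpt:
    optim_ckpt.setdefault("overlap_sync_grad", optim_ckpt["zero_overlap_communication"])
optim_ckpt.setdefault("overlap_sync_grad", False)
optim_ckpt.setdefault("overlap_sync_param", False)

print(optim_ckpt)
# {'zero_overlap_communication': True, 'overlap_sync_grad': True, 'overlap_sync_param': False}
```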
def launch(
config: Union[str, Path, Config, Dict],
@ -293,8 +340,6 @@ def launch(
# init process groups for different parallel modes from config
gpc.init_parallel_groups()
args_sanity_check()
# set cuda device
if torch.cuda.is_available():
# if local rank is not given, calculate automatically
@ -347,7 +392,11 @@ def launch_from_slurm(
)
def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nccl", seed: int = 1024):
def launch_from_torch(
config: Union[str, Path, Config, Dict],
backend: str = "nccl",
seed: int = 1024,
):
"""A wrapper for internlm.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
@ -375,3 +424,38 @@ def launch_from_torch(config: Union[str, Path, Config, Dict], backend: str = "nc
backend=backend,
seed=seed,
)
def initialize_distributed_env(
config: str,
launcher: str = "slurm",
master_port: int = 8888,
seed: int = 1024,
args_check=True,
):
"""
Initialize distributed environment for distributed training.
Args:
config (str): Config file path.
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
master_port (int): The master port for distributed training. 8888 by default.
seed (int, optional): Specified random seed for every process. 1024 by default.
args_check (bool, optional): Whether to run args_sanity_check after initialization. True by default.
"""
torch.cuda.empty_cache()
if launcher == "torch":
launch_from_torch(config=config, seed=seed)
elif launcher == "slurm":
launch_from_slurm(
config=config,
host=get_master_node(),
port=master_port,
seed=seed,
)
else:
assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
if args_check:
args_sanity_check()
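A hedged usage sketch of the new entry point in a training script; the config path is illustrative, and under `launcher="torch"` the rank/world-size environment variables are expected to be set by torchrun.
```python
# Hypothetical train.py entry point.
from internlm.initialize import initialize_distributed_env

if __name__ == "__main__":
    # e.g. torchrun --nproc_per_node=8 train.py        (launcher="torch")
    # or a slurm submission with launcher="slurm" (the default).
    initialize_distributed_env(config="./configs/7B_sft.py", launcher="torch", seed=1024)
```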

View File

@ -7,6 +7,7 @@ import rotary_emb
import torch
import torch.nn.functional as F
from einops import rearrange
from flash_attn.layers.rotary import ApplyRotaryEmb as LegacyApplyRotaryEmb
from flash_attn.layers.rotary import ApplyRotaryEmbQKV_ as LegacyApplyRotaryEmbQKV_
from torch import Tensor, nn
@ -56,7 +57,7 @@ class Embedding1D(nn.Module):
output = gather_forward_split_backward(output_parallel, ParallelMode.TENSOR, dim=-1)
if gpc.config.model.sequence_parallel:
if gpc.config.parallel.sequence_parallel:
output = split_forward_gather_backward(output, ParallelMode.TENSOR, dim=1)
return output
@ -111,6 +112,7 @@ class ApplyRotaryEmbQKV_(torch.autograd.Function):
apply_rotary_emb_qkv_ = ApplyRotaryEmbQKV_.apply
legacy_apply_rotary_embed_qkv = LegacyApplyRotaryEmbQKV_.apply
legacy_apply_rotary_embed = LegacyApplyRotaryEmb.apply
class RotaryEmbedding(torch.nn.Module):
@ -135,15 +137,13 @@ class RotaryEmbedding(torch.nn.Module):
""" """
super().__init__()
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
self.register_buffer("inv_freq", inv_freq)
self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim))
self.scale_base = scale_base
scale = (
self.scale = (
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
if scale_base > 0
else None
)
self.register_buffer("scale", scale)
self._seq_len_cached = 0
self._cos_cached = None
@ -218,3 +218,15 @@ class RotaryEmbedding(torch.nn.Module):
self._cos_k_cached[seqlen_offset:],
self._sin_k_cached[seqlen_offset:],
)
def _single_forward(self, x, indexes=0):
assert self.scale is None
self._update_cos_sin_cache(x, indexes)
x = x[None, ...]
ret = legacy_apply_rotary_embed(x, self._cos_cached[indexes], self._sin_cached[indexes]).squeeze(0)
return ret
def _single_eval_forward(self, x, seqlen_offset=0):
assert self.scale is None
self._update_cos_sin_cache(x, seqlen_offset + x.shape[1])
return legacy_apply_rotary_embed(x, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:])

View File

@ -62,7 +62,7 @@ class ScaleColumnParallelLinear(nn.Linear):
weight,
self.bias,
process_group=self.process_group,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
)
@ -111,7 +111,7 @@ class RewardModelLinear(ScaleColumnParallelLinear):
weight,
self.bias,
process_group=self.process_group,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
)
@ -173,7 +173,7 @@ class FeedForward(nn.Module):
hidden_features,
process_group,
bias,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)
@ -182,7 +182,7 @@ class FeedForward(nn.Module):
hidden_features,
process_group,
bias,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)
@ -191,7 +191,7 @@ class FeedForward(nn.Module):
out_features,
process_group,
bias=bias,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)

View File

@ -176,7 +176,7 @@ class AccPerplex:
res.update(ds_acc)
res.update(ds_tokens)
loss_res = self.loss_with_type_id.get_metric()
loss_res = self.loss_with_type_id.get_metric(reset)
res.update(loss_res)
return res

View File

@ -121,7 +121,7 @@ class PackedFlashBaseLayer1D(nn.Module):
process_group=gpc.get_group(ParallelMode.TENSOR),
bias1=False,
bias2=False,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
checkpoint_lvl=0,
heuristic="auto",
device=device,
@ -294,7 +294,7 @@ class PackedFlashInternLm1D(nn.Module):
max_position_embeddings=-1,
process_group=gpc.get_group(ParallelMode.TENSOR),
padding_idx=None,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
device=device,
dtype=dtype,
)

View File

@ -82,7 +82,7 @@ class MHA(nn.Module):
3 * embed_dim,
process_group,
bias=True,
sequence_parallel=gpc.config.model.sequence_parallel,
sequence_parallel=gpc.config.parallel.sequence_parallel,
**factory_kwargs,
) # according to https://spaces.ac.cn/archives/9577
@ -95,7 +95,11 @@ class MHA(nn.Module):
# output projection always have the bias (for now)
self.out_proj = RowParallelLinearTorch(
embed_dim, embed_dim, process_group, sequence_parallel=gpc.config.model.sequence_parallel, **factory_kwargs
embed_dim,
embed_dim,
process_group,
sequence_parallel=gpc.config.parallel.sequence_parallel,
**factory_kwargs,
)
# need to assign tp attribute so that internlm know it is tensor parallel module
if gpc.get_world_size(ParallelMode.TENSOR) > 1:
@ -128,6 +132,12 @@ class MHA(nn.Module):
qkv = self.rotary_emb(qkv, **kwargs)
if inference_params is None:
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
if qkv.dtype not in [torch.float16, torch.bfloat16]:
qkv = qkv.to(torch.bfloat16)
context = self.inner_attn(qkv).to(x.dtype)
else:
context = self.inner_attn(qkv)
else:
q = qkv[:, :, 0]
@ -160,7 +170,14 @@ class MHA(nn.Module):
kwargs.pop("indexes")
if inference_params is None:
if gpc.config.model.dtype is torch.float32 and gpc.config.model.use_flash_attn:
with torch.cuda.amp.autocast(dtype=torch.bfloat16):
if qkv.dtype not in [torch.float16, torch.bfloat16]:
qkv = qkv.to(torch.bfloat16)
context = self.inner_attn(qkv, **kwargs).to(x.dtype)
else:
context = self.inner_attn(qkv, **kwargs)
else:
raise RuntimeError("Not support this right now")
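The pattern added in both branches above, shown in isolation: flash-attention kernels do not run in fp32, so when the model dtype is float32 the packed qkv is temporarily cast to bfloat16 under autocast and the attention output is cast back to the input dtype. A generic sketch with no InternLM dependencies (on a machine without CUDA the autocast context is simply a no-op):
```python
import torch


def attn_in_bf16(inner_attn, qkv: torch.Tensor) -> torch.Tensor:
    """Run an attention callable in bfloat16 and return the result in qkv's original dtype."""
    orig_dtype = qkv.dtype
    with torch.cuda.amp.autocast(dtype=torch.bfloat16):
        if qkv.dtype not in (torch.float16, torch.bfloat16):
            qkv = qkv.to(torch.bfloat16)
        context = inner_attn(qkv)
    return context.to(orig_dtype)


# Toy usage with an identity "attention": fp32 in, fp32 out.
out = attn_in_bf16(lambda t: t, torch.randn(2, 8, 3, 4, 16))
print(out.dtype)  # torch.float32
```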

View File

@ -3,6 +3,7 @@
import math
from functools import partial
from itertools import product
import torch
import torch.distributed as dist
@ -19,6 +20,7 @@ from internlm.solver.optimizer.store import (
)
from internlm.solver.optimizer.utils import (
DynamicGradScaler,
ParamBcastSyncHandler,
flatten,
get_grad_accumulate_object,
has_inf_or_nan,
@ -87,9 +89,9 @@ class HybridZeroOptimizer(BaseOptimizer):
self,
optimizer: Optimizer,
cpu_offload=False,
overlap_broadcast=False,
grad_scal_cfg: Config = None,
zero_cfg: Config = None,
param_bcast_sync_handler: ParamBcastSyncHandler = None,
):
# DynamicGradScaler related args
if gpc.config.model.dtype is torch.float32:
@ -104,9 +106,10 @@ class HybridZeroOptimizer(BaseOptimizer):
max_scale = grad_scal_cfg.max_scale
# Zero related args
overlap_communication = zero_cfg.zero_overlap_communication
reduce_bucket_size = zero_cfg.reduce_bucket_size
clip_grad_norm = zero_cfg.clip_grad_norm
self._overlap_sync_grad = zero_cfg.overlap_sync_grad
self._overlap_sync_param = zero_cfg.overlap_sync_param
super().__init__(optim=optimizer)
@ -127,7 +130,7 @@ class HybridZeroOptimizer(BaseOptimizer):
self._fp32_flat_param_groups_of_current_rank = dict()
# communication params
self._overlap_communication = overlap_communication
# self._overlap_communication = overlap_communication
self._reduce_bucket_size = reduce_bucket_size
# gradient scaler
@ -158,7 +161,12 @@ class HybridZeroOptimizer(BaseOptimizer):
+ f"zo-{self._zero_local_rank}.pt"
)
self.params_per_rank_id_dict = []
self.overlap_broadcast = overlap_broadcast
self._param_bcast_sync_handler = param_bcast_sync_handler
if self._overlap_sync_param:
assert self._param_bcast_sync_handler is not None
self._broadcast_comm_stream = torch.cuda.Stream()
else:
self._broadcast_comm_stream = torch.cuda.current_stream()
# iterate over the param group in the optimizer
# partition these param groups for data parallel training
@ -228,12 +236,14 @@ class HybridZeroOptimizer(BaseOptimizer):
# initialize communication stream for
# communication-computation overlapping
if self._overlap_communication:
if self._overlap_sync_grad:
self._comm_stream = torch.cuda.Stream()
else:
self._comm_stream = torch.cuda.current_stream()
# reduction hook is only used if overlapping communication
# if it is stage 1 without overlapping, no hook will be attached
if self._overlap_communication:
if self._overlap_sync_grad:
self._attach_reduction_hook()
@property
@ -267,7 +277,9 @@ class HybridZeroOptimizer(BaseOptimizer):
global_id = str(i)
for j in range(len(param.size())):
global_id = "_".join([global_id, str(param.size()[j])])
if self._overlap_sync_param:
rank_to_go = self._param_bcast_sync_handler.get_rank_by_param(param)
else:
rank_to_go = numel_per_rank.index(min(numel_per_rank))
params_per_rank[rank_to_go].append(param)
self.params_per_rank_id_dict[-1][rank_to_go].append(global_id)
@ -299,7 +311,9 @@ class HybridZeroOptimizer(BaseOptimizer):
self._grad_store.add_accumulate_grad_object(accum_grad_obj)
reduction_func = partial(
self._store_and_try_reduce_grads_by_bucket, param=param, reduce_rank=reduce_rank
self._store_and_try_reduce_grads_by_bucket,
param=param,
reduce_rank=reduce_rank,
)
# define hook
@ -384,17 +398,17 @@ class HybridZeroOptimizer(BaseOptimizer):
self._reduce_and_copy(bucket=param_bucket, reduce_rank=reduce_rank)
def _reduce_and_copy(self, bucket: TensorBucket, reduce_rank):
if self._overlap_communication:
stream = self._comm_stream
stream.synchronize()
if self._overlap_sync_grad:
self._comm_stream.synchronize()
self._param_store.clear_grads_of_previous_reduced_params()
else:
stream = torch.cuda.current_stream()
with torch.cuda.stream(stream):
with torch.cuda.stream(self._comm_stream):
flat = bucket.flatten()
reduced_flat = reduce_tensor(
tensor=flat, dtype=self.dtype, dst_rank=reduce_rank, parallel_mode=ParallelMode.DATA
tensor=flat,
dtype=self.dtype,
dst_rank=reduce_rank,
parallel_mode=ParallelMode.DATA,
)
# update the reduced tensor
@ -483,6 +497,7 @@ class HybridZeroOptimizer(BaseOptimizer):
grads = [self.padding_grad]
params = [self.padding_tensor]
norm = 0
if self._clip_grad_norm > 0:
# this norm is before scaling, it will be very large
norm = compute_norm(
@ -507,7 +522,7 @@ class HybridZeroOptimizer(BaseOptimizer):
# if not overlapping communication (no reduction hook is attached)
# we need to manually reduce these gradients
if not self._overlap_communication:
if not self._overlap_sync_grad:
for group_id in range(len(self._fp16_param_groups)):
for param in self._fp16_param_groups[group_id]:
if param.grad is not None:
@ -522,18 +537,21 @@ class HybridZeroOptimizer(BaseOptimizer):
groups_norms.append(self._compute_norm_with_stage(group_id=group_id))
# clear reduced grads
if self._overlap_communication:
if self._overlap_sync_grad:
# grads in the last bucket is reduced
self._comm_stream.synchronize()
self._param_store.clear_grads_of_previous_reduced_params()
# compute norm for gradients in the last bucket
total_norms = []
total_norms = {}
for group_id in range(self.num_param_groups):
total_norms.append(
self._compute_norm_with_stage(
group_id=group_id, last_bucket=True, last_stage=True, previous_norm=groups_norms[group_id]
)
group_name = self.param_groups[group_id]["name"] if "name" in self.param_groups[group_id] else "default"
group_name = f"{group_id}_{group_name}"
total_norms[group_name] = self._compute_norm_with_stage(
group_id=group_id,
last_bucket=True,
last_stage=True,
previous_norm=groups_norms[group_id],
)
timer("sync_grad").start()
@ -552,7 +570,7 @@ class HybridZeroOptimizer(BaseOptimizer):
# found_inf = self._check_overflow()
# Because you may encounter inf when computing norm
if -1 in norms:
if -1 in norms.values():
found_inf = True
loss_scale = float(self.loss_scale.item()) # backup
@ -562,10 +580,13 @@ class HybridZeroOptimizer(BaseOptimizer):
if found_inf:
if gpc.is_rank_for_log():
logger.warning("Overflow occurs, please check it.")
send_alert_message(address=gpc.config.alert_address, message="Overflow occurs, please check it.")
send_alert_message(
address=gpc.config.alert_address,
message="Overflow occurs, please check it.",
)
self._grad_store._averaged_gradients = dict()
self.zero_grad()
return False, None
return False, norms
# copy the grad of fp16 param to fp32 param
single_grad_partition_groups = []
@ -597,15 +618,17 @@ class HybridZeroOptimizer(BaseOptimizer):
# unscale and clip grads
# get the global norm
global_norm_groups = []
global_norm_groups = {}
if self._clip_grad_norm > 0:
for norm in norms:
global_norm_groups.append(norm**0.5)
for group_name, norm in norms.items():
global_norm_groups[group_name] = norm**0.5
# the following operations are performed only on the rank to which parameters are assigned.
if gpc.config.model.dtype is not torch.float32:
if len(single_grad_partition_groups) != 0:
self._unscale_and_clip_grads(single_grad_partition_groups, global_norm_groups, loss_scale)
if len(single_grad_partition_groups) != 0 and self._clip_grad_norm > 0:
self._unscale_and_clip_grads(
single_grad_partition_groups, list(global_norm_groups.values()), loss_scale
)
# update the parameters
timer("step").start()
@ -625,35 +648,42 @@ class HybridZeroOptimizer(BaseOptimizer):
fp32_param = self._fp32_flat_param_groups_of_current_rank[group_id]
fp16_param.data.copy_(fp32_param)
# TODO: support broadcast overlap
self.broadcast_params(overlap=False)
with torch.cuda.stream(self._broadcast_comm_stream):
self.broadcast_params()
timer("step").stop()
# updating gradients may not be needed here, because the sync_params function is used during initialization,
# so synchronization is already maintained
return True, [global_norm / loss_scale for global_norm in global_norm_groups]
for group_name, global_norm in global_norm_groups.items():
global_norm_groups[group_name] = global_norm / loss_scale
return True, global_norm_groups
def broadcast_params(self, overlap=False):
def broadcast_params(self):
handles = []
for group_id in range(self.num_param_groups):
for rank in range(self._zero_world_size):
for rank, group_id in product(range(self._zero_world_size), range(self.num_param_groups)):
# The following operations are performed only on the rank to which parameters are assigned.
if rank not in self.param_group_no_params_ranks[group_id]:
if rank in self.param_group_no_params_ranks[group_id]:
continue
fp16_param = self._param_store.get_flat_fp16_param_by_rank_group(rank=rank, group_id=group_id)
# grank = gpc.get_ranks_in_group(group_type)[rank] # need to convert to the global rank
# assert grank == rank, f"{grank} == {rank}"
g_rank = gpc.get_ranks_in_group(self._broadcast_parallel_mode)[rank]
handle = dist.broadcast(
fp16_param, src=g_rank, group=gpc.get_group(ParallelMode.ZERO1), async_op=True
fp16_param,
src=g_rank,
group=gpc.get_group(ParallelMode.ZERO1),
async_op=True,
)
if self._overlap_sync_param:
self._param_bcast_sync_handler.add_bcast_handle(rank, handle)
else:
handles.append(handle)
if not overlap:
for handle in handles:
handle.wait()
else:
return handles
##################
# FP16 Utilities #
@ -671,7 +701,11 @@ class HybridZeroOptimizer(BaseOptimizer):
if avg_grad is not None and has_inf_or_nan(avg_grad):
self._found_overflow.fill_(1.0)
break
dist.all_reduce(self._found_overflow, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.GLOBAL))
dist.all_reduce(
self._found_overflow,
op=dist.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.GLOBAL),
)
return self._found_overflow.item() > 0

View File

@ -3,15 +3,18 @@
import math
from abc import ABC, abstractmethod
from typing import Dict, Optional
from collections import OrderedDict
from functools import partial
from typing import Any, Dict, Optional, Union
import torch
import torch.distributed as dist
from torch import Tensor
from torch import Tensor, nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.naive_amp import NaiveAMPModel
from internlm.utils.common import get_tensor_norm, move_norm_to_cuda
from internlm.utils.logger import get_logger
from internlm.utils.parallel import is_model_parallel_parameter
@ -60,12 +63,19 @@ def get_grad_accumulate_object(tensor):
def split_half_float_double(tensor_list):
dtypes = ["torch.cuda.HalfTensor", "torch.cuda.FloatTensor", "torch.cuda.DoubleTensor", "torch.cuda.BFloat16Tensor"]
buckets = []
for _, dtype in enumerate(dtypes):
bucket = [t for t in tensor_list if t.type() == dtype]
if bucket:
buckets.append(bucket)
dtype_buckets = {
"torch.cuda.HalfTensor": [],
"torch.cuda.FloatTensor": [],
"torch.cuda.DoubleTensor": [],
"torch.cuda.BFloat16Tensor": [],
}
for t in tensor_list:
dtype = t.type()
if dtype in dtype_buckets:
dtype_buckets[dtype].append(t)
buckets = [bucket for bucket in dtype_buckets.values() if bucket]
return buckets
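A quick illustration of the rewritten bucketing (the bucket order now follows the fixed dtype list rather than order of first appearance). This needs a CUDA device, since `Tensor.type()` must yield the `torch.cuda.*Tensor` strings used as keys above; the import path is assumed to match this diff.
```python
import torch

from internlm.solver.optimizer.utils import split_half_float_double

grads = [
    torch.zeros(2, dtype=torch.float16, device="cuda"),
    torch.zeros(2, dtype=torch.float32, device="cuda"),
    torch.zeros(2, dtype=torch.float16, device="cuda"),
]
buckets = split_half_float_double(grads)
# -> [[fp16 grad, fp16 grad], [fp32 grad]]: two buckets, empty dtypes are skipped.
```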
@ -184,7 +194,10 @@ def calc_l2_norm(grads):
if APEX_AVAILABLE:
dummy_overflow_buf = torch.cuda.IntTensor([0])
norm, _ = multi_tensor_applier(
amp_C.multi_tensor_l2norm, dummy_overflow_buf, [grads], False # no per-parameter norm
amp_C.multi_tensor_l2norm,
dummy_overflow_buf,
[grads],
False, # no per-parameter norm
)
else:
norm, _ = multi_tensor_l2norm_torch(grads, False)
@ -228,7 +241,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
# Take max across all model-parallel GPUs.
if gpc.get_world_size(ParallelMode.MODEL) > 1:
dist.all_reduce(total_norm_cuda, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.MODEL))
dist.all_reduce(
total_norm_cuda,
op=dist.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.MODEL),
)
total_norm = total_norm_cuda[0].item()
else:
tensor_parallel_grads = []
@ -280,7 +297,11 @@ def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, no
# Sum across all model-parallel GPUs.
if gpc.is_initialized(ParallelMode.MODEL):
dist.all_reduce(total_norm, op=dist.ReduceOp.SUM, group=gpc.get_group(ParallelMode.MODEL))
dist.all_reduce(
total_norm,
op=dist.ReduceOp.SUM,
group=gpc.get_group(ParallelMode.MODEL),
)
# This is because we use zero1, so we need to use this reduction.
# TODO: Check zero group to be a subset of dp group.
@ -459,3 +480,90 @@ class DynamicGradScaler(BaseGradScaler):
self._scale = self._scale.fill_(state_dict["_scale"])
self._growth_step = state_dict["_growth_step"]
self._hysteresis_step = state_dict["_hysteresis_step"]
class ParamBcastSyncHandler:
"""
Model partition handler that overlaps parameter broadcast with the forward pass.
"""
def __init__(self, model: Union[nn.Module, nn.ModuleList]) -> None:
self._block_to_param = OrderedDict() # <key: nn.Module> <value: list(param)>
self._param_to_rank = dict() # <key: param> <value: rank)>
self._block_to_rank = dict() # <key: nn.Module> <value: rank)>
self._bcast_handles = dict() # <key: rank> <value: list(bcast handles))>
zero1_size = gpc.get_world_size(ParallelMode.ZERO1)
total_param_num = sum(p.numel() for p in model.parameters())
avg_param_num = total_param_num * 1.0 // zero1_size
# just want to share same for loop for ModuleList and Module
if not isinstance(model, nn.ModuleList):
model = [model]
# record the parameters of each transformer/embedding/head/norm block
for _chunk in model:
if isinstance(_chunk, NaiveAMPModel):
_chunk = _chunk.model
for _, children in _chunk.named_children():
# should be the transformer block definition in modeling_xxx.py
if isinstance(children, nn.ModuleList):
# record the block that a parameter belongs to
for _, block in enumerate(children):
# self._block_to_param[f"{name}.{idx}"] = list(block.parameters())
self._block_to_param[block] = list(block.parameters())
else:
# record the block that a parameter belongs to
# self._block_to_param[name] = list(children.parameters())
self._block_to_param[children] = list(children.parameters())
alloc_num = 0
rank_to_go = 0
# process the parameters in block_to_param sequentially,
# allocate each parameter to a local rank of ParallelMode.ZERO1,
# NOTE that we do NOT consider the following scenarios:
# 1) whether a parameter is trainable;
# 2) parameters may belong to different optimizer groups
for block, params in self._block_to_param.items():
# allocate a model block to a local rank of ParallelMode.ZERO1
self._block_to_rank[block] = [rank_to_go]
for p in params:
alloc_num = alloc_num + p.numel()
# in this case, allocate the param to next rank if possible
if alloc_num > avg_param_num * 1.01 and rank_to_go < zero1_size - 1:
rank_to_go = rank_to_go + 1
alloc_num = 0
self._block_to_rank[block].append(rank_to_go)
# allocate a parameter to a local rank of ParallelMode.ZERO1
self._param_to_rank[p] = rank_to_go
# initialize an empty list for _bcast_handles of each rank
for rank in range(gpc.get_world_size(ParallelMode.ZERO1)):
self._bcast_handles[rank] = []
# register_forward_pre_hook for transformer/embedding/norm/xxx block
self._register_sync_parameters_hook()
def _register_sync_parameters_hook(self) -> None:
def _pre_forward_hook(model: nn.Module, inputs: Any): # pylint: disable=W0613
bcast_handles = []
# gather all required broadcast handles into a list
for rank in self._block_to_rank[model]:
bcast_handles.extend(self._bcast_handles[rank])
# need to clear _bcast_handles since they would be processed later
self._bcast_handles[rank] = []
# wait all required broadcast handles to be completed
for handle in bcast_handles:
handle.wait()
# register_forward_pre_hook for transformer/embedding/norm/xxx block
for block, _ in self._block_to_rank.items():
block.register_forward_pre_hook(partial(_pre_forward_hook))
def get_rank_by_param(self, param) -> int:
return self._param_to_rank[param]
def add_bcast_handle(self, rank, handle) -> None:
self._bcast_handles[rank].append(handle)
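A toy, self-contained version of the pre-forward-hook mechanism this handler relies on: each block waits on "its" pending broadcast handles right before it runs, so parameter broadcasts launched after the optimizer step can overlap with earlier blocks' forward computation. Real handles are `torch.distributed` async work objects; here the pending lists are simply left empty.
```python
from functools import partial

from torch import nn

pending_handles = {0: [], 1: []}  # rank -> list of async broadcast work handles


def _pre_forward_hook(rank: int, module: nn.Module, inputs):  # pylint: disable=W0613
    for handle in pending_handles[rank]:
        handle.wait()  # block until this rank's broadcasts have landed
    pending_handles[rank].clear()


blocks = nn.ModuleList([nn.Linear(4, 4), nn.Linear(4, 4)])
for rank, block in enumerate(blocks):
    block.register_forward_pre_hook(partial(_pre_forward_hook, rank))
```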

View File

@ -0,0 +1,19 @@
from .training_internlm import (
get_train_data_loader,
get_validation_data_loader,
initialize_llm_profile,
initialize_model,
initialize_optimizer,
load_new_batch,
record_current_batch_training_metrics,
)
__all__ = [
"get_train_data_loader",
"get_validation_data_loader",
"initialize_llm_profile",
"initialize_model",
"initialize_optimizer",
"load_new_batch",
"record_current_batch_training_metrics",
]

View File

@ -0,0 +1,422 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import time
from functools import partial
from typing import Callable, Iterable, Union
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import ConcatDataset, DataLoader
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.naive_amp import NaiveAMPModel
from internlm.core.trainer import TrainState
from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
from internlm.data.dataset import get_dataset_dict
from internlm.data.dummy_dataset import RandomDataset
from internlm.data.packed_dataset import (
PackedDataset,
PackedDatasetWithoutCuSeqlen,
get_packed_dataset_without_short_length,
)
from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
from internlm.monitor import set_env_var
from internlm.monitor.monitor import monitor_manager as mm
from internlm.solver.beta2_scheduler import Beta2Scheduler
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.solver.optimizer.utils import ParamBcastSyncHandler
from internlm.utils.common import DummyProfile
from internlm.utils.logger import get_logger
from internlm.utils.megatron_timers import megatron_timer as timer
from internlm.utils.parallel import (
is_no_pp_or_last_stage,
sync_model_param,
sync_model_param_within_tp,
)
from internlm.utils.registry import MODEL_INITIALIZER
logger = get_logger(__file__)
def initialize_model():
"""
Initialize model.
Returns: The neural network model to be trained or evaluated.
"""
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
if isinstance(model, nn.ModuleList):
model = nn.ModuleList(
[
NaiveAMPModel(
model=_m,
output_to_fp32=False,  # manually controlled by the interleaved pipeline scheduler
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
for _m in model
]
)
else:
model = NaiveAMPModel(
model=model,
output_to_fp32=is_no_pp_or_last_stage(),
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
# This sync is very important because the model weights kept by the optimizer are copied
# from the original parameters in memory, so we must make sure the data-parallel sync
# does not make the optimizer's model weights differ from the original parameters.
sync_model_param(model, parallel_mode=ParallelMode.DATA)
# This function is needed to make sure that parameters not split by tensor parallelism
# stay identical across all tensor-parallel ranks.
sync_model_param_within_tp(model)
return model
def initialize_optimizer(model: Union[nn.Module, nn.ModuleList]):
"""
Initialize optimizer.
Args:
model (torch.nn.Module): Your model instance to be trained or evaluated.
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
"""
if gpc.config.hybrid_zero_optimizer.overlap_sync_param:
param_bcast_sync_handler = ParamBcastSyncHandler(model)
else:
param_bcast_sync_handler = None
adam_cfg = gpc.config.adam
naive_optimizer = torch.optim.AdamW(
params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
lr=adam_cfg.lr,
betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
eps=adam_cfg.adam_eps,
)
optimizer = HybridZeroOptimizer(
naive_optimizer,
grad_scal_cfg=gpc.config.grad_scaler,
zero_cfg=gpc.config.hybrid_zero_optimizer,
param_bcast_sync_handler=param_bcast_sync_handler,
)
beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
return optimizer, beta2_scheduler, lr_scheduler
def get_train_data_loader(
num_worker: int = 0, dataset_generate_func: Callable = None, train_sampler=None, train_collate_fn=None
):
"""
Generate and return the training data loader.
Returns: A tuple of (train_dl, dataset_types).
"""
# Get the dataset types
dataset_types = None
dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
data_cfg = gpc.config.data
# Get the sample weight dictionary
train_folder = data_cfg.train_folder
if not train_folder:
train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
if data_cfg.pack_sample_into_one:
train_ds = PackedDatasetWithoutCuSeqlen(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
train_ds = PackedDataset(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
if dataset_generate_func is not None:
train_ds = dataset_generate_func()
else:
train_ds = get_packed_dataset_without_short_length(
folder=data_cfg.train_folder,
packed_length=data_cfg.packed_length,
max_length_per_sample=data_cfg.seq_len,
show_progress=dist.get_rank() == 0,
min_length=data_cfg.min_length,
min_length_dict=data_cfg.get("min_length_dict", {}),
pack_into_one_sample=data_cfg.pack_sample_into_one,
)
if dataset_generate_func is None or not train_folder:
# partition already completed
assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen, ConcatDataset))
# Create the training dataset sampler
train_sampler = StaticBatchSampler(
train_ds.datasets if isinstance(train_ds, ConcatDataset) else [train_ds],
batch_size=data_cfg.micro_num,
rampup_batch_size=data_cfg.rampup_batch_size,
micro_bsz=data_cfg.micro_bsz,
seed=1024,
drop_last=True,
data_rank=gpc.get_local_rank(ParallelMode.DATA),
data_world_size=gpc.get_world_size(ParallelMode.DATA),
)
if dataset_generate_func is None or not train_folder:
train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
# Create the training data loader
train_dl = DataLoader(
dataset=train_ds,
batch_sampler=train_sampler,
num_workers=num_worker,
pin_memory=True,
collate_fn=train_collate_fn,
persistent_workers=num_worker > 0,
)
return train_dl, dataset_types
def get_validation_data_loader(
num_worker: int = 0, dataset_generate_func: Callable = None, val_collate_fn=None, dataloader_func=None
):
"""Generate and return the validation data loader."""
data_cfg = gpc.config.data
if not data_cfg.valid_folder:
val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
else:
if dataset_generate_func is not None:
assert val_collate_fn and dataloader_func is not None
val_ds = dataset_generate_func()
else:
val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
if not isinstance(val_ds, dict):
val_ds = {"val": val_ds}
if val_collate_fn is None or not data_cfg.valid_folder:
val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
val_dls = {}
for val_name, ds in val_ds.items():
if dataloader_func and data_cfg.valid_folder is not None:
val_dls[val_name] = dataloader_func(dataset=ds, collate_fn=val_collate_fn)
if gpc.is_rank_for_log():
logger.info(
f"load validation dataset {val_name} with valid batch size {str(data_cfg.valid_micro_num)} and "
f"{ds.size} Byte samples."
)
else:
# using a larger validation batch size can speed up the evaluation, but it should not be too large,
# otherwise too much data may be dropped
batch_size = min(
data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
)
batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
if batch_size == 0 and gpc.is_rank_for_log():
logger.info(f"skip validate {val_name}.")
continue
val_dls[val_name] = get_dpsampler_dataloader(
ds,
shuffle=False,
num_workers=num_worker,
batch_size=batch_size,
collate_fn=val_collate_fn,
drop_last=True,
) # drop_last=True, otherwise it may cause problems in the last batch
if gpc.is_rank_for_log():
logger.info(
f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
f"samples {str(len(val_dls[val_name]))}."
)
return val_dls
def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
"""
Load and return the new batch data based on training data loader.
Args:
train_dl (torch.utils.data.DataLoader): Dataloader for training.
train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
train_state (TrainState): Current training state.
Returns: A batch data and the updated train_iter.
"""
timer("batch-gen").start()
try:
batch = next(train_iter) # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
if hasattr(train_state, "batch_sampler_iter"):
next(train_state.batch_sampler_iter)
except StopIteration:
train_iter = iter(train_dl)
batch = next(train_iter)
train_state.num_consumed_samples_in_epoch = 0
if hasattr(train_state, "batch_sampler"):
train_state.batch_sampler_iter = iter(train_state.batch_sampler)
next(train_state.batch_sampler_iter)
timer("batch-gen").stop()
if batch[0].get("type_ids", None) is not None:
# if use_flash_attn is False, we need to unpack type_ids
if not gpc.config.model.use_flash_attn:
batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
return batch, train_iter
def initialize_llm_profile(profiling: bool = False, start_time: str = None):
"""Initialize and return the profiler context manager instance."""
if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
llm_profile = torch.profiler.profile
logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
else:
llm_profile = DummyProfile
return llm_profile(
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler(
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
),
with_stack=True,
with_modules=True,
)
def record_current_batch_training_metrics(
get_tflops_func,
logger,
writer,
success_update,
batch_count,
batch,
train_state,
optimizer,
beta2_scheduler,
trainer,
start_time,
loss,
grad_norm,
metric,
update_panel,
):
"""
Print some training metrics of current batch.
"""
set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
if success_update in (0, True):
train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
if is_no_pp_or_last_stage():
acc_perplex = metric.get_metric()
if success_update and gpc.is_rank_for_log():
lr = optimizer.param_groups[0]["lr"]
if hasattr(trainer.engine.optimizer, "grad_scaler"):
scaler = trainer.engine.optimizer.grad_scaler._scale.item()
elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
num_tokens_in_batch = batch[1].nelement()
num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
tk_per_gpu = 0
tk_per_gpu = round(
num_tokens_in_batch
* gpc.get_world_size(ParallelMode.DATA)
/ gpc.get_world_size(ParallelMode.GLOBAL)
/ (time.time() - start_time),
2,
)
tflops = get_tflops_func((time.time() - start_time))
infos = {
"tflops": tflops,
"step": batch_count,
"loss": loss.item(),
"tgs (tokens/gpu/second)": tk_per_gpu,
"lr": lr,
"loss_scale": scaler,
"grad_norm": grad_norm,
}
infos["micro_num"] = len(batch[1])
infos["num_consumed_tokens"] = train_state.num_consumed_tokens
infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
infos["num_samples_in_batch"] = num_samples_in_batch # the number of batches which have the most samples
infos["largest_length"] = max_length_in_batch # the longest input
infos["largest_batch"] = max_samples_in_batch # the batch with the most samples
infos["smallest_batch"] = min_samples_in_batch
infos["adam_beta2"] = beta2_scheduler.get_beta2()
fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
infos["fwd_bwd_time"] = fwd_bwd_time
for key, value in acc_perplex.items():
infos[key] = value
line = ""
for key, value in infos.items():
line += f"{key}={value} "
if isinstance(value, dict):
writer.add_scalars(key=key, value=value, step=train_state.step_count)
else:
writer.add_scalar(key=key, value=value, step=train_state.step_count)
if update_panel:
# metrics shown with dashboard panels
panel_metrics = {
"step": batch_count,
"lr": lr,
"num_consumed_tokens": train_state.num_consumed_tokens,
"loss": loss.item(),
"flops": tflops,
"tgs": tk_per_gpu,
"acc": acc_perplex["acc"],
"perplexity": acc_perplex["perplexity"],
"fwd_bwd_time": fwd_bwd_time,
}
for norm_key, norm_value in grad_norm.items():
panel_metrics[norm_key] = norm_value
logger.info(
"{line}",
line=line,
extra=panel_metrics,
)
else:
logger.info(line)
# if loss spike occurs, send alert info to feishu
mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
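As a quick, illustrative sanity check of the tgs arithmetic above (made-up numbers, not measured values):

# Illustrative only: 8 data-parallel ranks, 64 GPUs in total, 2 s per step.
num_tokens_in_batch = 16 * 4 * 2048          # micro_num * micro_bsz * seq_len = 131072 tokens
dp_world_size, global_world_size = 8, 64
step_time = 2.0
tgs = round(num_tokens_in_batch * dp_world_size / global_world_size / step_time, 2)
print(tgs)                                   # 8192.0 tokens per GPU per second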

View File

@ -52,12 +52,12 @@ def switch_evaluation_pipeline_scheduler(trainer, num_microbatches, tensor_shape
@contextmanager
def switch_sequence_parallel_mode():
prev_mode = gpc.config.model.sequence_parallel
prev_mode = gpc.config.parallel.sequence_parallel
try:
gpc.config.model.sequence_parallel = False
gpc.config.parallel.sequence_parallel = False
yield
finally:
gpc.config.model.sequence_parallel = prev_mode
gpc.config.parallel.sequence_parallel = prev_mode
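The context manager above temporarily disables sequence parallelism for evaluation and restores the previous value afterwards, even if evaluation raises. A self-contained sketch of the same save-toggle-restore idiom, using a plain dict in place of gpc.config.parallel:

from contextlib import contextmanager

config = {"sequence_parallel": True}  # stand-in for gpc.config.parallel

@contextmanager
def switch_off(flag_store, key):
    prev = flag_store[key]
    flag_store[key] = False
    try:
        yield
    finally:
        flag_store[key] = prev  # restored on both normal exit and exceptions

with switch_off(config, "sequence_parallel"):
    assert config["sequence_parallel"] is False
assert config["sequence_parallel"] is True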
def evaluate_on_val_dls(
@ -67,6 +67,7 @@ def evaluate_on_val_dls(
logger,
step_count,
update_panel: bool = False,
streaming: bool = False,
):
with switch_sequence_parallel_mode():
torch.cuda.empty_cache()
@ -75,7 +76,7 @@ def evaluate_on_val_dls(
data_cfg = gpc.config.data
for val_name, val_dl in val_dls.items():
if len(val_dl) == 0 and verbose:
if not streaming and len(val_dl) == 0 and verbose:
logger.info(f"Validation dataset: {val_name} is empty")
continue
@ -91,7 +92,7 @@ def evaluate_on_val_dls(
for val_idx, batch in tqdm(
enumerate(val_dl),
desc="Val.",
total=len(val_dl),
total=len(val_dl) if not streaming else None,
position=1,
disable=not verbose,
leave=False,
@ -135,7 +136,7 @@ def evaluate_on_val_dls(
dist.barrier()
val_res = val_metric.get_metric()
if verbose and len(val_dl) != 0:
if verbose and (streaming or len(val_dl) != 0):
val_loss = val_loss / (val_idx + 1 + 1e-6)
infos = {
"step": step_count,

163
internlm/utils/gputest.py Normal file
View File

@ -0,0 +1,163 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import math
import socket
import torch
import torch.distributed as dist
from flash_attn.modules.mha import FlashSelfAttention, SelfAttention
from torch.utils import benchmark
from internlm.utils.logger import get_logger
try:
import GPUtil
import psutil
except ImportError:
GPUtil, psutil = None, None
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.utils.common import get_current_device
logger = get_logger(__file__)
def benchmark_forward(
test_fn,
*inputs,
repeats=100,
amp=True,
amp_dtype=torch.float16,
**kwinputs,
):
"""Use Pytorch Benchmark on the forward pass of an arbitrary function."""
def amp_wrapper(*inputs, **kwinputs):
with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=amp):
test_fn(*inputs, **kwinputs)
bench_timer = benchmark.Timer(
stmt="test_fn_amp(*inputs, **kwinputs)",
globals={"test_fn_amp": amp_wrapper, "inputs": inputs, "kwinputs": kwinputs},
num_threads=torch.get_num_threads(),
)
used_time = bench_timer.timeit(repeats)
return used_time.mean
def flops(batch, seqlen, headdim, nheads, time_f):
"""Compute the flops value of a GPU with give flashattention function"""
flop = 4 * batch * seqlen**2 * nheads * headdim
return (flop / time_f / 10**12) if not math.isnan(time_f) else 0.0
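A hedged back-of-the-envelope check of the attention-FLOPs formula above, with shapes matching the bench_gpu defaults below and a made-up forward time:

batch, seqlen, nheads, headdim = 2, 1024, 32, 64
time_f = 1e-4                                     # hypothetical 0.1 ms forward pass
flop = 4 * batch * seqlen**2 * nheads * headdim   # ~1.72e10 FLOPs
print(round(flop / time_f / 10**12, 1))           # ~171.8 TFLOPS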
def get_gpu_temperature():
"""Get current GPU temperature."""
try:
gpu_id = torch.cuda.current_device()
except AssertionError:
gpu_id = -1
if GPUtil is not None and gpu_id >= 0:
gpus = GPUtil.getGPUs()
gpu_temperature = gpus[gpu_id].temperature
else:
gpu_temperature = -1
return gpu_temperature
def get_cpu_temperature():
"""Get current CPU temperature."""
if psutil is not None:
cpu_temperature = psutil.sensors_temperatures()["coretemp"][0].current
else:
cpu_temperature = -1
return cpu_temperature
def bench_net():
"""Benchmark nccl performance for slow node detection."""
if gpc.get_world_size(ParallelMode.GLOBAL) <= 1:
return
if gpc.is_rank_for_log():
logger.info("benchmarking network speed ...")
repeats = 100
input_data = torch.randn(
8 * 1024 * 1024,
device=get_current_device(),
dtype=torch.bfloat16,
)
def allreduce_fn(inputs):
dist.all_reduce(inputs, op=torch.distributed.ReduceOp.AVG, group=gpc.get_group(ParallelMode.NETTEST))
bench_timer = benchmark.Timer(
stmt="test_fn_amp(inputs)",
globals={"test_fn_amp": allreduce_fn, "inputs": input_data},
num_threads=torch.get_num_threads(),
)
allreduce_time = bench_timer.timeit(repeats).mean
allreduce_time = allreduce_time * 10**3
allreduce_time_this = allreduce_time
allreduce_time = torch.Tensor([allreduce_time]).to(device=get_current_device())
dist.all_reduce(allreduce_time, group=gpc.get_group(ParallelMode.GLOBAL))
allreduce_time_avg = allreduce_time / gpc.get_world_size(ParallelMode.GLOBAL)
allreduce_time_avg = float(allreduce_time_avg.item())
if allreduce_time_this >= allreduce_time_avg * 1.05:
logger.warning(
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} NCCL test is slower than avg, "
f"Hostname {socket.gethostname()}, "
f"allreduce_time {allreduce_time_this:.2f}, avg {allreduce_time_avg:.2f}, "
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
)
def bench_gpu(use_flash_attn=True):
"""Benchmark single GPU performance for slow node detection."""
if gpc.is_rank_for_log():
logger.info("benchmarking gpu speed ...")
headdim = 64
dim = 2048
batch_size, seqlen = 2, 1024
nheads = dim // headdim
inner_attn = FlashSelfAttention if use_flash_attn else SelfAttention
inner_attn = inner_attn(causal=True, softmax_scale=None, attention_dropout=0)
qkv = torch.randn(
batch_size,
seqlen,
3,
dim // headdim,
headdim,
device=get_current_device(),
dtype=torch.float16,
requires_grad=True,
)
time_f = benchmark_forward(inner_attn, qkv)
speed = flops(batch_size, seqlen, headdim, nheads, time_f)
speed_this = speed
speed = torch.Tensor([speed]).to(device=get_current_device())
dist.all_reduce(speed, group=gpc.get_group(ParallelMode.GLOBAL))
speed_avg = speed / gpc.get_world_size(ParallelMode.GLOBAL)
speed_avg = float(speed_avg.item())
if speed_this <= speed_avg * 0.95:
logger.warning(
f"Rank {gpc.get_local_rank(ParallelMode.GLOBAL)} GPU is slower than avg, "
f"Hostname {socket.gethostname()}, "
f"tflops {speed_this:.2f}, avg {speed_avg:.2f}, "
f"CPU temp {get_cpu_temperature()}, GPU temp { get_gpu_temperature()}"
)

View File

@ -14,18 +14,19 @@ class _Timer:
self.elapsed_ = 0.0
self.started_ = False
self.start_time = time.time()
self.stream = torch.cuda.current_stream()
def start(self):
"""Start the timer."""
assert not self.started_, "timer has already been started"
torch.cuda.synchronize()
self.stream.synchronize()
self.start_time = time.time()
self.started_ = True
def stop(self):
"""Stop the timer."""
assert self.started_, "timer is not started"
torch.cuda.synchronize()
self.stream.synchronize()
self.elapsed_ += time.time() - self.start_time
self.started_ = False
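Synchronizing only the stream captured at construction (instead of the whole device) keeps the timer from blocking on unrelated CUDA streams. A hedged usage sketch of the shared megatron-style timer, assuming a CUDA-capable environment:

from internlm.utils.megatron_timers import megatron_timer as timer

timer("fwd-bwd").start()
# ... run forward and backward on the current CUDA stream ...
timer("fwd-bwd").stop()
elapsed_seconds = timer("fwd-bwd").elapsed()  # wall-clock time measured by this timer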

View File

@ -2,7 +2,9 @@
# -*- encoding: utf-8 -*-
import copy
import fcntl
import os
import socket
import time
from enum import Enum
from typing import Dict
@ -12,6 +14,7 @@ import torch
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.trainer import TrainState
from internlm.monitor import send_alert_message
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.utils.common import get_current_device
from internlm.utils.logger import get_logger
@ -25,8 +28,6 @@ from internlm.utils.storage_manager import (
logger = get_logger(__file__)
quit_signal_handler = None
class CheckpointType(Enum):
NORMAL_CHECKPOINT = 1
@ -167,44 +168,6 @@ def save_optimizer_checkpoint(optim, state_path):
llm_save(os.path.join(state_path, fp), states)
def save_checkpoint(folder, model, optimizer, scheduler, train_state: TrainState, model_config: Dict = None):
"""
Save checkpoint to the given folder path.
"""
start = time.time()
torch.distributed.barrier()
folder = os.path.join(folder, str(train_state.step_count))
logger.info(
f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count} from rank:{gpc.get_global_rank()}..."
)
timer("save-model").start()
save_model_checkpoint(folder=folder, model=model)
timer("save-model").stop()
timer("save-optimizer").start()
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
timer("save-optimizer").stop()
if gpc.is_rank_for_log():
scheduler_states = scheduler.state_dict()
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
sampler_state = train_state.batch_sampler.state_dict()
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
if model_config is not None:
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
torch.distributed.barrier()
if gpc.is_rank_for_log():
timer.log(["save-model", "save-optimizer"], logger=logger)
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
def load_optimizer_checkpoint(folder, optim):
"""Load the optimizer state from the local file system or remote
object storage Service (OSS).
@ -304,19 +267,12 @@ def load_scheduler(ckpt_path: str, lr_scheduler, optimizer, learning_rate, train
logger.info(f"reload load_scheduler:{lr_scheduler}")
class CheckpointSaveManager:
class CheckpointManager:
"""StorageManagerContext"""
def __init__(
self,
ckpt_config,
model,
optimizer,
lr_scheduler,
model_config,
) -> None:
def __init__(self, ckpt_config, model, model_config=None, model_config_file=None, feishu_address=None) -> None:
"""
CheckpointSaveManager is used to decide when to store ckpt. If it is an asynchronous
CheckpointManager is used to decide when to store ckpt. If it is an asynchronous
upload mode, you must call wait_async_upload_finish at the end of the program to wait
for the asynchronous ckpt upload to complete.
@ -332,26 +288,96 @@ class CheckpointSaveManager:
self.save_ckpt_folder = ckpt_config.save_ckpt_folder
self.snapshot_ckpt_folder = ckpt_config.snapshot_ckpt_folder
self.oss_snapshot_freq: int = ckpt_config.oss_snapshot_freq
self.stop_file_path = ckpt_config.stop_file_path
self.load_model_only_folder = ckpt_config.load_model_only_folder
self.feishu_address = feishu_address
self.storage_manager = get_storage_manager()
self.snapshot_counter = 0
self.load_optimizer = gpc.config.ckpt.load_optimizer
self.model = model
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
self.model_config = model_config
self.model_config_file = model_config_file
if self.stop_file_path and gpc.get_global_rank() == 0:
dir_path = os.path.dirname(self.stop_file_path)
if dir_path != "" and not os.path.exists(dir_path):
os.makedirs(dir_path)
with open(self.stop_file_path, "w", encoding="utf-8") as f:
f.write("0")
if ckpt_config.load_given_ckpt is False:
# Priority: load_given_ckpt(True) > latest_checkpoint > load_model_only_folder
latest_ckpt_path = self.query_lastest_ckpt()
if latest_ckpt_path:
self.load_ckpt_folder = latest_ckpt_path
else:
# At this time, we have to load model init weights and train from step 0.
self.load_ckpt_folder = self.load_model_only_folder
else:
self.load_ckpt_folder = ckpt_config.load_ckpt_folder
if gpc.is_rank_for_log():
logger.info(f"load_ckpt_folder will set to :'{self.load_ckpt_folder}'")
if self.stop_file_path is None:
logger.warning("no set stop_file_path, quit_signal_handler is disable")
def quit_signal_handler(self, train_state) -> bool:
"""
Exit signal detection function. If we write the exit step into the 'QUIT_FILE_PATH' file,
all ranks will save ckpt and exit.
A negative integer step means save ckpt only.
A positive integer step means save ckpt and quit.
Args:
train_state (TrainState):
Returns:
bool: whether to quit.
"""
now_break, now_save_ckpt, save_type = False, False, CheckpointType.NORMAL_CHECKPOINT
if self.stop_file_path is None:
return now_break, now_save_ckpt, save_type
with open(self.stop_file_path, "a+", encoding="utf-8") as f:
fcntl.flock(f, fcntl.LOCK_EX)
f.seek(0)
msg = f.read()
fcntl.flock(f, fcntl.LOCK_UN)
action_step = int(msg)
if action_step < 0 and abs(action_step) == train_state.step_count:
now_save_ckpt = True
if action_step > 0 and action_step == train_state.step_count:
now_break, now_save_ckpt = True, True
if action_step != 0 and gpc.is_rank_for_log():
msg = "Stop" if action_step > 0 else "Save"
action_step = abs(action_step)
if train_state.step_count <= action_step:
if self.feishu_address:
send_alert_message(
address=self.feishu_address,
message=f"training will {msg} at step_count {action_step}!\
now step_count is {train_state.step_count}",
)
return now_break, now_save_ckpt, save_type
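A minimal sketch of driving this stop-file protocol from outside the job (hypothetical path, following the sign convention documented above):

# Ask all ranks to save a checkpoint at step 1000 and keep training:
with open("/path/to/llm_quit_file", "w", encoding="utf-8") as f:
    f.write("-1000")

# Ask all ranks to save a checkpoint at step 2000 and then exit:
with open("/path/to/llm_quit_file", "w", encoding="utf-8") as f:
    f.write("2000")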
def try_save_checkpoint(self, train_state):
if not self.enable_save_ckpt:
return
return False
save_ckpts, save_type = False, CheckpointType.NORMAL_CHECKPOINT
if self.oss_snapshot_freq > 1 and train_state.step_count % self.oss_snapshot_freq == 0:
save_ckpts, save_type = True, CheckpointType.SNAPSHOT_CHECKPOINT
if train_state.step_count % self.checkpoint_every == 0:
save_ckpts, save_type = True, CheckpointType.NORMAL_CHECKPOINT
now_break, singal_save_ckpts, singal_save_type = self.quit_signal_handler(train_state)
if save_ckpts is False:
if quit_signal_handler is not None:
save_ckpts, save_type = quit_signal_handler(train_state)
save_ckpts = singal_save_ckpts
save_type = singal_save_type
if save_ckpts:
# Wait for the previous round of asynchronous upload storage to complete.
@ -361,18 +387,247 @@ class CheckpointSaveManager:
self.snapshot_counter = (self.snapshot_counter + 1) % 2
save_ckpt_folder = os.path.join(self.snapshot_ckpt_folder, f"{self.snapshot_counter}")
else:
save_ckpt_folder = self.save_ckpt_folder
save_ckpt_folder = os.path.join(self.save_ckpt_folder, str(train_state.step_count))
save_checkpoint(
self.save_checkpoint(
folder=save_ckpt_folder,
model=self.model,
optimizer=self.optimizer,
scheduler=self.lr_scheduler,
train_state=train_state,
model_config=self.model_config,
model_config_file=self.model_config_file,
)
return now_break
def wait_async_upload_finish(self):
"""wait for all checkpoint uploads to be completed"""
self.storage_manager.wait()
torch.distributed.barrier()
def query_latest_snapshot_step_boto3(self):
"""query_latest_snapshot_step_boto3
Returns:
Tuple(str, int): path and step of the latest ckpt; if none is found, (None, None) is returned.
"""
ckpt_list = self.storage_manager.get_fns(self.save_ckpt_folder)
if len(ckpt_list) == 0:
return None, None
max_normal_step = 0
ckpt_list = list(map(lambda a: int(a.strip("/")) if a.strip("/").isdigit() else 0, ckpt_list))
ckpt_list.sort(reverse=True)
for ckpt in ckpt_list:
fns_list = self.storage_manager.get_fns(os.path.join(self.save_ckpt_folder, str(ckpt)))
for fn in fns_list:
if fn.endswith(".step"):
max_normal_step = ckpt
break
if max_normal_step != 0:
break
max_normal_step = ckpt_list[0]
load_normal_ckpt_path = os.path.join(self.save_ckpt_folder, str(max_normal_step))
snapshot_path_0 = os.path.join(self.save_ckpt_folder, "snapshot", "0")
snapshot_path_1 = os.path.join(self.save_ckpt_folder, "snapshot", "1")
ckpt_list_1 = self.storage_manager.get_fns(snapshot_path_0)
ckpt_list_2 = self.storage_manager.get_fns(snapshot_path_1)
max_step_0, max_step_1 = 0, 0
for ckpt in ckpt_list_1:
ckpt = ckpt.strip("/")
if ckpt.endswith(".step"):
max_step_0 = max(max_step_0, int(ckpt.split(".")[0]))
for ckpt in ckpt_list_2:
ckpt = ckpt.strip("/")
if ckpt.endswith(".step"):
max_step_1 = max(max_step_1, int(ckpt.split(".")[0]))
snap_load_path = snapshot_path_0 if max_step_0 > max_step_1 else snapshot_path_1
snap_step = max(max_step_0, max_step_1)
load_path = snap_load_path if snap_step > max_normal_step else load_normal_ckpt_path
load_step = max(snap_step, max_normal_step)
return load_path, load_step
def query_latest_snapshot_step_local(self):
max_step, max_step_path = 0, None
for root, _, files in os.walk(self.save_ckpt_folder, followlinks=True):
for fn in files:
fn = fn.strip("/")
if fn.endswith(".step"):
# We assume that both normal ckpt and snapshot ckpt will store the '.step' file
# as an integrity flag.
step = int(fn.rsplit(".", maxsplit=1)[0])
if max_step < step:
max_step = step
max_step_path = root
return max_step_path, max_step
def query_lastest_ckpt(self):
latest_checkpoint = None
# Training was automatically restarted by the process, forcing the latest snapshot to be read.
if self.save_ckpt_folder:
if self.save_ckpt_folder.startswith("boto3"):
latest_checkpoint, step = self.query_latest_snapshot_step_boto3()
elif self.save_ckpt_folder.startswith("local"):
latest_checkpoint, step = self.query_latest_snapshot_step_local()
else:
latest_checkpoint, step = None, 0
if latest_checkpoint is not None:
if gpc.is_rank_for_log():
logger.info(f"Found latest ckpt : {latest_checkpoint}, step: {step}")
send_alert_message(
address=self.feishu_address,
message=f"Auto restart resume from ckpt-path: '{latest_checkpoint}', step : {step}",
)
else:
if gpc.is_rank_for_log():
send_alert_message(
address=self.feishu_address,
message=f"Can't find snapshot checkpoint, use default load-ckpt path: {latest_checkpoint}",
)
return latest_checkpoint
def try_load_model(self, current_time=""):
model_load_path = None
if self.load_ckpt_folder and self.load_model_only_folder:
raise ValueError(
"Error, try to use both load_ckpt_folder and load_model_only_folder paths, \
if you only need to load model weights (for example starting an SFT task for the first time), \
set load_model_only_folder path, if you need to resume training from ckpt, \
set load_ckpt_folder or use default value \
(if is the default value, internlm will try to load the latest ckpt from save_ckpt_folder)"
)
if self.load_ckpt_folder:
if gpc.is_rank_for_log():
logger.info(
f"===========Resume training from `{self.load_ckpt_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = self.load_ckpt_folder
elif self.load_model_only_folder:
if gpc.is_rank_for_log():
logger.info(
f"===========Load Model from `{self.load_model_only_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = self.load_model_only_folder
else:
if gpc.is_rank_for_log():
logger.info(
f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
)
# Loading model weights must be done before zero is initialized.
if model_load_path is not None:
load_model_checkpoint(folder=model_load_path, model=self.model)
def try_resume_training(self, lr_scheduler, optimizer, lr, train_state, train_dl):
"""Attempt to restore the training state of the last ckpt.
Args:
lr_scheduler (_LRScheduler): lr_scheduler object.
optimizer (Optimizer): optimizer object.
lr (float): learning rate.
train_state (dict): training states.
train_dl (DataLoader): training dataloader object.
"""
if self.load_ckpt_folder is not None:
# load optimizer states.
if self.load_optimizer:
load_optimizer_checkpoint(self.load_ckpt_folder, optimizer)
# load lr scheduler states.
load_scheduler(self.load_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
# load training states.
load_context(self.load_ckpt_folder, train_dl, train_state)
# load dataloader sampler states.
if hasattr(train_state, "batch_sampler") and not isinstance(
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
):
load_sampler(self.load_ckpt_folder, train_dl.batch_sampler)
if hasattr(train_state, "data_state_dict"):
train_dl.dataset.load_state_dict(
llm_load(os.path.join(self.load_ckpt_folder, "sampler_0.pt")), ckpt_path=self.load_ckpt_folder
)
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
def save_checkpoint(
self,
folder,
model,
optimizer,
scheduler,
train_state: TrainState,
model_config: Dict = None,
model_config_file: str = None,
):
"""
Save checkpoint to the given folder path.
"""
start = time.time()
self.set_save_folder(folder, train_state.step_count)
torch.cuda.synchronize()
torch.distributed.barrier()
if gpc.is_rank_for_log():
logger.info(f"Saving checkpoint to `{folder}` at batch count:{train_state.step_count}...")
timer("save-model").start()
save_model_checkpoint(folder=folder, model=model)
timer("save-model").stop()
timer("save-optimizer").start()
save_optimizer_checkpoint(optim=optimizer, state_path=folder)
timer("save-optimizer").stop()
if (
hasattr(train_state, "data_state_dict")
and gpc.get_local_rank(ParallelMode.TENSOR) == 0
and gpc.get_local_rank(ParallelMode.PIPELINE) == 0
):
llm_save(
os.path.join(folder, f"sampler_{gpc.get_local_rank(ParallelMode.DATA)}.pt"),
saved_obj=train_state.data_state_dict,
)
if gpc.is_rank_for_log():
scheduler_states = scheduler.state_dict()
llm_save(os.path.join(folder, "schedulder.pt"), saved_obj=scheduler_states)
if hasattr(train_state, "batch_sampler") and not isinstance(
train_state.batch_sampler, torch.utils.data.sampler.BatchSampler
):
sampler_state = train_state.batch_sampler.state_dict()
llm_save(os.path.join(folder, "sampler.pt"), saved_obj=sampler_state)
llm_save(os.path.join(folder, "context.pt"), saved_obj=train_state.state_dict())
if model_config is not None:
# Model configuration dictionary.
llm_save(os.path.join(folder, "model_config.pt"), saved_obj=model_config)
if model_config_file is not None:
# The complete training config file content, stored in binary format.
llm_save(os.path.join(folder, "config_file.pt"), saved_obj=model_config_file)
torch.distributed.barrier()
if gpc.is_rank_for_log():
timer.log(["save-model", "save-optimizer"], logger=logger)
logger.info(f"Step: {train_state.step_count}, rank 0 save ckpt use {time.time() - start:.3f} s")
if self.storage_manager.async_mode is False:
llm_save(
os.path.join(folder, f"{train_state.step_count}.step"),
saved_obj=dict({"step": train_state.step_count}),
)
def set_save_folder(self, folder, step):
self.storage_manager.latest_save_folder = folder
self.storage_manager.latest_save_step = step
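Both save paths finish by writing a '{step_count}.step' marker: synchronously at the end of save_checkpoint when async upload is off, or from wait_async_upload_finish once all uploads have completed. A hedged sketch (not part of the codebase) of treating that marker as the integrity flag for a local checkpoint directory:

import os

def is_complete_local_ckpt(folder: str, step: int) -> bool:
    # The '.step' file is written last, so its presence implies the model and
    # optimizer shards for this step were fully saved or uploaded.
    return os.path.exists(os.path.join(folder, f"{step}.step"))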

View File

@ -1,15 +1,13 @@
import os
import time
from collections import OrderedDict
from functools import partial
from functools import partial, reduce
from typing import Any, Dict, List, Tuple
import pyecharts
import torch
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.solver.pipeline_utils import partition_uniform
from internlm.core.naive_amp import NaiveAMPModel
mb = 1024 * 1024
@ -107,6 +105,8 @@ class SimpleMemState:
"""
Update the total memory usage of the model and sub-models.
"""
self._total_mem = self._layer_mem
for stat in self.sub_model_stats.values():
# Update sub-model status first.
stat.update_total_memory()
@ -169,6 +169,39 @@ class SimpleMemState:
return {"name": self.layer_name, "children": children}
class ActivationMemState:
"""
Activation Memory State
"""
def __init__(self, num_chunks: int) -> None:
self._num_chunks = num_chunks
self.inited: List[bool] = [False for _ in range(num_chunks)]
self.states: List[SimpleMemState] = [SimpleMemState(f"activations_{idx}") for idx in range(num_chunks)]
@property
def total_mem(self) -> int:
return sum(state.total_mem for state in self.states)
def dump(self, prefix: str = "") -> str:
return reduce(lambda x, y: x + y, [state.dump(prefix) for state in self.states])
def to_json(self, base: int = 1024 * 1024) -> List:
return [state.to_json(base) for state in self.states]
def _unpack_naive_wrapper(model: torch.nn.Module) -> Tuple[torch.nn.Module, int]:
num_chunks = len(model) if isinstance(model, torch.nn.ModuleList) else 1
if num_chunks > 1:
model = torch.nn.ModuleList([_model.model if isinstance(_model, NaiveAMPModel) else _model for _model in model])
else:
model = model.model if isinstance(model, NaiveAMPModel) else model
return model, num_chunks
class SimpleMemoryProfiler:
"""
A memory profiler for an LLM model.
@ -177,7 +210,7 @@ class SimpleMemoryProfiler:
model (torch.nn.Module): The model to profile.
optimizer (torch.optim.Optimizer): The optimizer used for training the model.
log_file (str): The file to write the memory state information to.
activation_config (List[str], optional): The list of activation layers to track. Defaults to None.
total_steps: number of steps to trace.
"""
def __init__(
@ -186,9 +219,8 @@ class SimpleMemoryProfiler:
optimizer: torch.optim.Optimizer,
log_folder: str,
total_steps: int = 5,
activation_config: List[str] = None,
):
self._model = model
self._model, self._num_model_chunks = _unpack_naive_wrapper(model)
self._optimizer = optimizer
self._log_folder = log_folder
self._remaining_steps = total_steps
@ -197,17 +229,20 @@ class SimpleMemoryProfiler:
self._record_start_time = time.time()
# For activation memory state.
self._activation_config = activation_config
self._activation_mem_inited: bool = False
self._activation_mem: int = 0
self._activation_max_count = 0
self._activation_base_mem: SimpleMemState = SimpleMemState("activations")
self._activation_mem_max: int = 0
self._activation_base_mems = ActivationMemState(self._num_model_chunks)
# Check or create log folder
os.makedirs(self._log_folder, exist_ok=True)
# Register activation memory tracking hooks
self._register_activation_trace_hooks()
if self._num_model_chunks > 1:
for chunk_id in range(self._num_model_chunks):
self._register_activation_trace_hooks(chunk_id, self._model[chunk_id])
else:
self._register_activation_trace_hooks(0, self._model)
# Calculate static parameter cuda memory
self._param_mem_state = SimpleMemState("param_mem")
@ -221,7 +256,7 @@ class SimpleMemoryProfiler:
self._calc_tensor_group_memory(self._os_params_mem_state, list(enumerate(self._optimizer.param_groups)))
# Generate the first memory record
self.point(create=True)
self.point(with_options="params,grads,os_params", create=True)
def point(self, with_options: str = "", create: bool = False) -> None:
"""
@ -272,7 +307,7 @@ class SimpleMemoryProfiler:
if "os_state" in options:
layout_info += "os_state_layout:\n" + self._os_state_mem_state.dump()
if "activation_base" in options:
layout_info += "activation_base_layout:\n" + self._activation_base_mem.dump()
layout_info += "activation_base_layout:\n" + self._activation_base_mems.dump()
# Write memory state information to log file
file_mode = "w" if create else "a"
@ -315,14 +350,14 @@ class SimpleMemoryProfiler:
[self._os_params_mem_state.to_json(), self._os_state_mem_state.to_json()],
"os_memory_sunburst",
)
self._render_sunburst_chart(self._activation_base_mem.to_json()["children"], "activation_memory_sunburst")
self._render_sunburst_chart(self._activation_base_mems.to_json(), "activation_memory_sunburst")
# Generate summary sunburst chart
summary_sunburst_data = [
{"name": "params", "value": self._param_mem_state.total_mem // mb},
{"name": "grads", "value": self._grad_mem_state.total_mem // mb},
{"name": "os_params", "value": self._os_params_mem_state.total_mem // mb},
{"name": "os_state", "value": self._os_state_mem_state.total_mem // mb},
{"name": "activation", "value": self._activation_base_mem.total_mem // mb},
{"name": "activation", "value": self._activation_mem_max // mb},
]
self._render_sunburst_chart(summary_sunburst_data, "summary_sunburst")
@ -337,12 +372,13 @@ class SimpleMemoryProfiler:
{},
{
"r0": "10%",
"r": "40%",
"r": "35%",
"itemStyle": {"borderWidth": 3},
"label": {"align": "left"},
},
{"r0": "40%", "r": "65%", "label": {"align": "left"}},
{"r0": "65%", "r": "80%", "label": {"align": "left"}},
{"r0": "35%", "r": "55%", "label": {"align": "left"}},
{"r0": "55%", "r": "70%", "label": {"align": "left"}},
{"r0": "70%", "r": "80%", "label": {"align": "left"}},
{"r0": "80%", "r": "90%", "label": {"align": "left"}},
{
"r0": "90%",
@ -357,7 +393,14 @@ class SimpleMemoryProfiler:
f"{self._log_folder}/{name}.html"
)
def _inner_activation_trace_hook(self, layer_name: str, model: Any, inputs: Any, output: torch.Tensor) -> None:
def _inner_activation_trace_hook(
self,
chunk_id: int,
layer_name: str,
model: Any,
inputs: Any,
output: torch.Tensor,
) -> None:
"""
Hook function to trace the activation memory usage for an inner layer.
@ -373,13 +416,15 @@ class SimpleMemoryProfiler:
del model, inputs
assert isinstance(output, torch.Tensor), f"Invalid output type: {type(output)}"
if self._stoped or self._activation_mem_inited:
if self._stoped or self._activation_base_mems.inited[chunk_id]:
return
# Delay updating the total_mem of activation_base_mem here, it will be handled in the forward ending hook.
self._activation_base_mem.add(layer_name, output.element_size() * output.nelement(), flush=False)
self._activation_base_mems.states[chunk_id].add(
layer_name, output.element_size() * output.nelement(), flush=False
)
def _activation_trace_hook_forward(self, model: Any, inputs: Any, output: torch.Tensor) -> None:
def _activation_trace_hook_forward(self, chunk_id: int, model: Any, inputs: Any, output: torch.Tensor) -> None:
"""
Hook function to trace the activation memory usage for a forward pass.
@ -398,23 +443,24 @@ class SimpleMemoryProfiler:
return
# Check if the activation memory has been initialized
if self._activation_mem_inited is False:
if self._activation_base_mems.inited[chunk_id] is False:
self._activation_base_mems.inited[chunk_id] = True
# Update the total memory of the activation base memory state
self._activation_base_mem.update_total_memory()
self._activation_base_mems.states[chunk_id].update_total_memory()
# Set with_options to "activation_base" to include activation_base_layout in the memory dump
self._activation_mem_inited = True
with_options = "activation_base"
else:
with_options = ""
# Accumulate activation memory usage for each forward pass
self._activation_mem += self._activation_base_mem.total_mem
# Update activation max count
if self._activation_mem // self._activation_base_mem.total_mem > self._activation_max_count:
self._activation_max_count = self._activation_mem // self._activation_base_mem.total_mem
self._activation_mem += self._activation_base_mems.states[chunk_id].total_mem
if self._activation_mem > self._activation_mem_max:
self._activation_mem_max = self._activation_mem
# Trigger a memory record
self.point()
self.point(with_options)
def _activation_tarce_hook_backward(self, model: Any, inputs: Any, grad_outputs: Any) -> None:
def _activation_tarce_hook_backward(self, chunk_id: int, model: Any, inputs: Any, grad_outputs: Any) -> None:
"""
Hook function to trace the activation memory usage for a backward pass.
@ -432,37 +478,28 @@ class SimpleMemoryProfiler:
return
# Release activation memory usage for each backward pass
self._activation_mem -= self._activation_base_mem.total_mem
self._activation_mem -= self._activation_base_mems.states[chunk_id].total_mem
# Trigger a memory record
self.point()
def _register_activation_trace_hooks(self) -> None:
def _register_activation_trace_hooks(self, chunk_id: int, model_chunk: torch.nn.Module) -> None:
"""
Register activation trace hooks for the model and each submodule in the model.
"""
# Register inner activation trace hooks for each submodule in the model
for layer_name in self._activation_config:
# Register a hook for every activation
model = self._model
sub_models = layer_name.split(".")
# Get the target sub-model
for sub_model_name in sub_models:
try:
model = model.get_submodule(sub_model_name)
except AttributeError:
model = None
break
for layer_name, sub_model in model_chunk.named_modules():
# Register the hook
if model is not None:
model.register_forward_hook(partial(self._inner_activation_trace_hook, layer_name))
if len(sub_model._modules) != 0:
continue # TODO: in some special cases, we may need some additional configuration to correct
sub_model.register_forward_hook(partial(self._inner_activation_trace_hook, chunk_id, layer_name))
# Register a forward hook for the main model to track activation memory usage
self._model.register_forward_hook(self._activation_trace_hook_forward)
model_chunk.register_forward_hook(partial(self._activation_trace_hook_forward, chunk_id))
# Register a backward hook for the main model to release activation memory usage
self._model.register_full_backward_hook(self._activation_tarce_hook_backward)
model_chunk.register_full_backward_hook(partial(self._activation_tarce_hook_backward, chunk_id))
def _calc_tensor_memory(
self, root_stat: SimpleMemState, named_tensors: Dict[str, torch.Tensor], require_grad: bool = False
@ -554,48 +591,6 @@ class SimpleMemoryProfiler:
self._calc_tensor_memory(root_stat, named_tensors)
def build_activation_config(num_layers: int, num_chunks: int = 1) -> List[str]:
# TODO: support interleaved pipeline scheduling.
assert num_chunks == 1, "Only support num_chunks == 1"
if gpc.is_initialized(ParallelMode.PIPELINE):
pipeline_size = gpc.get_world_size(ParallelMode.PIPELINE)
pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
else:
pipeline_size = 1
pipeline_rank = 0
all_parts = partition_uniform(num_layers, pipeline_size, num_chunks)
parts = all_parts[pipeline_rank]
start, end = parts[0]
num_blocks = end - start
block_conf_tmpl = [
"mixer.rotary_emb",
"mixer.Wqkv",
"mixer.inner_attn",
"mixer.inner_cross_attn",
"mixer.out_proj",
# "dropout1", # skip when dropout_selective_checkpoint is True
# "dropout2", # skip when dropout_selective_checkpoint is True
"norm1",
"norm2",
"mlp.w1",
"mlp.w2",
"mlp.w3",
]
block_conf = []
for block_id in range(num_blocks):
block_conf += [f"blocks.{block_id}.{layer}" for layer in block_conf_tmpl]
# We don't need to care about whether the embedding, norm, and head layers exist in the model after partitioning.
# If they don't exist, they will be automatically ignored when registering activation trace hooks.
activation_conf = ["embedding", "norm", "head"] + block_conf
return activation_conf
if __name__ == "__main__":
class SimpleModel(torch.nn.Module):
@ -635,32 +630,39 @@ if __name__ == "__main__":
return output
def _simple_schedule(_num_chunks, _model_chunks, _input) -> torch.Tensor:
if _num_chunks > 1:
_output = _input
for _model_chunk in _model_chunks:
_output = _model_chunk(_output)
else:
_output = _model_chunks(_input)
return _output
# num_chunks config
_num_chunks = 1
# init model and optimizer
_model: torch.nn.Module = SimpleModel()
if _num_chunks > 1:
_chunks = [SimpleModel(skip_layer2=idx % 2 == 0) for idx in range(_num_chunks)]
_model = torch.nn.ModuleList(_chunks).cuda()
else:
_model: torch.nn.Module = SimpleModel().cuda()
_optimizer = torch.optim.Adam(_model.parameters())
# create activation config for simple model layer by layer.
activation_configs = [
# model level 0
"layer1",
"layer2",
"layer3",
# model level 1
"layer2.layer1",
"layer2.layer3",
]
_model.modules()
# init profiler
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler.log", activation_configs)
profiler = SimpleMemoryProfiler(_model, _optimizer, "./test_simple_memory_profiler", total_steps=1)
_optimizer.zero_grad()
x1 = torch.randn((128, 5120))
x2 = torch.randn((128, 5120))
out1 = _model(x1)
out2 = _model(x2)
# inputs
x1 = torch.randn((128, 5120)).cuda()
x2 = torch.randn((128, 5120)).cuda()
# forward
out1 = _simple_schedule(_num_chunks, _model, x1)
out2 = _simple_schedule(_num_chunks, _model, x2)
# backward
out1.mean().backward()
out2.mean().backward()

View File

@ -15,8 +15,6 @@ from asyncio.tasks import ALL_COMPLETED
from datetime import datetime
from typing import Any, Awaitable, Callable, Dict, List, Union
import boto3
import botocore
import torch
import torch.distributed as dist
@ -24,6 +22,13 @@ from internlm.core.context import global_context as gpc
from internlm.utils.common import SingletonMeta
from internlm.utils.logger import get_logger
try:
import boto3
import botocore
except ImportError:
pass
logger = get_logger(__file__)
boto3_url_re = re.compile(r"([^\.]+)\.([\d\.]+)")
@ -234,13 +239,13 @@ class Boto3Client(StorageClient):
"""
paginator = handler.client.get_paginator("list_objects_v2")
pages = paginator.paginate(Bucket=bucket_name, Prefix=fp)
folder_name_list = []
for page in pages:
if "Contents" in page:
for obj in page["Contents"]:
fp: str = obj["Key"]
folder_name_list.append(fp.rsplit("/", maxsplit=1)[1])
return folder_name_list
pth: str = obj["Key"]
folder_name_list.append(pth.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0])
return list(set(folder_name_list))
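The reworked key handling above returns the set of immediate children under the queried prefix instead of bare file names, which is what the snapshot-query logic in CheckpointManager expects. A small illustrative check of that expression with made-up object keys:

fp = "model_ckpt/7B"  # hypothetical queried prefix
keys = [
    "model_ckpt/7B/20/model_tp0_pp0.pt",
    "model_ckpt/7B/20/optimizer_tp0_pp0_zo0.pt",
    "model_ckpt/7B/snapshot/0/20.step",
]
children = {k.split(fp, maxsplit=1)[1].strip("/").split("/", maxsplit=1)[0] for k in keys}
print(sorted(children))  # ['20', 'snapshot']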
@staticmethod
def async_upload_fileobj(handler, bucket_name: str, fp: str, local_nvme_path: str):
@ -391,6 +396,11 @@ class StorageManager(metaclass=SingletonMeta):
self.tmp_local_folder = tmp_local_folder
self.async_mode = async_mode
self.has_warning = False
self._async_loop = None
self._thread_pool = None
self.latest_save_folder = None
self.latest_save_step = 0
self.async_task_peeding = False
if enable_save and self.async_mode:
self._async_loop = asyncio.new_event_loop()
@ -485,6 +495,7 @@ class StorageManager(metaclass=SingletonMeta):
torch.save(saved_obj, f, pickle_protocol=pickle.HIGHEST_PROTOCOL)
self.async_executor(meta.async_upload_fn, *unpack_meta(meta))
os.chmod(tmp_step_file, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO)
self.async_task_peeding = True
else:
meta.client.sync_upload_fileobj(*unpack_meta(meta), *args, saved_obj=saved_obj, **kwargs)
self.upload_count += 1
@ -523,23 +534,22 @@ class StorageManager(metaclass=SingletonMeta):
pass
async def _sync_tasks(self) -> Awaitable[None]:
if not self._async_stack:
return
if self._async_stack:
await asyncio.wait(self._async_stack, return_when=ALL_COMPLETED)
for task in self._async_stack:
count = 0
while self._async_stack:
t = self._async_stack[0]
try:
task.exception()
e = t.exception()
if e:
self._exception_list.append((e, count))
logger.error(f"File:{self._to_be_del_files[count]}, upload failed for {e}")
# raise e
count += 1
self._async_stack.pop(0)
except InvalidStateError:
continue
except Exception as e:
file_id = len(self._exception_list)
self._exception_list.append((e, file_id))
logger.error(f"File: {self._to_be_del_files[file_id]}, " f"upload failed with {e}")
self._async_stack.clear()
# Not finished. https://docs.python.org/3/library/asyncio-task.html#asyncio.Task.exception
pass
def async_executor(self, fn: Callable, *args, **kwargs) -> None:
"""
@ -559,11 +569,14 @@ class StorageManager(metaclass=SingletonMeta):
if not self.async_mode:
return
if not self.async_task_peeding:
return
if self._async_loop:
self._async_loop.run_until_complete(self._sync_tasks())
if self._exception_list:
for file_id, error_msg in self._exception_list:
for error_msg, file_id in self._exception_list:
logger.error(
f"Node:{socket.gethostname()}, Error: Checkpoint {self._to_be_del_files[file_id]} "
f"failed on step {self.upload_count}: {error_msg}"
@ -577,10 +590,16 @@ class StorageManager(metaclass=SingletonMeta):
self._del_tmp_folder()
self._exception_list.clear()
self._to_be_del_files.clear()
self.async_task_peeding = False
if gpc.is_rank_for_log():
logger.info("all async uploads succeeded!")
self.upload_count += 1
if self.async_mode:
self.save(
os.path.join(self.latest_save_folder, f"{self.latest_save_step}.step"),
saved_obj=dict({"step": self.latest_save_step}),
async_upload=False,
)
storage_manager: StorageManager = None

View File

@ -11,10 +11,6 @@ from torch.utils.tensorboard import SummaryWriter
from internlm.core.context import global_context as gpc
def copy_ignore_folder(source_path, target_path):
os.system(f"cp -r {source_path}/* {target_path}/")
def tb_save_run_info(writer, config_lines, global_step=0):
writer.add_text(tag="cmd", text_string=" ".join(sys.argv[:]), global_step=global_step)
lines = []
@ -42,9 +38,21 @@ def init_tb_writer(
tb_folder = tensorboard_folder
if gpc.get_global_rank() == 0:
# If we don't load ckpt, 'resume_tb_folder' is set as the tensorboard
# dir of the last task by 'make_launch_script.sh'.
# If we load ckpt, 'resume_tb_folder' will be overwritten as the
# reloaded 'train_state.resume_tb_folder'.
if resume_tb_folder is not None:
assert len(resume_tb_folder) > 0 and resume_tb_folder != "/"
if not os.path.exists(resume_tb_folder):
logger.error(
f"Can't found resume_tb_folder{resume_tb_folder}, \
please make sure this folder is located at local file system."
)
else:
logger.info(f"Try mv tensorboard logs: {resume_tb_folder} to {tb_folder}... ")
copy_ignore_folder(resume_tb_folder, tb_folder)
os.system(f"cp -r {resume_tb_folder}/* {tb_folder}/")
os.system(f"chmod -R +w {tb_folder}/")
else:
logger.info(f"Login tensorboard logs to: {tb_folder}")
@ -126,6 +134,14 @@ class Writer:
except Exception:
traceback.print_exc()
def add_scalars(self, key, value, step):
try:
assert isinstance(value, dict)
if self.enable_tb and self.tb_writer is not None:
self.tb_writer.add_scalars(main_tag=key, tag_scalar_dict=value, global_step=step)
except Exception:
traceback.print_exc()
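This mirrors SummaryWriter.add_scalars and lets the training loop log dict-valued metrics (such as per-group grad norms) under a single tag. A hedged usage sketch, assuming an already-initialized Writer instance named writer:

# Hypothetical values; the dict is rendered as one multi-line chart in TensorBoard.
writer.add_scalar(key="loss", value=2.31, step=100)
writer.add_scalars(key="grad_norm", value={"default": 1.7, "fp32": 0.9}, step=100)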
def add_text(self, key, value, step):
try:
if self.enable_tb and self.tb_writer is not None:

View File

@ -13,4 +13,4 @@ boto3
botocore
torch-scatter
pyecharts
-f https://data.pyg.org/whl/torch-1.13.0+cu117.html
-f https://data.pyg.org/whl/torch-1.13.1+cu117.html

516
train.py
View File

@ -5,99 +5,48 @@ import socket
import time
import traceback
from functools import partial
from typing import Iterable
import numpy as np
import torch
import torch.distributed as dist
from torch import nn
from torch.utils.data import DataLoader
import internlm
from internlm.core.context import ParallelMode
from internlm.core.context import global_context as gpc
from internlm.core.naive_amp import NaiveAMPModel
from internlm.core.scheduler import SchedulerMetricHook
from internlm.core.trainer import TrainState
from internlm.data.batch_sampler import StaticBatchSampler, get_dpsampler_dataloader
from internlm.data.collaters import jsonl_ds_collate_fn, packed_collate_fn
from internlm.data.dataset import get_dataset_dict
from internlm.data.dummy_dataset import RandomDataset
from internlm.data.packed_dataset import (
PackedDataset,
PackedDatasetWithoutCuSeqlen,
get_packed_dataset_without_short_length,
)
from internlm.data.utils import DATASET_TYPE_IDS_MAP, unpack_data
from internlm.initialize import initialize_distributed_env
from internlm.model.loss import FlashGPTLMLoss
from internlm.model.metrics import AccPerplex
from internlm.monitor import initialize_monitor_manager, send_alert_message, set_env_var
from internlm.monitor import initialize_monitor_manager, send_alert_message
from internlm.monitor.monitor import monitor_manager as mm
from internlm.solver.beta2_scheduler import Beta2Scheduler
from internlm.solver.lr_scheduler import FineTuneCosineAnnealingWarmupLR
from internlm.solver.optimizer import HybridZeroOptimizer
from internlm.train import (
get_train_data_loader,
get_validation_data_loader,
initialize_llm_profile,
initialize_model,
initialize_optimizer,
load_new_batch,
record_current_batch_training_metrics,
)
from internlm.utils.common import (
BatchSkipper,
DummyProfile,
get_master_node,
get_megatron_flops,
launch_time,
parse_args,
)
from internlm.utils.evaluation import evaluate_on_val_dls
from internlm.utils.gputest import bench_gpu, bench_net
from internlm.utils.logger import get_logger, initialize_uniscale_logger
from internlm.utils.megatron_timers import megatron_timer as timer
from internlm.utils.model_checkpoint import (
CheckpointSaveManager,
load_context,
load_model_checkpoint,
load_optimizer_checkpoint,
load_sampler,
load_scheduler,
)
from internlm.utils.parallel import (
get_parallel_log_file_name,
is_no_pp_or_last_stage,
sync_model_param,
sync_model_param_within_tp,
)
from internlm.utils.registry import MODEL_INITIALIZER
from internlm.utils.simple_memory_profiler import (
SimpleMemoryProfiler,
build_activation_config,
)
from internlm.utils.model_checkpoint import CheckpointManager
from internlm.utils.parallel import get_parallel_log_file_name
from internlm.utils.simple_memory_profiler import SimpleMemoryProfiler
from internlm.utils.writer import Writer
# global llm logger
logger = get_logger(__file__)
def initialize_distributed_env(config: str, launcher: str = "slurm", master_port: int = 8888, seed: int = 1024):
"""
Initialize distributed environment for distributed training.
Args:
config (str): Config file path.
launcher (str): Launcher for launching distributed environment, can be slurm or torch. "slurm" by default.
master_port (str): The master port for distributed training. 8888 by default.
seed (int, optional): Specified random seed for every process. 1024 by default.
"""
torch.cuda.empty_cache()
if launcher == "torch":
internlm.launch_from_torch(config=config, seed=seed)
elif launcher == "slurm":
internlm.launch_from_slurm(
config=config,
host=get_master_node(),
port=master_port,
seed=seed,
)
else:
assert launcher in ["slurm", "torch"], "launcher only support slurm or torch"
def initialize_llm_logger(start_time: str):
"""
Initialize customed uniscale logger.
@ -118,357 +67,14 @@ def initialize_llm_logger(start_time: str):
return uniscale_logger
def initialize_model():
"""
Initialize model.
Returns: The neural network model to be trained or evaluated.
"""
model = MODEL_INITIALIZER.get_module(module_name=gpc.config.model_type)(**(gpc.config.model))
if isinstance(model, nn.ModuleList):
model = nn.ModuleList(
[
NaiveAMPModel(
model=_m,
output_to_fp32=False, # manually controlled by interleaved pipleline scheduler
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
for _m in model
]
)
else:
model = NaiveAMPModel(
model=model,
output_to_fp32=is_no_pp_or_last_stage(),
dtype=gpc.config.model.get("dtype", torch.half),
sync_buffer=False,
)
# This sync is very important, cause the model weights kept in optimizer are copied
# from the origin parameters in the memory, so we should make sure the dp sync
# does not influence the model weights in optimizer be different with the origin parameters.
sync_model_param(model, parallel_mode=ParallelMode.DATA)
# This function is needed to make sure parameters that are not splitted by tensor parallelism are
# the same across tensor parallelism.
sync_model_param_within_tp(model)
return model
def get_train_data_loader(num_worker: int = 0):
"""
Generate and return the training data loader.
Returns: A tuple of (train_dl, dataset_types).
"""
# Get the dataset types
dataset_types = None
dataset_types = list(DATASET_TYPE_IDS_MAP.keys())
data_cfg = gpc.config.data
# Get the sample weight dictionary
train_folder = data_cfg.train_folder
if not train_folder:
train_ds = RandomDataset(num_samples=1000000, max_len=data_cfg.seq_len)
if data_cfg.pack_sample_into_one:
train_ds = PackedDatasetWithoutCuSeqlen(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
train_ds = PackedDataset(
train_ds, max_length_per_sample=data_cfg.seq_len, packed_length=data_cfg.packed_length
)
else:
train_ds = get_packed_dataset_without_short_length(
folder=data_cfg.train_folder,
packed_length=data_cfg.packed_length,
max_length_per_sample=data_cfg.seq_len,
show_progress=dist.get_rank() == 0,
min_length=data_cfg.min_length,
min_length_dict=data_cfg.get("min_length_dict", {}),
pack_into_one_sample=data_cfg.pack_sample_into_one,
)
# partition already completed
# assert isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen))
if isinstance(train_ds, (PackedDataset, PackedDatasetWithoutCuSeqlen)):
datasets = [train_ds]
else:
datasets = train_ds.datasets
# Create the training dataset sampler
train_sampler = StaticBatchSampler(
datasets,
batch_size=data_cfg.micro_num,
rampup_batch_size=data_cfg.rampup_batch_size,
micro_bsz=data_cfg.micro_bsz,
seed=1024,
drop_last=True,
data_rank=gpc.get_local_rank(ParallelMode.DATA),
data_world_size=gpc.get_world_size(ParallelMode.DATA),
)
train_collate_fn = partial(packed_collate_fn, packed_length=data_cfg.packed_length)
# Create the training data loader
train_dl = DataLoader(
dataset=train_ds,
batch_sampler=train_sampler,
num_workers=num_worker,
pin_memory=True,
collate_fn=train_collate_fn,
persistent_workers=True,
)
return train_dl, dataset_types
def get_validation_data_loader(num_worker: int = 0):
"""Generate and return the validation data loader."""
data_cfg = gpc.config.data
if not data_cfg.valid_folder:
val_ds = RandomDataset(num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len)
else:
val_ds = get_dataset_dict(folder=data_cfg.valid_folder, split="")
if not isinstance(val_ds, dict):
val_ds = {"val": val_ds}
val_collate_fn = partial(jsonl_ds_collate_fn, max_length_per_sample=data_cfg.seq_len)
val_dls = {}
for val_name, ds in val_ds.items():
# making the batch_size of validate larger can speed up the evaluation, but it should not be too large,
# otherwise too much data may be dropped
batch_size = min(
data_cfg.valid_micro_num * data_cfg.micro_bsz, len(ds) // gpc.get_world_size(ParallelMode.DATA)
)
batch_size = batch_size // data_cfg.micro_bsz * data_cfg.micro_bsz
if batch_size == 0 and gpc.is_rank_for_log():
logger.info(f"skip validate {val_name}.")
continue
val_dls[val_name] = get_dpsampler_dataloader(
ds, shuffle=False, num_workers=num_worker, batch_size=batch_size, collate_fn=val_collate_fn, drop_last=True
) # drop_last=True, otherwise it may cause problems in the last batch
if gpc.is_rank_for_log():
logger.info(
f"load validation dataset {val_name} with valid batch size {str(batch_size)} and "
f"samples {str(len(val_dls[val_name]))}."
)
return val_dls
def load_new_batch(train_dl: DataLoader, train_iter: Iterable, train_state: TrainState):
"""
Load and return the new batch data based on training data loader.
Args:
train_dl (torch.utils.data.DataLoader): Dataloader for training.
train_iter (Iterable): Data iterator from which get a batch of data, obtained by calling iter(dataloader).
train_state (TrainState): Current training state.
Returns: A batch data and the updated train_iter.
"""
timer("batch-gen").start()
try:
batch = next(train_iter) # structure is ({'input_ids': Tensor, 'cu_seqlens': Tensor}, Tensor)
next(train_state.batch_sampler_iter)
except StopIteration:
train_iter = iter(train_dl)
batch = next(train_iter)
train_state.batch_sampler_iter = iter(train_state.batch_sampler)
next(train_state.batch_sampler_iter)
train_state.num_consumed_samples_in_epoch = 0
timer("batch-gen").stop()
if batch[0].get("type_ids", None) is not None:
# if use_flash_attn is False, we need to unpack type_ids
if not gpc.config.model.use_flash_attn:
batch[0]["type_ids"] = unpack_data(batch[0]["type_ids"], batch[0]["cu_seqlens"])
return batch, train_iter
def initialize_optimizer(model: nn.Module):
"""
Initialize optimizer.
Args:
model (torch.nn.Module): Your model instance to be trained or evaluated.
Returns: A tuple of (optimizer, beta2_scheduler, lr_scheduler).
"""
adam_cfg = gpc.config.adam
naive_optimizer = torch.optim.AdamW(
params=[{"params": model.parameters(), "weight_decay": adam_cfg.weight_decay}],
lr=adam_cfg.lr,
betas=(adam_cfg.adam_beta1, adam_cfg.adam_beta2),
eps=adam_cfg.adam_eps,
)
optimizer = HybridZeroOptimizer(
naive_optimizer, grad_scal_cfg=gpc.config.grad_scaler, zero_cfg=gpc.config.hybrid_zero_optimizer
)
beta2_scheduler = Beta2Scheduler(optimizer=naive_optimizer, **gpc.config.beta2_scheduler)
lr_scheduler = FineTuneCosineAnnealingWarmupLR(optimizer, **gpc.config.lr_scheduler)
return optimizer, beta2_scheduler, lr_scheduler
def initialize_llm_profile(profiling: bool = False, start_time: str = None):
"""Initialize and return the profiler context manager instance."""
if profiling and gpc.get_local_rank(ParallelMode.DATA) == 0 and gpc.get_local_rank(ParallelMode.TENSOR) == 0:
llm_profile = torch.profiler.profile
logger.info(f"Do profiling in rank {gpc.get_global_rank()}!")
else:
llm_profile = DummyProfile
return llm_profile(
activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
schedule=torch.profiler.schedule(skip_first=5, wait=1, warmup=1, active=1, repeat=1),
on_trace_ready=torch.profiler.tensorboard_trace_handler(
f"{gpc.config.JOB_NAME}/{start_time}/traces/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}_"
+ f"pp{gpc.get_local_rank(ParallelMode.PIPELINE)}",
),
with_stack=True,
with_modules=True,
)
def record_current_batch_training_metrics(
get_tflops_func,
logger,
writer,
success_update,
batch_count,
batch,
train_state,
optimizer,
beta2_scheduler,
trainer,
start_time,
loss,
grad_norm,
metric,
update_panel,
):
"""
Print some training metrics of current batch.
"""
set_env_var(key="LAST_ACTIVE_TIMESTAMP", value=int(time.time()))
if success_update in (0, True):
train_state.num_consumed_tokens += batch[1].nelement() * gpc.get_world_size(ParallelMode.DATA)
if is_no_pp_or_last_stage():
acc_perplex = metric.get_metric()
if success_update and gpc.is_rank_for_log():
lr = optimizer.param_groups[0]["lr"]
if hasattr(trainer.engine.optimizer, "grad_scaler"):
scaler = trainer.engine.optimizer.grad_scaler._scale.item()
elif hasattr(trainer.engine.optimizer.optim, "grad_scaler"):
scaler = trainer.engine.optimizer.optim.grad_scaler._scale.item()
num_tokens_in_batch = batch[1].nelement()
num_samples_in_batch = sum([len(b) - 1 for b in batch[0]["cu_seqlens"]])
max_length_in_batch = max([(b[1:] - b[:-1]).max().item() for b in batch[0]["cu_seqlens"]])
max_samples_in_batch = max([len(b) - 1 for b in batch[0]["cu_seqlens"]])
min_samples_in_batch = min([len(b) - 1 for b in batch[0]["cu_seqlens"]])
tk_per_gpu = 0
tk_per_gpu = round(
num_tokens_in_batch
* gpc.get_world_size(ParallelMode.DATA)
/ gpc.get_world_size(ParallelMode.GLOBAL)
/ (time.time() - start_time),
2,
)
tflops = get_tflops_func((time.time() - start_time))
infos = {
"tflops": tflops,
"step": batch_count,
"loss": loss.item(),
"tgs (tokens/gpu/second)": tk_per_gpu,
"lr": lr,
"loss_scale": scaler,
"grad_norm": grad_norm,
}
infos["micro_num"] = len(batch[1])
infos["num_consumed_tokens"] = train_state.num_consumed_tokens
infos["inf_nan_skip_batches"] = train_state.inf_nan_skip_batches
infos["num_samples_in_batch"] = num_samples_in_batch # the number of batches which have the most samples
infos["largest_length"] = max_length_in_batch # the longest input
infos["largest_batch"] = max_samples_in_batch # the batch with the most samples
infos["smallest_batch"] = min_samples_in_batch
infos["adam_beta2"] = beta2_scheduler.get_beta2()
fwd_bwd_time = round(timer("fwd-bwd").elapsed(), 2)
infos["fwd_bwd_time"] = fwd_bwd_time
for key, value in acc_perplex.items():
infos[key] = value
line = ""
for key, value in infos.items():
line += f"{key}={value} "
writer.add_scalar(key=key, value=value, step=train_state.step_count)
if update_panel:
logger.info(
line,
extra={
"step": batch_count,
"lr": lr,
"num_consumed_tokens": train_state.num_consumed_tokens,
"grad_norm": grad_norm,
"loss": loss.item(),
"flops": tflops,
"tgs": tk_per_gpu,
"acc": acc_perplex["acc"],
"perplexity": acc_perplex["perplexity"],
"fwd_bwd_time": fwd_bwd_time,
},
)
else:
logger.info(line)
# if loss spike occurs, send alert info to feishu
mm.monitor_loss_spike(alert_address=gpc.config.alert_address, step_count=batch_count, cur_step_loss=loss.item())
def main(args):
# init setting
skip_batches = gpc.config.data.skip_batches
total_steps = gpc.config.data.total_steps
valid_every = gpc.config.data.valid_every
load_optimizer = gpc.config.ckpt.load_optimizer
label_smoothing = gpc.config.loss.label_smoothing
lr = gpc.config.adam.lr
load_model_only_folder = gpc.config.ckpt.get("load_model_only_folder", None)
load_resume_ckpt_folder = gpc.config.ckpt.get("load_ckpt_folder", None)
get_tflops_func = partial(
get_megatron_flops,
checkpoint=gpc.config.model.checkpoint,
@ -490,46 +96,22 @@ def main(args):
# initialize customed llm logger
uniscale_logger = initialize_llm_logger(start_time=current_time)
# initialize customed llm writer
with open(args.config, "r") as f:
config_lines = f.readlines()
writer = Writer(
job_name=gpc.config.JOB_NAME,
launch_time=current_time,
file_name=get_parallel_log_file_name(),
tensorboard_folder=gpc.config.tensorboard_folder,
resume_tb_folder=gpc.config.resume_tb_folder,
config=config_lines,
logger=logger,
enable_tb=gpc.config.enable_tb,
)
model_load_path = None
if load_resume_ckpt_folder is not None:
logger.info(
f"===========Resume training from `{load_resume_ckpt_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = load_resume_ckpt_folder
elif load_model_only_folder is not None:
logger.info(
f"===========SFT training from `{load_model_only_folder}` {current_time} on host:"
f"{socket.gethostname()}==========="
)
model_load_path = load_model_only_folder
else:
logger.info(
f"===========New Run {current_time} on host:{socket.gethostname()},rank={gpc.get_global_rank()},"
f"tp={gpc.get_local_rank(ParallelMode.TENSOR)},pp={gpc.get_local_rank(ParallelMode.PIPELINE)},"
f"dp={gpc.get_local_rank(ParallelMode.DATA)}==========="
)
# initialize and resume train state
train_state = TrainState(gpc.config)
# initialize model
model = initialize_model()
with open(args.config, "r") as f:
config_lines = f.readlines()
ckpt_manager = CheckpointManager(
ckpt_config=gpc.config.ckpt,
model=model,
model_config=gpc.config.model,
model_config_file="".join(config_lines),
feishu_address=gpc.config.alert_address,
)
# initialize loss function
criterion = FlashGPTLMLoss(parallel_output=True, label_smoothing=label_smoothing)
@@ -539,29 +121,24 @@ def main(args):
train_state.init_batch_sampler(train_dl)
# Loading model weights must be done before zero is initialized.
if model_load_path is not None:
load_model_checkpoint(folder=model_load_path, model=model)
ckpt_manager.try_load_model(current_time)
optimizer, beta2_scheduler, lr_scheduler = initialize_optimizer(model=model)
# Loading other persistent training states.
if load_resume_ckpt_folder is not None:
# load lr scheduler states.
load_scheduler(load_resume_ckpt_folder, lr_scheduler, optimizer, lr, train_state)
# load training states.
load_context(load_resume_ckpt_folder, train_dl, train_state)
# load dataloader sampler states.
load_sampler(load_resume_ckpt_folder, train_dl.batch_sampler)
# load optimizer states.
if load_optimizer:
load_optimizer_checkpoint(load_resume_ckpt_folder, optimizer)
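# resume the lr scheduler, dataloader sampler, train state and (when configured) optimizer states through the checkpoint manager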
ckpt_manager.try_resume_training(lr_scheduler, optimizer, lr, train_state, train_dl)
ckpt_save_manager = CheckpointSaveManager(
ckpt_config=gpc.config.ckpt,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
model_config=gpc.config.model,
# initialize custom llm writer
writer = Writer(
job_name=gpc.config.JOB_NAME,
launch_time=current_time,
file_name=get_parallel_log_file_name(),
tensorboard_folder=gpc.config.tensorboard_folder,
resume_tb_folder=train_state.resume_tb_folder, # resume from ckpt.
step_count=train_state.step_count, # resume from ckpt.
config=config_lines,
logger=logger,
enable_tb=gpc.config.enable_tb,
)
# initialize metric for calculating accuracy and perplexity
@@ -598,12 +175,11 @@ def main(args):
# initialize simple memory profiler
if args.profiling:
memory_profiler = SimpleMemoryProfiler(
model.model,
model,
optimizer.optim,
log_folder=f"memory_trace/rank{gpc.get_global_rank()}_"
+ f"dp{gpc.get_local_rank(ParallelMode.DATA)}_"
+ f"tp{gpc.get_local_rank(ParallelMode.TENSOR)}",
activation_config=build_activation_config(gpc.config.model.num_layers),
)
else:
memory_profiler = None
@@ -621,6 +197,8 @@ def main(args):
for batch_count in range(train_state.batch_count, total_steps):
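# periodically release cached GPU memory and run lightweight GPU / network health benchmarks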
if batch_count % 50 == 0:
torch.cuda.empty_cache()
bench_gpu()
bench_net()
start_time = time.time()
timer("one-batch").start()
@@ -645,6 +223,7 @@ def main(args):
# do forward and backward
timer("fwd-bwd").start()
_, _, loss = trainer.execute_schedule(
batch, forward_only=False, return_loss=True, return_output_label=False
)
@@ -659,7 +238,7 @@ def main(args):
train_state.step_count += 1
else:
train_state.inf_nan_skip_batches += 1 # record the number of batches whose parameter update was skipped.
if -99.0 in grad_norm_groups and gpc.is_rank_for_log(): # -99.0 encodes a specific failure case
if -1 in grad_norm_groups.values() and gpc.is_rank_for_log(): # -1 encodes a specific failure case
logger.warning(f"Warning: skip parameter update at step {batch_count}.")
send_alert_message(
address=gpc.config.alert_address,
@@ -680,7 +259,7 @@ def main(args):
trainer=trainer,
start_time=start_time,
loss=loss,
grad_norm=np.array(grad_norm_groups),
grad_norm=grad_norm_groups,
metric=metric,
update_panel=uniscale_logger is not None,
)
@@ -700,14 +279,17 @@ def main(args):
# checkpoint the training states at specific steps, as determined by the "checkpoint_every" argument;
# the saved state includes the batch sampler, which tracks the true number of consumed samples
ckpt_save_manager.try_save_checkpoint(train_state)
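# try_save_checkpoint also reports whether training should stop after this save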
now_break = ckpt_manager.try_save_checkpoint(train_state)
if now_break:
break
if memory_profiler is not None:
memory_profiler.step()
if batch_count % 2 == 0:
prof.step()
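# before exiting, block until any asynchronous checkpoint uploads have completed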
ckpt_save_manager.wait_async_upload_finish()
ckpt_manager.wait_async_upload_finish()
if __name__ == "__main__":